In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import time
from datetime import datetime
import os


def setup_driver():
    """Create and return a Chrome WebDriver instance.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``
    and Chrome is started with a fixed set of command-line flags.
    """
    chrome_options = Options()
    # Add "--headless" to this tuple to run without a visible browser window.
    for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)


def aceptar_cookies(driver):
    """Click the cookie-consent "agree" button if it becomes clickable.

    Waits up to 10 seconds for ``button#didomi-notice-agree-button``. This is a
    best-effort step: a missing banner or any other Selenium error is reported
    on stdout and never raised to the caller.

    Args:
        driver: The Selenium WebDriver whose current page may show the banner.
    """
    # Local import: selenium is already a dependency of this file, and keeping
    # the import here leaves the file-level import header untouched.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    try:
        wait = WebDriverWait(driver, 10)
        boton_aceptar = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button#didomi-notice-agree-button")))
        boton_aceptar.click()
        print("🍪 Cookies aceptadas.")
        # Short pause after the click (presumably to let the banner go away).
        time.sleep(2)
    except TimeoutException:
        # Expected when the banner never shows up within the wait window.
        print("⚠️ Cookies ya aceptadas o no detectadas.")
    except WebDriverException as e:
        # Any other Selenium failure (e.g. click intercepted, dead session) is
        # surfaced instead of being mislabelled as "already accepted".
        print(f"⚠️ No se pudieron aceptar las cookies: {e}")


def cerrar_popup(driver):
    """Close the modal popup if its close button becomes clickable.

    Waits up to 2 seconds for ``button.modal__close``. This is a best-effort
    step: the absence of a popup is silent, other Selenium errors are printed,
    and nothing is raised to the caller.

    Args:
        driver: The Selenium WebDriver whose current page may show the modal.
    """
    # Local import: selenium is already a dependency of this file, and keeping
    # the import here leaves the file-level import header untouched.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    try:
        popup_close = WebDriverWait(driver, 2).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.modal__close"))
        )
        popup_close.click()
        print("❌ Modal cerrado.")
        time.sleep(1)
    except TimeoutException:
        # No popup appeared within the wait window: the normal case.
        pass
    except WebDriverException as e:
        # Report real Selenium failures instead of hiding them.
        print(f"⚠️ No se pudo cerrar el modal: {e}")


def _extraer_anuncio(anuncio):
    """Build the result record for a single ``div.ad-preview`` element.

    Args:
        anuncio: The Selenium element of one listing card.

    Returns:
        A dict with the keys ``timestamp``, ``titulo``, ``precio``,
        ``ubicacion``, ``habitaciones``, ``baños``, ``metros`` and ``link``.
        Missing characteristics and a missing ``href`` become ``""``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the title,
            subtitle or price element is not present in the card.
    """
    # The title anchor provides both the title text and the link, so it is
    # looked up only once.
    enlace_titulo = anuncio.find_element(By.CSS_SELECTOR, "a.ad-preview__title")
    titulo = enlace_titulo.text.strip()
    ubicacion = anuncio.find_element(By.CSS_SELECTOR, "p.ad-preview__subtitle").text.strip()
    precio = anuncio.find_element(By.CSS_SELECTOR, "span.ad-preview__price").text.strip()

    # The first three characteristic paragraphs are read positionally as
    # rooms, bathrooms and square metres; absent ones are padded with "".
    elementos = anuncio.find_elements(By.CSS_SELECTOR, "p.ad-preview__char.p-sm")[:3]
    caracteristicas = [elemento.text.strip() for elemento in elementos]
    caracteristicas += [""] * (3 - len(caracteristicas))
    habitaciones, baños, metros = caracteristicas

    # get_attribute returns None when the attribute is absent.
    link = (enlace_titulo.get_attribute("href") or "").strip()

    return {
        "timestamp": datetime.now().isoformat(),
        "titulo": titulo,
        "precio": precio,
        "ubicacion": ubicacion,
        "habitaciones": habitaciones,
        "baños": baños,
        "metros": metros,
        "link": link
    }


def scrape_alquiler_madrid(base_url="https://www.pisos.com/venta/duplexs-madrid/", max_pages=6):
    """Scrape listing previews from consecutive result pages of pisos.com.

    NOTE: despite the function name ("alquiler"), the default URL points at
    the "venta/duplexs-madrid" listing.

    Page 1 is ``base_url`` itself; page N (N > 1) is ``f"{base_url}{N}/"``.
    The cookie banner is handled on the first page only. Every 100 collected
    records a partial snapshot is written to
    ``data/alquiler_madrid_parcial.json``.

    Args:
        base_url: Listing URL of the first result page, ending in ``/``.
        max_pages: Number of result pages to visit, starting at page 1.

    Returns:
        A list of dicts, one per successfully parsed listing (see
        ``_extraer_anuncio`` for the keys).

    Raises:
        selenium.common.exceptions.WebDriverException: If loading a page
            fails (for example because the browser session died). The
            browser is closed before the exception propagates.
    """
    # Local import: selenium is already a dependency of this file, and keeping
    # the import here leaves the file-level import header untouched.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    resultados = []
    driver = setup_driver()
    try:
        wait = WebDriverWait(driver, 10)

        for page_num in range(1, max_pages + 1):
            url = base_url if page_num == 1 else f"{base_url}{page_num}/"
            print(f"\n🔍 Procesando página {page_num}: {url}")
            driver.get(url)

            if page_num == 1:
                aceptar_cookies(driver)

            try:
                # The wait returns the located elements, so no second lookup
                # is needed.
                anuncios = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ad-preview")))
                print(f"📦 Anuncios detectados: {len(anuncios)}")
            except TimeoutException:
                print("⚠️ No se detectaron anuncios.")
                continue
            except WebDriverException as e:
                # Not a plain "no ads" situation: show the real cause so a
                # crashed or closed browser is not reported as an empty page.
                print(f"⚠️ No se detectaron anuncios (error del navegador): {e}")
                continue

            for anuncio in anuncios:
                try:
                    resultados.append(_extraer_anuncio(anuncio))

                    if len(resultados) % 100 == 0:
                        save_to_json(resultados, "data/alquiler_madrid_parcial.json")

                except Exception as e:
                    # One malformed card must not abort the whole page.
                    print(f"⚠️ Error extrayendo un anuncio: {e}")
                    continue

            time.sleep(3)
            cerrar_popup(driver)
    finally:
        # Always release the browser, even when a page load raises.
        driver.quit()

    return resultados



def save_to_json(data, filename="data/alquiler_madrid_completo_casas.json"):
    """Write ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

    The parent directory is created when it does not exist. An existing file
    is overwritten. A confirmation line with the record count is printed.

    Args:
        data: A JSON-serialisable object that supports ``len()``.
        filename: Destination path; may be a bare file name with no directory.
    """
    directory = os.path.dirname(filename)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"\n💾 Archivo guardado: {filename} ({len(data)} registros)")


if __name__ == "__main__":
    # Script entry point: run the scraper, report the total and persist the
    # result to the default output file. Any failure in these steps is
    # reported on stdout instead of producing a traceback.
    try:
        datos = scrape_alquiler_madrid()
        print(f"\n📊 Total de inmuebles extraídos: {len(datos)}")
        save_to_json(datos)
    except Exception as exc:
        print(f"❌ Error durante el scraping: {exc}")


🔍 Procesando página 1: https://www.pisos.com/venta/duplexs-madrid/
🍪 Cookies aceptadas.
⚠️ No se detectaron anuncios.

🔍 Procesando página 2: https://www.pisos.com/venta/duplexs-madrid/2/
❌ Error durante el scraping: Message: invalid session id
Stacktrace:
	GetHandleVerifier [0x00728073+60707]
	GetHandleVerifier [0x007280B4+60772]
	(No symbol) [0x005504FE]
	(No symbol) [0x0058B898]
	(No symbol) [0x005BCF06]
	(No symbol) [0x005B89D5]
	(No symbol) [0x005B7F66]
	(No symbol) [0x005236E5]
	(No symbol) [0x00523C3E]
	(No symbol) [0x005240CD]
	GetHandleVerifier [0x0096BB53+2435075]
	GetHandleVerifier [0x009670F3+2416035]
	GetHandleVerifier [0x0098349C+2531660]
	GetHandleVerifier [0x0073F145+155125]
	GetHandleVerifier [0x00745AED+182173]
	(No symbol) [0x005233B0]
	(No symbol) [0x00522BC3]
	GetHandleVerifier [0x00A8D23C+3620588]
	BaseThreadInitThunk [0x778CFCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77C682AE+286]
	RtlGetAppContainerNamedObjectPath [0x77C6827E+238]

