In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import time
import os


def setup_driver():
    """Build the Chrome WebDriver used by the scraper.

    The chromedriver binary path comes from ``ChromeDriverManager().install()``
    and the browser is started with the GPU, sandbox and /dev/shm flags below.

    Returns:
        The ``webdriver.Chrome`` instance created with those options.
    """
    chrome_options = Options()
    # Enable the next line if you do not need to see the browser:
    # chrome_options.add_argument("--headless")
    for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)


def aceptar_cookies(driver):
    """Click the "Aceptar y cerrar" cookie button once it is clickable.

    Waits up to 10 seconds for the button. Nothing is caught here, so any
    error raised by the wait or by the click propagates to the caller.

    Args:
        driver: WebDriver whose current page is expected to show the banner.
    """
    localizador = (By.XPATH, "//button[normalize-space()='Aceptar y cerrar']")
    condicion = EC.element_to_be_clickable(localizador)
    WebDriverWait(driver, 10).until(condicion).click()
    print("🍪 Cookies aceptadas.")
    # Short pause after the click before the caller carries on.
    time.sleep(1)


def cerrar_popup(driver):
    """Close the modal dialog if one shows up; otherwise do nothing.

    Best-effort helper: waits up to 2 seconds for ``button.modal__close`` to
    become clickable and clicks it. Any ordinary error (no modal, click
    failure, ...) is ignored so the caller can keep scraping.

    Only ``Exception`` subclasses are suppressed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so interrupting the kernel actually stops a
    long scraping loop instead of being silently swallowed here.

    Args:
        driver: WebDriver positioned on the page that may show the modal.
    """
    try:
        popup_close = WebDriverWait(driver, 2).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.modal__close"))
        )
        popup_close.click()
    except Exception:
        # No modal appeared (or it could not be clicked): nothing to close.
        return

    print("❌ Modal cerrado.")
    # Short pause after closing the modal before the caller carries on.
    time.sleep(1)


def extraer_detalle(driver, url):
    """Open a listing page and collect its detail fields.

    Every field is extracted on a best-effort basis: a field that cannot be
    read keeps its default empty string. Errors while loading the page are
    printed and the (partially filled) record is still returned.

    Only ``Exception`` subclasses are suppressed, so ``KeyboardInterrupt``
    stops the scrape instead of being swallowed by a field-level handler.

    Args:
        driver: WebDriver used to load the page.
        url: Address of the listing detail page.

    Returns:
        dict with the keys ``url``, ``descripcion``, ``certificado_energetico``,
        ``tipo_vivienda``, ``superficie``, ``gastos_comunidad`` and
        ``imagen_destacada``. ``superficie`` holds every matching surface
        feature joined with ``" / "`` (no trailing separator).
    """
    detalle = {
        "url": url,
        "descripcion": "",
        "certificado_energetico": "",
        "tipo_vivienda": "",
        "superficie": "",
        "gastos_comunidad": "",
        "imagen_destacada": ""
    }

    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        cerrar_popup(driver)

        # Description
        try:
            descripcion = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.ad-detail-description"))).text
            detalle["descripcion"] = descripcion.strip()
        except Exception:
            pass

        # Features
        try:
            caracteristicas = driver.find_elements(By.CSS_SELECTOR, "li.feature-item")
        except Exception:
            caracteristicas = []

        superficies = []
        for item in caracteristicas:
            try:
                # Read the text once: every ``.text`` access is a browser round trip.
                contenido = item.text.strip()
            except Exception:
                # One unreadable item must not discard the remaining ones.
                continue

            texto = contenido.lower()
            if "certificado energético" in texto:
                detalle["certificado_energetico"] = contenido
            elif "tipo de vivienda" in texto:
                detalle["tipo_vivienda"] = contenido
            elif "construida" in texto or "útil" in texto:
                superficies.append(contenido)
            elif "gastos comunidad" in texto:
                detalle["gastos_comunidad"] = contenido

        # Joining afterwards avoids the dangling " / " that appending a
        # separator after each entry would leave at the end of the field.
        detalle["superficie"] = " / ".join(superficies)

        # Featured image
        try:
            img_tag = driver.find_element(By.CSS_SELECTOR, "div.ad-detail__gallery img")
            # get_attribute returns None when the attribute is missing; keep the field a string.
            detalle["imagen_destacada"] = img_tag.get_attribute("src") or ""
        except Exception:
            pass

    except Exception as e:
        print(f"❌ Error accediendo a {url}: {e}")

    return detalle


def cargar_urls_preview(json_path):
    """Read a JSON file and return the "link" value of every entry, in order.

    Args:
        json_path: Path to a UTF-8 encoded JSON file whose top-level value is
            iterated entry by entry; each entry is indexed with ``"link"``.

    Returns:
        list holding one "link" value per entry.
    """
    with open(json_path, "r", encoding="utf-8") as archivo:
        registros = json.loads(archivo.read())

    enlaces = []
    for registro in registros:
        enlaces.append(registro["link"])
    return enlaces


def save_to_json(data, filename="data/venta_detalles.json"):
    """Write ``data`` to ``filename`` as indented, UTF-8 encoded JSON.

    The parent directory is created when ``filename`` has one. A bare file
    name such as ``"out.json"`` has an empty directory part, and calling
    ``os.makedirs("")`` raises ``FileNotFoundError``, so that case is skipped
    and the file is written to the current working directory.

    Args:
        data: JSON-serialisable object; ``len(data)`` is reported in the log line.
        filename: Destination path of the JSON file.
    """
    directorio = os.path.dirname(filename)
    if directorio:
        os.makedirs(directorio, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"\n💾 Detalles guardados en: {filename} ({len(data)} registros)")


if __name__ == "__main__":
    # Scrape the detail page of every previewed listing and store the results.
    driver = setup_driver()
    resultados = []

    # try/finally guarantees the browser is closed even when a step fails or
    # the run is interrupted; otherwise the Chrome process would be left behind.
    try:
        # Accept cookies only once, at the start.
        # NOTE(review): this landing page is the "venta" listing while the
        # input/output files below are named "alquiler" — confirm which one is intended.
        driver.get("https://www.pisos.com/venta/pisos-madrid/")
        aceptar_cookies(driver)
        time.sleep(1)

        urls = cargar_urls_preview("data/alquiler_madrid_completo.json")

        for i, url in enumerate(urls):
            print(f"🔍 ({i+1}/{len(urls)}) Analizando: {url}")
            detalle = extraer_detalle(driver, url)
            resultados.append(detalle)

            # Save a backup every 50 records.
            if (i + 1) % 50 == 0:
                save_to_json(resultados, "data/alquiler_detalles_parcial.json")
    finally:
        driver.quit()

    save_to_json(resultados, "data/alquiler_detalles_completo.json")



In [12]:
import json

# Load the JSON file and report how many top-level records it contains.
with open("C:/Users/pablo/data/venta_madrid_completo_todos.json", mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

print(f"Número de registros: {len(data)}")


Número de registros: 3430


In [26]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import time
import os

def setup_driver():
    """Start Chrome through Selenium with the scraper's launch arguments.

    Returns:
        A ``webdriver.Chrome`` instance whose driver binary is the one
        returned by ``ChromeDriverManager().install()``.
    """
    opts = Options()
    # Enable this if you do not want the browser window to open:
    # opts.add_argument("--headless")
    argumentos = ["--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"]
    for argumento in argumentos:
        opts.add_argument(argumento)

    servicio = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=servicio, options=opts)

def aceptar_cookies(driver, timeout=5):
    """Accept the cookie notice if its button shows up; otherwise do nothing.

    Best-effort helper: waits up to ``timeout`` seconds for
    ``button#didomi-notice-agree-button`` to become clickable and clicks it.
    Ordinary errors (button never appears, click fails) are ignored.

    Only ``Exception`` subclasses are suppressed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so interrupting the kernel stops the scrape
    instead of being swallowed here.

    Args:
        driver: WebDriver positioned on the page that may show the notice.
        timeout: Maximum seconds to wait for the button. Defaults to 5; a
            smaller value shortens the wait on pages where no notice appears.
    """
    try:
        boton = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button#didomi-notice-agree-button"))
        )
        boton.click()
    except Exception:
        # No notice to accept (or it could not be clicked).
        return

    # Short pause after the click before the caller carries on.
    time.sleep(1)

def extraer_detalles(driver, url):
    """Load a listing page and extract its extended detail fields.

    Each field is read on a best-effort basis: anything that cannot be found
    keeps its default empty string. A failure while loading the page or
    listing the features is printed and the partially filled dict is returned.

    Only ``Exception`` subclasses are suppressed, so ``KeyboardInterrupt``
    stops a long scrape instead of being swallowed by a field-level handler.

    Args:
        driver: WebDriver used to load the page.
        url: Address of the listing detail page.

    Returns:
        dict with the keys ``descripcion_ampliada``, ``certificado_energetico``,
        ``tipo_vivienda``, ``superficie_construida``, ``superficie_util`` and
        ``imagen_destacada``; every value is a string.
    """
    detalles = {
        "descripcion_ampliada": "",
        "certificado_energetico": "",
        "tipo_vivienda": "",
        "superficie_construida": "",
        "superficie_util": "",
        "imagen_destacada": ""
    }

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
        # NOTE(review): this runs on every page; if the cookie notice only
        # appears once per session, later calls just wait out their timeout — confirm.
        aceptar_cookies(driver)

        # Description
        try:
            detalles["descripcion_ampliada"] = driver.find_element(
                By.CSS_SELECTOR, "div.description__content").text.strip()
        except Exception:
            pass

        # Features: each one is a label/value pair.
        for feature in driver.find_elements(By.CSS_SELECTOR, "div.features__feature"):
            try:
                label = feature.find_element(By.CSS_SELECTOR, "span.features__label").text.strip()
                value = feature.find_element(By.CSS_SELECTOR, "span.features__value").text.strip()
            except Exception:
                # Skip features that lack a label or a value.
                continue

            if "Tipo de casa" in label or "Tipo de vivienda" in label:
                detalles["tipo_vivienda"] = value
            elif "Superficie construida" in label:
                detalles["superficie_construida"] = value
            elif "Superficie útil" in label:
                detalles["superficie_util"] = value

        # Energy certificate: the value is the <p> sibling following the heading.
        try:
            cert_title = driver.find_element(By.XPATH, "//h2[contains(text(), 'Certificado energético')]")
            cert_value = cert_title.find_element(By.XPATH, "./following-sibling::p")
            detalles["certificado_energetico"] = cert_value.text.strip()
        except Exception:
            pass

        # Featured image
        try:
            img = driver.find_element(By.CSS_SELECTOR, "div.media-thumbnail img")
            # get_attribute returns None when the attribute is missing; keep the field a string.
            detalles["imagen_destacada"] = img.get_attribute("src") or ""
        except Exception:
            pass

    except Exception as e:
        print(f"❌ Error en {url}: {e}")
    return detalles

def scrape_completo(input_file="C:/Users/pablo/data/venta_madrid_completo_todos.json", 
                    output_file="C:/Users/pablo/data/venta_madrid_detalle_completo.json"):
    """Enrich every listing of ``input_file`` with its detail fields and save the result.

    Each record is updated in place with the dict returned by
    ``extraer_detalles`` for its ``"link"``. The output file is rewritten
    every 100 records as a checkpoint and once more at the end.

    The browser is always closed, and whatever has been collected is written
    out, even when the run fails or is interrupted; the original exception
    still propagates afterwards.

    Args:
        input_file: UTF-8 JSON file that is iterated record by record; each
            record is indexed with ``"link"``.
        output_file: Destination JSON file for the enriched records.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        inmuebles = json.load(f)

    def _volcar(registros):
        # Rewrite the output file with the records gathered so far.
        with open(output_file, "w", encoding="utf-8") as salida:
            json.dump(registros, salida, ensure_ascii=False, indent=2)

    driver = setup_driver()
    resultados = []
    completado = False

    try:
        for i, item in enumerate(inmuebles):
            print(f"🔎 ({i+1}/{len(inmuebles)}) {item['link']}")
            detalles = extraer_detalles(driver, item['link'])
            item.update(detalles)
            resultados.append(item)

            # Checkpoint every 100 records.
            if (i + 1) % 100 == 0:
                _volcar(resultados)
                print(f"💾 Guardado parcial ({i+1})")
        completado = True
    finally:
        try:
            # Never leave the Chrome process behind.
            driver.quit()
        finally:
            if completado:
                # Final save
                _volcar(resultados)
                print(f"\n✅ Guardado completo: {output_file} ({len(resultados)} registros)")
            elif resultados:
                # Interrupted run: keep the records scraped since the last
                # checkpoint. Skipped when nothing was collected so an
                # existing output file is not overwritten with an empty list.
                _volcar(resultados)
                print(f"\n⚠️ Ejecución interrumpida; guardado parcial: {output_file} ({len(resultados)} registros)")

# Entry point: run the full scrape with the default input and output paths.
if __name__ == "__main__":
    scrape_completo()


🔎 (1/3430) https://www.pisos.com/comprar/casa_adosada-estacion_zona_norte28224-5106780590_109700/
🔎 (2/3430) https://www.pisos.com/comprar/piso-san_sebastian_de_los_reyes_casco_antiguo28703-53387680450_515177/
🔎 (3/3430) https://www.pisos.com/comprar/piso-zona_sureste-5129162041_109700/
🔎 (4/3430) https://www.pisos.com/comprar/piso-hoyo_de_manzanares_centro_urbano-46711883004_994129/
🔎 (5/3430) https://www.pisos.com/comprar/piso-sol_barrio28013-53433152926_524671/
🔎 (6/3430) https://www.pisos.com/comprar/piso-arroyos_y_tempranales28702-41742370937_106000/
🔎 (7/3430) https://www.pisos.com/comprar/piso-pinar_punta_galea28290-53410897274_100500/
🔎 (8/3430) https://www.pisos.com/comprar/piso-guindalera28028-51765010516_440000/
🔎 (9/3430) https://www.pisos.com/comprar/piso-fuente_del_berro28028-631974332_440000/
🔎 (10/3430) https://www.pisos.com/comprar/chalet-aldea_del_fresno_centro_urbano-43379000866_101800/
🔎 (11/3430) https://www.pisos.com/comprar/piso-alpedrete_centro_urbano-5342144018