In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import re
import time
import requests


def extraer_sku_y_gramos(url):
    """Usa Selenium para abrir la p√°gina y obtener SKU y gramos/ml (Shopify carga con JS)."""
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Esperar a que cargue el bloque principal del producto
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "p.product__sku, p.custom_product__text"))
        )

        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # --- SKU ---
        sku = "N/A"
        p_sku = soup.find("p", class_="product__sku")
        if p_sku:
            texto = p_sku.get_text(strip=True)
            match = re.search(r"SKU[:\s]*([A-Za-z0-9\-]+)", texto)
            if match:
                sku = match.group(1)
            else:
                sku = texto

        # --- Gramos / ml ---
        gramos = "N/A"
        p_gramos = soup.find("p", class_="custom_product__text product__text")
        if p_gramos:
            texto_g = p_gramos.get_text(strip=True)
            match_g = re.search(r"(\d+(?:[.,]\d+)?\s*(?:g|ml|gr|GR|ML))", texto_g, re.IGNORECASE)
            if match_g:
                gramos = match_g.group(1).replace(",", ".").lower()

        driver.quit()
        return sku, gramos

    except Exception as e:
        print(f"‚ö†Ô∏è Error en {url}: {e}")
        try:
            driver.quit()
        except:
            pass
        return "N/A", "N/A"


def rostro_ro():
    """Extrae TODOS los productos de la categor√≠a 'Cuidado de la Piel' desde Nala Rumania (https://nala.ro/collections/ten)."""

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)
    driver.get("https://nala.ro/collections/ten")

    # Esperar carga de productos
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.grid__item"))
    )

    # --- Scroll din√°mico ---
    last_count = 0
    stable_rounds = 0
    while stable_rounds < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        products = driver.find_elements(By.CSS_SELECTOR, "li.grid__item")
        if len(products) == last_count:
            stable_rounds += 1
        else:
            stable_rounds = 0
        last_count = len(products)

    print(f"Total productos detectados en la p√°gina: {last_count}")

    html = driver.page_source
    driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    items = soup.select("li.grid__item")
    print(f"Productos encontrados en HTML: {len(items)}")

    productos = []
    categorias = defaultdict(int)

    for item in items:
        enlace = item.find("a", href=re.compile("/products/"))
        if not enlace:
            continue
        url_producto = "https://nala.ro" + enlace.get("href")

        # ====== üîç Extracci√≥n robusta del nombre ======
        nombre = None
        nombre_tag = item.find(attrs={"class": re.compile("(title|heading|card__title)", re.IGNORECASE)})
        if nombre_tag and nombre_tag.get_text(strip=True):
            nombre = nombre_tag.get_text(strip=True)

        if not nombre:
            hidden_name = item.find("span", class_=re.compile("visually-hidden", re.IGNORECASE))
            if hidden_name:
                nombre = hidden_name.get_text(strip=True)

        if not nombre:
            nombre = enlace.get_text(strip=True)

        if not nombre:
            nombre = "N/A"
        # =====================================================

        # --- Precio ---
        precio_tag = item.find("div", class_=re.compile("price", re.IGNORECASE))
        precio = "N/A"
        if precio_tag:
            precio_texto = precio_tag.get_text(strip=True)
            precio_match = re.search(r"\d+,\d{2}", precio_texto)
            if precio_match:
                precio = precio_match.group() + " ¬£"

# --- Ingrediente clave (detectado en rumano y traducido al espa√±ol) ---
        ingrediente_match = re.search(
        r"(cƒÉp»ôuni|zmeurƒÉ|struguri|avocado|aloe vera|argan|unt de shea|lavandƒÉ|mu»ôe»õel|castravete|cocos|lƒÉm√¢ie|piersicƒÉ|mango|grapefruit|vanilie|cafea|rodie|mentƒÉ|trandafiri|c√¢nepƒÉ|pepene verde|portocalƒÉ|ananas|coacƒÉze|cire»ôe|gƒÉlbenele|jojoba)",
        nombre,
        re.IGNORECASE
        )

        ingrediente = "N/A"
        if ingrediente_match:
            ingrediente_rumano = ingrediente_match.group(1).lower()
            ingrediente = {
                "cƒÉp»ôuni": "Fresa", "zmeurƒÉ": "Frambuesa", "struguri": "Uvas", "avocado": "Aguacate",
            "aloe vera": "Aloe Vera", "argan": "Arg√°n", "unt de shea": "Karit√©", "lavandƒÉ": "Lavanda",
            "mu»ôe»õel": "Manzanilla", "castravete": "Pepino", "cocos": "Coco", "lƒÉm√¢ie": "Lim√≥n",
            "piersicƒÉ": "Melocot√≥n", "mango": "Mango", "grapefruit": "Pomelo", "vanilie": "Vainilla",
            "cafea": "Caf√©", "rodie": "Granada", "mentƒÉ": "Menta", "trandafiri": "Rosas", "c√¢nepƒÉ": "C√°√±amo",
            "pepene verde": "Sand√≠a", "portocalƒÉ": "Naranja", "ananas": "Pi√±a", "coacƒÉze": "Grosella negra",
            "cire»ôe": "Cereza", "gƒÉlbenele": "Cal√©ndula", "jojoba": "Jojoba"
            }.get(ingrediente_rumano, "N/A")


        # --- Clasificaci√≥n por categor√≠a ---
        nombre_lower = nombre.lower()
        if "exfoliant" in nombre_lower and "buze" in nombre_lower:
            categoria = "Exfoliante Labial"
        elif "exfoliant" in nombre_lower:
            categoria = "Exfoliante Facial"
        elif "ochi" in nombre_lower:
            categoria = "Contorno de Ojos"
        elif "crema" in nombre_lower:
            categoria = "Crema Facial"
        elif "serum" in nombre_lower or "s√©rum" in nombre_lower:
            categoria = "Serum Facial"
        elif "set" in nombre_lower:
            categoria = "Set Facial"
        elif "spf" in nombre_lower or "protec»õie solarƒÉ" in nombre_lower:
            categoria = "Protector Solar Facial"
        elif "balsam" in nombre_lower:
            categoria = "B√°lsamo"
        elif "spumƒÉ" in nombre_lower:
            categoria = "Espuma Limpiadora"
        elif "gel" in nombre_lower:
            categoria = "Gel Limpiador"
        elif "tonic" in nombre_lower:
            categoria = "T√≥nico Facial"
        elif "mascƒÉ" in nombre_lower:
            categoria = "Mascarilla Facial"
        elif "micelar" in nombre_lower:
            categoria = "Agua Micelar Limpiador"
        elif "lapte" in nombre_lower:
            categoria = "Leche Limpiador Facial"
        else:
            categoria = "Otro"

        categorias[categoria] += 1

        productos.append({
            "SKU": "N/A",   # se completar√° despu√©s
            "gramos/ml": "N/A",
            "nombre": nombre,
            "categoria_general": "Rostro",
            "categoria": categoria,
            "precio": precio,
            "ingrediente_clave":ingrediente,
            "pa√≠s": "Rumania",
            "url": url_producto,
        })

    # === Extraer SKUs y gramos/ml en paralelo ===
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(extraer_sku_y_gramos, p["url"]): i for i, p in enumerate(productos)}
        for future in as_completed(futures):
            idx = futures[future]
            try:
                sku, gramos = future.result()
                productos[idx]["SKU"] = sku
                productos[idx]["gramos/ml"] = gramos
            except Exception:
                productos[idx]["SKU"] = "N/A"
                productos[idx]["gramos/ml"] = "N/A"

    df = pd.DataFrame(productos)
    print(f"Total productos finales: {len(df)}")
    # === Convertir precio y gramos/ml a columnas num√©ricas ===
    df["precio"] = (
        df["precio"]
        .str.replace("¬£", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
        .astype(float)
        )

    df["gramos/ml"] = (
        df["gramos/ml"]
        .str.extract(r"(\d+(?:\.\d+)?)")[0]   # Extrae el n√∫mero
        .astype(float)                        # Convierte a decimal
        .round()                              # Redondea si hay decimales
        .astype("Int64")                      # Convierte a entero permitiendo NaN
    )

    return df




In [11]:
df_rostro_ro = rostro_ro()
df_rostro_ro

Total productos detectados en la p√°gina: 116
Productos encontrados en HTML: 116
Total productos finales: 116


Unnamed: 0,SKU,gramos/ml,nombre,categoria_general,categoria,precio,ingrediente_clave,pa√≠s,url
0,70482,11,Unt Buze - NucƒÉ de Cocos & VanilieAdaugƒÉ la .,Rostro,Otro,19.9,Coco,Rumania,https://nala.ro/products/balsam-buze-nuca-coco...
1,64836,11,Unt Buze - Cire»ôeAdaugƒÉ la .,Rostro,Otro,19.9,Cereza,Rumania,https://nala.ro/products/balsam-buze-cirese
2,30788,120,Lo»õiune dupƒÉ bƒÉrbierit - Ulei Avocado & IenupƒÉ...,Rostro,Otro,34.9,Aguacate,Rumania,https://nala.ro/products/lotiune-dupa-barbieri...
3,70472,50,Exfoliant Facial - Exfoliere ProfundƒÉ - Cafea ...,Rostro,Exfoliante Facial,29.9,Caf√©,Rumania,https://nala.ro/products/exfoliant-facial-exfo...
4,64827,15,CremƒÉ Ochi Iluminatoare - CearcƒÉne & Pungi - C...,Rostro,Contorno de Ojos,39.9,,Rumania,https://nala.ro/products/crema-ochi-iluminatoa...
...,...,...,...,...,...,...,...,...,...
111,3152,,Rutina TOTAL LIFTAdaugƒÉ la .,Rostro,Otro,239.9,,Rumania,https://nala.ro/products/rutina-total-lift
112,3153,,Rutina HYDRA LIFT PROAdaugƒÉ la .,Rostro,Otro,229.9,,Rumania,https://nala.ro/products/rutina-hydra-lift-pro
113,3154,,Rutina HYDRA INTENSE PROAdaugƒÉ la .,Rostro,Otro,209.9,,Rumania,https://nala.ro/products/rutina-hydra-intense-pro
114,3155,,Rutina LIFT OILAdaugƒÉ la .,Rostro,Otro,139.9,,Rumania,https://nala.ro/products/rutina-lift-oil


In [12]:
def cabello_ro():
    """Extrae TODOS los productos de la categor√≠a 'Cabello' desde Nala Rumania (https://nala.ro/collections/par)."""

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)
    driver.get("https://nala.ro/collections/par")

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.grid__item"))
    )

    # Scroll din√°mico
    last_count = 0
    stable_rounds = 0
    while stable_rounds < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        products = driver.find_elements(By.CSS_SELECTOR, "li.grid__item")
        if len(products) == last_count:
            stable_rounds += 1
        else:
            stable_rounds = 0
        last_count = len(products)

    print(f"Total productos detectados en la p√°gina: {last_count}")

    html = driver.page_source
    driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    items = soup.select("li.grid__item")
    print(f"Productos encontrados en HTML: {len(items)}")

    productos = []
    categorias = defaultdict(int)

    for item in items:
        enlace = item.find("a", href=re.compile("/products/"))
        if not enlace:
            continue
        url_producto = "https://nala.ro" + enlace.get("href")

        # ====== üîç Extracci√≥n robusta del nombre ======
        nombre = None
        nombre_tag = item.find(attrs={"class": re.compile("(title|heading|card__title)", re.IGNORECASE)})
        if nombre_tag and nombre_tag.get_text(strip=True):
            nombre = nombre_tag.get_text(strip=True)

        if not nombre:
            hidden_name = item.find("span", class_=re.compile("visually-hidden", re.IGNORECASE))
            if hidden_name:
                nombre = hidden_name.get_text(strip=True)

        if not nombre:
            nombre = enlace.get_text(strip=True)

        if not nombre:
            nombre = "N/A"
        # =====================================================

        # --- Precio ---
        precio_tag = item.find("div", class_=re.compile("price", re.IGNORECASE))
        precio = "N/A"
        if precio_tag:
            precio_texto = precio_tag.get_text(strip=True)
            precio_match = re.search(r"\d+,\d{2}", precio_texto)
            if precio_match:
                precio = precio_match.group() + " ¬£"

        # --- Ingrediente clave (detectado en rumano y traducido al espa√±ol) ---
        ingrediente_match = re.search(
            r"(granada|lemongrass|miel|cafeina|camomila|violeta absolut|vanilie|aloe|argan|karit√©|jojoba|calendula|mu»ôe»õel|avocado|c√¢nepƒÉ|trandafiri|lavandƒÉ|mentƒÉ|cocos|lƒÉm√¢ie|piersicƒÉ|ananas|mango|castravete|grapefruit|struguri|portocalƒÉ|spirulinƒÉ|salvie|migdale|rozmarin|lime|algƒÉ|flori de portocal|violeta|lƒÉm√¢i»õƒÉ|ceai verde|ghimbir|vitamina e|keratinƒÉ|proteine vegetale|ulei de mƒÉsline|ceapƒÉ|boroj√≥)",
            nombre,
            re.IGNORECASE
        )


        ingrediente = "N/A"
        if ingrediente_match:
            ingrediente_rumano = ingrediente_match.group(1).lower()
            ingrediente = {
                "granada": "Granada", "lemongrass": "Lemongrass", "miel": "Miel", "cafeina": "Cafe√≠na",
                "camomila": "Camomila", "violeta absolut": "Violeta Absoluto", "vanilie": "Vainilla",
                "aloe": "Aloe", "argan": "Arg√°n", "karit√©": "Karit√©", "jojoba": "Jojoba",
                "calendula": "Cal√©ndula", "mu»ôe»õel": "Manzanilla", "avocado": "Aguacate", "c√¢nepƒÉ": "C√°√±amo",
                "trandafiri": "Rosas", "lavandƒÉ": "Lavanda", "mentƒÉ": "Menta", "cocos": "Coco",
                "lƒÉm√¢ie": "Lim√≥n", "piersicƒÉ": "Melocot√≥n", "ananas": "Pi√±a", "mango": "Mango",
                "castravete": "Pepino", "grapefruit": "Pomelo", "struguri": "Uvas", "portocalƒÉ": "Naranja",
                "spirulinƒÉ": "Espirulina", "salvie": "Salvia", "migdale": "Almendra", "rozmarin": "Romero",
                "lime": "Lima", "algƒÉ": "Algas", "flori de portocal": "Flor de Naranja", "violeta": "Violeta",
                "lƒÉm√¢i»õƒÉ": "Limoncillo", "ceai verde": "T√© Verde", "ghimbir": "Jengibre", "vitamina e": "Vitamina E",
                "keratinƒÉ": "Queratina", "proteine vegetale": "Prote√≠nas Vegetales", "ulei de mƒÉsline": "Aceite de Oliva",
                "ceapƒÉ": "Cebolla", "boroj√≥": "Boroj√≥"
            }.get(ingrediente_rumano, "N/A")

        # --- Clasificaci√≥n por categor√≠a ---
        nombre_lower = nombre.lower()
        if "»ôampon solid" in nombre_lower:
            categoria = "Champ√∫ S√≥lido"
        elif "»ôampon" in nombre_lower:
            categoria = "Champ√∫"
        elif "tratament" in nombre_lower:
            categoria = "Tratamiento Capilar"
        elif "mascƒÉ" in nombre_lower:
            categoria = "Mascarilla Capilar"
        elif "balsam" in nombre_lower:
            categoria = "Acondicionador"
        elif "tonic" in nombre_lower:
            categoria = "T√≥nico Capilar"
        elif "ulei" in nombre_lower:
            categoria = "Aceite Capilar"
        elif "set" in nombre_lower:
            categoria = "Set Capilar"
        elif "spray" in nombre_lower:
            categoria = "Spray Capilar"
        else:
            categoria = "Otro"

        categorias[categoria] += 1

        productos.append({
            "SKU": "N/A",
            "gramos/ml": "N/A",
            "nombre": nombre,
            "categoria_general": "Cabello",
            "categoria": categoria,
            "precio": precio,
            "ingrediente_clave": ingrediente,
            "pa√≠s": "Rumania",
            "url": url_producto,
        })

    # === Extraer SKUs y gramos/ml en paralelo ===
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(extraer_sku_y_gramos, p["url"]): i for i, p in enumerate(productos)}
        for future in as_completed(futures):
            idx = futures[future]
            try:
                sku, gramos = future.result()
                productos[idx]["SKU"] = sku
                productos[idx]["gramos/ml"] = gramos
            except Exception:
                productos[idx]["SKU"] = "N/A"
                productos[idx]["gramos/ml"] = "N/A"

    df = pd.DataFrame(productos)
    print(f"Total productos finales: {len(df)}")

    # === Convertir precio y gramos/ml a columnas num√©ricas ===
    df["precio"] = (
        df["precio"]
        .str.replace("¬£", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
        .astype(float)
    )

    df["gramos/ml"] = (
        df["gramos/ml"]
        .str.extract(r"(\d+(?:\.\d+)?)")[0]
        .astype(float)
        .round()
        .astype("Int64")
    )

    return df


In [13]:
df_cabello_ro = cabello_ro()
df_cabello_ro 

Total productos detectados en la p√°gina: 57
Productos encontrados en HTML: 57
Total productos finales: 45


Unnamed: 0,SKU,gramos/ml,nombre,categoria_general,categoria,precio,ingrediente_clave,pa√≠s,url
0,30692,200.0,»òampon PƒÉr Gras - Mu»ôe»õelAdaugƒÉ la .,Cabello,Champ√∫,39.9,Manzanilla,Rumania,https://nala.ro/products/sampon-par-gras-musetel
1,64422,200.0,"»òampon protector PƒÉr Vopsit - Extract Orez, An...",Cabello,Champ√∫,29.9,,Rumania,https://nala.ro/products/sampon-protector-par-...
2,70455,58.0,»òampon solid PƒÉr Deteriorat - GardenieAdaugƒÉ la .,Cabello,Champ√∫ S√≥lido,24.9,,Rumania,https://nala.ro/products/sampon-solid-par-dete...
3,70454,58.0,»òampon solid PƒÉr Gras - LƒÉm√¢i»õƒÉAdaugƒÉ la .,Cabello,Champ√∫ S√≥lido,24.9,Limoncillo,Rumania,https://nala.ro/products/sampon-solid-par-gras...
4,70456,58.0,»òampon solid Scalp Sensibil - CƒÉp»ôuniAdaugƒÉ la .,Cabello,Champ√∫ S√≥lido,24.9,,Rumania,https://nala.ro/products/sampon-solid-scalp-se...
5,64240,200.0,"»òampon Stimulare »ôi Fortifiere - CafeinƒÉ, Pipe...",Cabello,Champ√∫,39.9,,Rumania,https://nala.ro/products/sampon-stimulare-cres...
6,64307,200.0,»òampon Violet PƒÉr Blond - Extract Absolut Viol...,Cabello,Champ√∫,49.9,Lim√≥n,Rumania,https://nala.ro/products/sampon-violet-par-blo...
7,70436,200.0,»òampon Volum »ôi StrƒÉlucire - Sare de la Marea ...,Cabello,Champ√∫,39.9,Espirulina,Rumania,https://nala.ro/products/sampon-volum-si-stral...
8,64287,200.0,"Balsam PƒÉr Blond - NucƒÉ de Cocos, Rozmarin, LƒÉ...",Cabello,Acondicionador,39.9,Coco,Rumania,https://nala.ro/products/balsam-par-blond-ulei...
9,30700,200.0,Balsam PƒÉr Gras - Mu»ôe»õelAdaugƒÉ la .,Cabello,Acondicionador,24.9,Manzanilla,Rumania,https://nala.ro/products/balsam-par-gras-musetel


In [14]:
def corporal_ro():
    """Extrae TODOS los productos de la categor√≠a 'Corporal' en Nala Rumania."""

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)
    driver.get("https://nala.ro/collections/ingrijire-corp")

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.grid__item"))
    )

    # Scroll din√°mico
    last_count = 0
    stable_rounds = 0
    while stable_rounds < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        products = driver.find_elements(By.CSS_SELECTOR, "li.grid__item")
        if len(products) == last_count:
            stable_rounds += 1
        else:
            stable_rounds = 0
        last_count = len(products)

    print(f"Total productos detectados en la p√°gina: {last_count}")

    html = driver.page_source
    driver.quit()
    soup = BeautifulSoup(html, "html.parser")
    items = soup.select("li.grid__item")
    print(f"Productos encontrados en HTML: {len(items)}")

    productos = []
    categorias = defaultdict(int)

    for item in items:
        enlace = item.find("a", href=re.compile("/products/"))
        if not enlace:
            continue
        url_producto = "https://nala.ro" + enlace.get("href")

        # Nombre
        nombre = (
            item.find(attrs={"class": re.compile("(title|heading|card__title)", re.IGNORECASE)}) or
            item.find("span", class_=re.compile("visually-hidden", re.IGNORECASE)) or
            enlace
        )
        nombre = nombre.get_text(strip=True) if nombre else "N/A"
        nombre_lower = nombre.lower()

        # Precio
        precio = "N/A"
        precio_tag = item.find("div", class_=re.compile("price", re.IGNORECASE))
        if precio_tag:
            match = re.search(r"\d+,\d{2}", precio_tag.get_text(strip=True))
            if match:
                precio = f"{match.group()} ¬£"

        # Ingrediente clave (rumano ‚Üí espa√±ol)
        ingrediente_match = re.search(
            r"(cƒÉp»ôuni|zmeurƒÉ|struguri|avocado|aloe vera|argan|unt de shea|lavandƒÉ|mu»ôe»õel|castravete|cocos|lƒÉm√¢ie|piersicƒÉ|mango|grapefruit|vanilie|cafea|rodie|mentƒÉ|trandafiri|c√¢nepƒÉ|pepene verde|portocalƒÉ|ananas|coacƒÉze|cire»ôe|gƒÉlbenele|violeta|migdale|scor»õi»ôoarƒÉ|miere|ciocolatƒÉ|caramel|liliac|flori de portocal|ceai verde|ghimbir|lemongrass|rozmarin|ulei de mƒÉsline)",
            nombre,
            re.IGNORECASE
        )

        ingrediente = "N/A"
        if ingrediente_match:
            ingrediente_rumano = ingrediente_match.group(1).lower()
            ingrediente = {
                "cƒÉp»ôuni": "Fresa", "zmeurƒÉ": "Frambuesa", "struguri": "Uvas", "avocado": "Aguacate",
                "aloe vera": "Aloe Vera", "argan": "Arg√°n", "unt de shea": "Karit√©", "lavandƒÉ": "Lavanda",
                "mu»ôe»õel": "Manzanilla", "castravete": "Pepino", "cocos": "Coco", "lƒÉm√¢ie": "Lim√≥n",
                "piersicƒÉ": "Melocot√≥n", "mango": "Mango", "grapefruit": "Pomelo", "vanilie": "Vainilla",
                "cafea": "Caf√©", "rodie": "Granada", "mentƒÉ": "Menta", "trandafiri": "Rosas", "c√¢nepƒÉ": "C√°√±amo",
                "pepene verde": "Sand√≠a", "portocalƒÉ": "Naranja", "ananas": "Pi√±a", "coacƒÉze": "Grosella negra",
                "cire»ôe": "Cereza", "gƒÉlbenele": "Cal√©ndula", "violeta": "Violeta", "migdale": "Almendra",
                "scor»õi»ôoarƒÉ": "Canela", "miere": "Miel", "ciocolatƒÉ": "Chocolate", "caramel": "Caramelo",
                "liliac": "Lila", "flori de portocal": "Flor de Naranjo", "ceai verde": "T√© Verde",
                "ghimbir": "Jengibre", "lemongrass": "Lemongrass", "rozmarin": "Romero", "ulei de mƒÉsline": "Aceite de Oliva"
            }.get(ingrediente_rumano, "N/A")

        # Clasificaci√≥n corporal
        if "m√¢ini" in nombre_lower:
            categoria = "Crema de Manos"
        elif "spray" in nombre_lower:
            categoria = "Spray Corporal"
        elif "gel" in nombre_lower:
            categoria = "Gel Corporal"
        elif "bomba" in nombre_lower:
            categoria = "Bomba de Ba√±o"
        elif "crema" in nombre_lower:
            categoria = "Crema Corporal"
        elif "ulei" in nombre_lower:
            categoria = "Aceite Corporal"
        elif "exfoliant" in nombre_lower:
            categoria = "Exfoliante Corporal"
        elif "apƒÉ" in nombre_lower and "parfumatƒÉ" in nombre_lower:
            categoria = "Bruma"
        else:
            categoria = "Otro"

        categorias[categoria] += 1

        productos.append({
            "SKU": "N/A",
            "gramos/ml": "N/A",
            "nombre": nombre,
            "categoria_general": "Corporal",
            "categoria": categoria,
            "precio": precio,
            "ingrediente_clave": ingrediente,
            "pa√≠s": "Rumania",
            "url": url_producto,
        })

    # Extraer SKU y gramos/ml
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(extraer_sku_y_gramos, p["url"]): i for i, p in enumerate(productos)}
        for future in as_completed(futures):
            idx = futures[future]
            try:
                sku, gramos = future.result()
                productos[idx]["SKU"] = sku
                productos[idx]["gramos/ml"] = gramos
            except:
                productos[idx]["SKU"] = "N/A"
                productos[idx]["gramos/ml"] = "N/A"

    df = pd.DataFrame(productos)
    print(f"Total productos finales: {len(df)}")

    # Convertir precio y gramos/ml a columnas num√©ricas
    df["precio"] = (
        df["precio"]
        .str.replace("¬£", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
        .astype(float)
    )

    df["gramos/ml"] = (
        df["gramos/ml"]
        .str.extract(r"(\d+(?:\.\d+)?)")[0]
        .astype(float)
        .round()
        .astype("Int64")
    )

    return df



In [15]:
df_corporal_ro = corporal_ro()
df_corporal_ro

Total productos detectados en la p√°gina: 154
Productos encontrados en HTML: 154
‚ö†Ô∏è Error en https://nala.ro/products/unt-corp-piper-rosu-ambra: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff633eee8e5+80021]
	GetHandleVerifier [0x0x7ff633eee940+80112]
	(No symbol) [0x0x7ff633c7060f]
	(No symbol) [0x0x7ff633cc8854]
	(No symbol) [0x0x7ff633cc8b1c]
	(No symbol) [0x0x7ff633d1c927]
	(No symbol) [0x0x7ff633cf126f]
	(No symbol) [0x0x7ff633d1968a]
	(No symbol) [0x0x7ff633cf1003]
	(No symbol) [0x0x7ff633cb95d1]
	(No symbol) [0x0x7ff633cba3f3]
	GetHandleVerifier [0x0x7ff6341adc7d+2960429]
	GetHandleVerifier [0x0x7ff6341a7f3a+2936554]
	GetHandleVerifier [0x0x7ff6341c8977+3070247]
	GetHandleVerifier [0x0x7ff633f083ce+185214]
	GetHandleVerifier [0x0x7ff633f0fe1f+216527]
	GetHandleVerifier [0x0x7ff633ef7b24+117460]
	GetHandleVerifier [0x0x7ff633ef7cdf+117903]
	GetHandleVerifier [0x0x7ff633eddbb8+11112]
	BaseThreadInitThunk [0x0x7ffc14ebe8d7+23]
	RtlUserThreadStart [0x0x7ffc1662c53c+44]

Total

Unnamed: 0,SKU,gramos/ml,nombre,categoria_general,categoria,precio,ingrediente_clave,pa√≠s,url
0,35247,150,ApƒÉ ParfumatƒÉ Corp & PƒÉr - Ananas & Lemn Santa...,Corporal,Bruma,29.9,Pi√±a,Rumania,https://nala.ro/products/apa-parfumata-corp-pa...
1,64276,150,ApƒÉ ParfumatƒÉ Corp & PƒÉr - CƒÉp»ôuniAdaugƒÉ la .,Corporal,Bruma,29.9,Fresa,Rumania,https://nala.ro/products/apa-parfumata-corp-ca...
2,35216,150,ApƒÉ ParfumatƒÉ Corp & PƒÉr - Granita de LƒÉm√¢ieAd...,Corporal,Bruma,24.9,Lim√≥n,Rumania,https://nala.ro/products/apa-parfumata-corp-pa...
3,64277,150,ApƒÉ ParfumatƒÉ Corp & PƒÉr - PapayaAdaugƒÉ la .,Corporal,Bruma,29.9,,Rumania,https://nala.ro/products/apa-parfumata-corp-pa...
4,64278,150,ApƒÉ parfumatƒÉ Corp & PƒÉr - PomeloAdaugƒÉ la .,Corporal,Bruma,29.9,,Rumania,https://nala.ro/products/apa-parfumata-corp-pa...
...,...,...,...,...,...,...,...,...,...
149,70479,150,Ulei Uscat Corp - Lemn de SantalAdaugƒÉ la .,Corporal,Aceite Corporal,39.9,,Rumania,https://nala.ro/products/ulei-uscat-corp-lemn-...
150,74264,150,Ulei Uscat Corp - Mu»ôcatƒÉ & PortocalƒÉ AmarƒÉAda...,Corporal,Aceite Corporal,39.9,Naranja,Rumania,https://nala.ro/products/ulei-uscat-corp-musca...
151,70481,150,Ulei Uscat Corp - PapayaAdaugƒÉ la .,Corporal,Aceite Corporal,39.9,,Rumania,https://nala.ro/products/ulei-uscat-corp-papaya
152,64894,150,»òerbet Corp - GrapefruitAdaugƒÉ la .,Corporal,Otro,39.9,Pomelo,Rumania,https://nala.ro/products/serbet-corp-grapefruit-1


In [16]:
def ducha_ro():
    """Extrae TODOS los productos de la categor√≠a 'Ducha y Ba√±o' en Nala Rumania."""

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)
    driver.get("https://nala.ro/collections/dus-si-baie")

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.grid__item"))
    )

    # Scroll din√°mico
    last_count = 0
    stable_rounds = 0
    while stable_rounds < 3:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        products = driver.find_elements(By.CSS_SELECTOR, "li.grid__item")
        if len(products) == last_count:
            stable_rounds += 1
        else:
            stable_rounds = 0
        last_count = len(products)

    print(f"Total productos detectados en la p√°gina: {last_count}")

    html = driver.page_source
    driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    items = soup.select("li.grid__item")
    print(f"Productos encontrados en HTML: {len(items)}")

    productos = []
    categorias = defaultdict(int)

    for item in items:
        enlace = item.find("a", href=re.compile("/products/"))
        if not enlace:
            continue
        url_producto = "https://nala.ro" + enlace.get("href")

        # Nombre
        nombre = (
            item.find(attrs={"class": re.compile("(title|heading|card__title)", re.IGNORECASE)}) or
            item.find("span", class_=re.compile("visually-hidden", re.IGNORECASE)) or
            enlace
        )
        nombre = nombre.get_text(strip=True) if nombre else "N/A"
        nombre_lower = nombre.lower()

        # Precio
        precio = "N/A"
        precio_tag = item.find("div", class_=re.compile("price", re.IGNORECASE))
        if precio_tag:
            match = re.search(r"\d+,\d{2}", precio_tag.get_text(strip=True))
            if match:
                precio = f"{match.group()} ¬£"

        # Ingrediente clave (rumano ‚Üí espa√±ol)
        ingrediente_match = re.search(
            r"(cƒÉp»ôuni|zmeurƒÉ|struguri|avocado|aloe vera|argan|unt de shea|lavandƒÉ|mu»ôe»õel|castravete|cocos|lƒÉm√¢ie|piersicƒÉ|mango|grapefruit|vanilie|cafea|rodie|mentƒÉ|trandafiri|c√¢nepƒÉ|pepene verde|portocalƒÉ|ananas|coacƒÉze|cire»ôe|gƒÉlbenele|violeta|migdale|scor»õi»ôoarƒÉ|miere|ciocolatƒÉ|caramel|liliac|flori de portocal|ceai verde|ghimbir|lemongrass|rozmarin|ulei de mƒÉsline|sare de mare|lapte|mƒÉr|scor»õi»ôoarƒÉ »ôi mƒÉr)",
            nombre,
            re.IGNORECASE
        )


        ingrediente = "N/A"
        if ingrediente_match:
            ingrediente_rumano = ingrediente_match.group(1).lower()
            ingrediente = {
                "cƒÉp»ôuni": "Fresa", "zmeurƒÉ": "Frambuesa", "struguri": "Uvas", "avocado": "Aguacate",
                "aloe vera": "Aloe Vera", "argan": "Arg√°n", "unt de shea": "Karit√©", "lavandƒÉ": "Lavanda",
                "mu»ôe»õel": "Manzanilla", "castravete": "Pepino", "cocos": "Coco", "lƒÉm√¢ie": "Lim√≥n",
                "piersicƒÉ": "Melocot√≥n", "mango": "Mango", "grapefruit": "Pomelo", "vanilie": "Vainilla",
                "cafea": "Caf√©", "rodie": "Granada", "mentƒÉ": "Menta", "trandafiri": "Rosas", "c√¢nepƒÉ": "C√°√±amo",
                "pepene verde": "Sand√≠a", "portocalƒÉ": "Naranja", "ananas": "Pi√±a", "coacƒÉze": "Grosella negra",
                "cire»ôe": "Cereza", "gƒÉlbenele": "Cal√©ndula", "violeta": "Violeta", "migdale": "Almendra",
                "scor»õi»ôoarƒÉ": "Canela", "miere": "Miel", "ciocolatƒÉ": "Chocolate", "caramel": "Caramelo",
                "liliac": "Lila", "flori de portocal": "Flor de Naranjo", "ceai verde": "T√© Verde",
                "ghimbir": "Jengibre", "lemongrass": "Lemongrass", "rozmarin": "Romero", "ulei de mƒÉsline": "Aceite de Oliva",
                "sare de mare": "Sal marina", "lapte": "Leche", "mƒÉr": "Manzana", "scor»õi»ôoarƒÉ »ôi mƒÉr": "Canela y Manzana"
            }.get(ingrediente_rumano, "N/A")


        # Clasificaci√≥n por categor√≠a
        if "ulei de du»ô" in nombre_lower:
            categoria = "Aceite de Ducha"
        elif "m√¢ini" in nombre_lower:
            categoria = "Jab√≥n de Manos"
        elif "du»ô" in nombre_lower or "gel" in nombre_lower:
            categoria = "Gel de Ducha"
        elif "bomba" in nombre_lower:
            categoria = "Bomba de Ba√±o"
        elif "cremƒÉ" in nombre_lower:
            categoria = "Crema de Ba√±o"
        elif "spumƒÉ" in nombre_lower:
            categoria = "Espuma de Ba√±o"
        elif "sare" in nombre_lower:
            categoria = "Sal de Ba√±o"
        elif "lapte" in nombre_lower:
            categoria = "Leche de Ba√±o"
        elif "natural" in nombre_lower:
            categoria = "Jab√≥n Natural"
        else:
            categoria = "Otro"

        categorias[categoria] += 1

        productos.append({
            "SKU": "N/A",
            "gramos/ml": "N/A",
            "nombre": nombre,
            "categoria_general": "Ducha y Ba√±o",
            "categoria": categoria,
            "precio": precio,
            "ingrediente_clave": ingrediente,
            "pa√≠s": "Rumania",
            "url": url_producto,
        })

    # Extraer SKU y gramos/ml
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(extraer_sku_y_gramos, p["url"]): i for i, p in enumerate(productos)}
        for future in as_completed(futures):
            idx = futures[future]
            try:
                sku, gramos = future.result()
                productos[idx]["SKU"] = sku
                productos[idx]["gramos/ml"] = gramos
            except:
                productos[idx]["SKU"] = "N/A"
                productos[idx]["gramos/ml"] = "N/A"

    df = pd.DataFrame(productos)
    print(f"Total productos finales: {len(df)}")

    # Convertir precio y gramos/ml a columnas num√©ricas
    df["precio"] = (
        df["precio"]
        .str.replace("¬£", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
        .astype(float)
    )

    df["gramos/ml"] = (
        df["gramos/ml"]
        .str.extract(r"(\d+(?:\.\d+)?)")[0]
        .astype(float)
        .round()
        .astype("Int64")
    )

    return df

In [17]:
df_ducha_ro = ducha_ro()
df_ducha_ro


Total productos detectados en la p√°gina: 108
Productos encontrados en HTML: 108
Total productos finales: 108


Unnamed: 0,SKU,gramos/ml,nombre,categoria_general,categoria,precio,ingrediente_clave,pa√≠s,url
0,35711,100,SƒÉpun Natural - Meri»ôoareAdaugƒÉ la .,Ducha y Ba√±o,Jab√≥n Natural,12.90,,Rumania,https://nala.ro/products/sapun-natural-merisoare
1,35704,100,SƒÉpun Natural - Boabe picante de TonkaAdaugƒÉ la .,Ducha y Ba√±o,Jab√≥n Natural,12.90,,Rumania,https://nala.ro/products/sapun-natural-boabe-p...
2,35483,125,BombƒÉ Baie EfervescentƒÉ - Afine & StruguriAdau...,Ducha y Ba√±o,Otro,12.90,Uvas,Rumania,https://nala.ro/products/bomba-baie-efervescen...
3,35605,200,Gel de du»ô - ParƒÉ & Pepene Ro»ôuAdaugƒÉ la .,Ducha y Ba√±o,Gel de Ducha,19.90,,Rumania,https://nala.ro/products/gel-de-dus-para-pepen...
4,35599,200,Gel de du»ô - Afine & StruguriAdaugƒÉ la .,Ducha y Ba√±o,Gel de Ducha,19.90,Uvas,Rumania,https://nala.ro/products/gel-de-dus-afine-stru...
...,...,...,...,...,...,...,...,...,...
103,65035,200,Gel de du»ô - Flori de PortocalAdaugƒÉ la .,Ducha y Ba√±o,Gel de Ducha,9.90,Flor de Naranjo,Rumania,https://nala.ro/products/gel-de-dus-flori-de-p...
104,64381,200,Gel de du»ô Exfoliant Hidratant - GrapefruitAda...,Ducha y Ba√±o,Gel de Ducha,29.90,Pomelo,Rumania,https://nala.ro/products/gel-dus-exfoliant-hid...
105,64282,150,Ulei de du»ô - GrapefruitAdaugƒÉ la .,Ducha y Ba√±o,Aceite de Ducha,29.90,Pomelo,Rumania,https://nala.ro/products/ulei-dus-grapefruit-1
106,30759,100,SƒÉpun Natural - Struguri Ro»ôiiAdaugƒÉ la .,Ducha y Ba√±o,Jab√≥n Natural,6.45,Uvas,Rumania,https://nala.ro/products/sapun-natural-strugur...


In [18]:
df_combinado = pd.concat([df_cabello_ro, df_corporal_ro , df_ducha_ro, df_rostro_ro], ignore_index=True)

df_combinado.to_csv("nala_ro.csv", index=False)