In [1]:
import re
import json
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [2]:
def process_url(url, driver):
    result = None
    try:
        driver.get(url)
        time.sleep(2)  # Esperar a que la página cargue

        # Aceptar cookies
        try:
            boton_cookies = driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
            boton_cookies.click()
            time.sleep(1)  # Esperar un momento después de aceptar cookies
            print("Cookies aceptadas")
        except Exception:
            print("No se encontró el botón de cookies o ya estaba aceptado")

        # Scroll infinito gradual
        scroll_pause_time = 0.6  # Tiempo de espera entre cada desplazamiento
        scroll_increment = 400  # Altura de desplazamiento en píxeles
        last_height = driver.execute_script("return window.scrollY")

        while True:
            driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
            time.sleep(scroll_pause_time)
            new_height = driver.execute_script("return window.scrollY")
            if new_height == last_height:
                break
            last_height = new_height

        # Obtener el HTML completo después del scroll
        html_completo = driver.page_source

        # Crear la sopa
        soup = BeautifulSoup(html_completo, 'html.parser')

        # Limpiar la URL para usarla como nombre de archivo
        url_limpia = re.sub(r'[^\w\-_\.]', '_', url)

        # Guardar el HTML en un archivo con un nombre único
        with open(f"../webs/pagina_pullandbear_{url_limpia}.html", "w", encoding="utf-8") as file:
            file.write(html_completo)

        print(f"HTML extraído correctamente y guardado como pagina_pullandbear_{url_limpia}.html")
        result = soup
    except Exception as e:
        print(f"Error procesando la URL {url}: {e}")

    return result

# Configuración del driver
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# URLs a procesar
listaurl = [
    "https://www.pullandbear.com/es/hombre/ropa/camisetas-n6323",
    "https://www.pullandbear.com/es/hombre/rebajas/ropa/camisetas-y-polos-n7087",
    "https://www.pullandbear.com/es/hombre/ropa/camisas-n6313",
    "https://www.pullandbear.com/es/hombre/rebajas/ropa/camisas-n7088",
    "https://www.pullandbear.com/es/hombre/ropa/punto-n6372",
    "https://www.pullandbear.com/es/hombre/rebajas/ropa/punto-n7090",
    "https://www.pullandbear.com/es/hombre/ropa/sudaderas-n6382",
    "https://www.pullandbear.com/es/hombre/rebajas/ropa/sudaderas-n7089",
    "https://www.pullandbear.com/es/hombre/ropa/pantalones-n6363",
    "https://www.pullandbear.com/es/hombre/rebajas/ropa/pantalones-n7091",
    "https://www.pullandbear.com/es/hombre/ropa/jeans-n6347",
    "https://www.pullandbear.com/es/hombre/rebajas/ropa/jeans-n7818"
]

# Inicializar listas para almacenar la información extraída
product_names = []
product_links = []
image1_urls = []
image2_urls = []
product_prices = []

# Procesar cada URL una por una
for url in tqdm(listaurl):
    sopa = process_url(url, driver)
    if sopa:
        products = sopa.find_all('legacy-product')
        print(f"Se han encontrado {len(products)} productos")

        for product in products:
            # Extraer el nombre del producto
            name = product.find('span', class_='product-name')
            product_names.append(name.text.strip() if name else None)

            # Extraer el enlace del producto
            link = product.find('a', class_='carousel-item-container')
            product_link = link['href'] if link and 'href' in link.attrs else None
            product_links.append(product_link)

            # Extraer URLs de imágenes
            images = product.find_all('img', class_='image-responsive')
            image_urls = [img['src'] for img in images if 'src' in img.attrs]

            # Guardar las primeras dos imágenes si existen
            image1_urls.append(image_urls[0] if len(image_urls) > 0 else None)
            image2_urls.append(image_urls[1] if len(image_urls) > 1 else None)

            # Extraer y transformar el precio del producto
            price_div = product.find('div', class_='product-price--price')
            if price_div:
                raw_price = price_div.text.strip()  # Extraer el texto del precio
                # Normalizar el texto eliminando caracteres no deseados
                raw_price = raw_price.replace("\xa0", "").replace("€", "").strip()  # Quitar el símbolo de moneda y espacios
                try:
                    # Transformar el precio al formato deseado
                    transformed_price = float(raw_price.replace(",", "."))  # Reemplazar coma por punto
                    product_prices.append(transformed_price)
                except ValueError as e:
                    print("ValueError:", e)
                    product_prices.append(None)
            else:
                print("price_div not found")
                product_prices.append(None)

# Cerrar el driver
driver.quit()

# Crear un DataFrame con la información extraída
data = {
    'Product Name': product_names,
    'url': product_links,
    'Image 1 URL': image1_urls,
    'Image 2 URL': image2_urls,
    'current_price': product_prices

}
df_nuevo = pd.DataFrame(data)

# Mostrar articulos antes y despues
print("Articulos actuales:")
print(df_nuevo.shape)

# Mostrar el DataFrame
df_nuevo.head()


  0%|          | 0/12 [00:00<?, ?it/s]

Cookies aceptadas


  8%|▊         | 1/12 [00:24<04:24, 24.02s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_ropa_camisetas-n6323.html
Se han encontrado 97 productos
No se encontró el botón de cookies o ya estaba aceptado


 17%|█▋        | 2/12 [00:39<03:12, 19.21s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_rebajas_ropa_camisetas-y-polos-n7087.html
Se han encontrado 56 productos
No se encontró el botón de cookies o ya estaba aceptado


 25%|██▌       | 3/12 [00:47<02:06, 14.02s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_ropa_camisas-n6313.html
Se han encontrado 10 productos
No se encontró el botón de cookies o ya estaba aceptado


 33%|███▎      | 4/12 [01:06<02:06, 15.76s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_rebajas_ropa_camisas-n7088.html
Se han encontrado 66 productos
No se encontró el botón de cookies o ya estaba aceptado


 42%|████▏     | 5/12 [01:15<01:34, 13.57s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_ropa_punto-n6372.html
Se han encontrado 21 productos
No se encontró el botón de cookies o ya estaba aceptado


 50%|█████     | 6/12 [01:28<01:20, 13.36s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_rebajas_ropa_punto-n7090.html
Se han encontrado 37 productos
No se encontró el botón de cookies o ya estaba aceptado


 58%|█████▊    | 7/12 [01:58<01:32, 18.60s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_ropa_sudaderas-n6382.html
Se han encontrado 133 productos
No se encontró el botón de cookies o ya estaba aceptado


 67%|██████▋   | 8/12 [02:14<01:11, 17.82s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_rebajas_ropa_sudaderas-n7089.html
Se han encontrado 54 productos
No se encontró el botón de cookies o ya estaba aceptado


 75%|███████▌  | 9/12 [02:28<00:49, 16.66s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_ropa_pantalones-n6363.html
Se han encontrado 46 productos
No se encontró el botón de cookies o ya estaba aceptado


 83%|████████▎ | 10/12 [02:44<00:32, 16.44s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_rebajas_ropa_pantalones-n7091.html
Se han encontrado 64 productos
No se encontró el botón de cookies o ya estaba aceptado


 92%|█████████▏| 11/12 [03:00<00:16, 16.41s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_ropa_jeans-n6347.html
Se han encontrado 68 productos
No se encontró el botón de cookies o ya estaba aceptado


100%|██████████| 12/12 [03:19<00:00, 16.60s/it]

HTML extraído correctamente y guardado como pagina_pullandbear_https___www.pullandbear.com_es_hombre_rebajas_ropa_jeans-n7818.html
Se han encontrado 65 productos





Articulos actuales:
(717, 5)


Unnamed: 0,Product Name,url,Image 1 URL,Image 2 URL,current_price
0,Camiseta Sakamoto Store,https://www.pullandbear.com/es/camiseta-sakamo...,https://static.pullandbear.net/assets/public/8...,https://static.pullandbear.net/assets/public/c...,17.99
1,Camiseta negra Sakamoto,https://www.pullandbear.com/es/camiseta-negra-...,https://static.pullandbear.net/assets/public/5...,https://static.pullandbear.net/assets/public/c...,17.99
2,Camiseta Sakamoto Ramen,https://www.pullandbear.com/es/camiseta-sakamo...,https://static.pullandbear.net/assets/public/2...,https://static.pullandbear.net/assets/public/b...,17.99
3,Camiseta Honda,https://www.pullandbear.com/es/camiseta-honda-...,https://static.pullandbear.net/assets/public/f...,https://static.pullandbear.net/2/photos//2025/...,15.99
4,Camiseta básica muscle fit,https://www.pullandbear.com/es/camiseta-basica...,https://static.pullandbear.net/assets/public/3...,https://static.pullandbear.net/assets/public/5...,7.99


In [3]:
# Función para extraer el valor numérico de los precios
def extract_numeric_price(price_text):
    if price_text:
        return float(re.sub(r'[^\d,]', '', price_text).replace(',', '.'))
    return None

# Configuración de Selenium
chrome_options = Options()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Función para procesar una URL individual
def process_url(row):
    url = row['url']
    result = {
        "url": url,
        "description": None,
        "sale_price": None,
        "old_price": None,
        "original_price": None,
        "current_price": None,
        "color": None,
        "image_url": None,
        "mpn": None,
        "reference_code": None,
        "category_id": None,
        "Stock Status":"In stock"
    }
    try:
        driver.get(url)
        # Esperar hasta que el color esté presente (máximo 10 segundos)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.product-card-color-selector--popup-colors-color-name'))
        )
        # Espera adicional para asegurarse de que la página está completamente cargada
        WebDriverWait(driver, 3).until(
            lambda x: x.execute_script("return document.readyState === 'complete'")
        )

        # Extraer contenido HTML
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extraer precios
        sale_price = soup.select_one('.prices .sale .number')
        old_price = soup.select_one('.prices .price-old .number')
        original_price = soup.select_one('.prices .price-original .number')
        single_price = soup.select_one('.price .number.hansolo')

        result['sale_price'] = extract_numeric_price(sale_price.text.strip()) if sale_price else None
        result['old_price'] = extract_numeric_price(old_price.text.strip()) if old_price else None
        result['original_price'] = extract_numeric_price(original_price.text.strip()) if original_price else None
        result['current_price'] = extract_numeric_price(single_price.text.strip()) if single_price else min(
            filter(None, [result['sale_price'], result['old_price'], result['original_price']]), default=None
        )

        # Extraer color
        color_element = soup.select_one('.product-card-color-selector--popup-colors-color-name')
        result['color'] = color_element.text.strip() if color_element else None

        # Extraer descripción y MPN
        json_ld = soup.find("script", type="application/ld+json")
        if json_ld:
            product_data = json.loads(json_ld.string)
            result['description'] = product_data.get("description", None)
            result['mpn'] = product_data.get("mpn", None)

        # Extraer datos del script JavaScript
        script_js = soup.find("script", text=re.compile("inditex.iParams"))
        if script_js:
            script_text = script_js.string
            mfname_match = re.search(r'mfname":\["(\d+)"\]', script_text)
            category_id_match = re.search(r'categoryId":\["(\d+)"\]', script_text)

            result['reference_code'] = mfname_match.group(1) if mfname_match else None
            result['category_id'] = category_id_match.group(1) if category_id_match else None

        # Extraer URL de la imagen principal solo si no existe ya
        if not result['image_url']:
            image_element = soup.select_one('img')
            result['image_url'] = image_element['src'] if image_element and 'src' in image_element.attrs else None

        # Verificar datos faltantes
        if not result['color']:
            result['missing_data'] = True

    except Exception as e:
        print(f"Error procesando la URL {url}: {e}")

    return result

# Inicializar el navegador y procesar URLs en paralelo
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

rows = df_nuevo.to_dict('records')

with ThreadPoolExecutor(max_workers=1) as executor:
    results = list(tqdm(executor.map(process_url, rows), total=len(rows)))

driver.quit()

# Crear DataFrames para resultados exitosos y con datos faltantes
df_results = pd.DataFrame(results)
df_results=pd.merge(df_nuevo,df_results,on='url', how='inner')

# Unir los DataFrames utilizando la columna 'url'
df_successful = df_results

# Eliminar elementos duplicados de la base
df_successful.drop_duplicates(inplace=True)

# Columnas clave para identificar duplicados
columnas_clave = ['current_price_x', 'color', 'mpn', 'reference_code']

# Crear un DataFrame con los duplicados eliminados
df_sin_duplicados = df_successful.drop_duplicates(subset=columnas_clave, keep='first')

# Identificar las filas descartadas como la diferencia entre los originales y los sin duplicados
descartes = df_successful.loc[~df_successful.index.isin(df_sin_duplicados.index)]

# Guardar los descartes en un archivo CSV
descartes.to_csv('../results/articulos_descartados.csv', index=False)

print(f"Se guardaron {len(descartes)} filas descartadas en 'descartes.csv'.")

# Mostrar resultados
print("Resultados:")
print(df_successful.shape)
print("Resultados sin duplicados:")
print(df_sin_duplicados.shape)
df_sin_duplicados.head()

  script_js = soup.find("script", text=re.compile("inditex.iParams"))
 15%|█▍        | 104/717 [03:06<36:56,  3.62s/it]

Error procesando la URL https://www.pullandbear.com/es/camiseta-rayas-manga-corta-l07248563?cS=615&pelement=621954144: Message: 
Stacktrace:
	GetHandleVerifier [0x00B56FB3+25091]
	(No symbol) [0x00ADE5D4]
	(No symbol) [0x009BB353]
	(No symbol) [0x009FF4BC]
	(No symbol) [0x009FF63B]
	(No symbol) [0x00A3D8B2]
	(No symbol) [0x00A21F24]
	(No symbol) [0x00A3B46E]
	(No symbol) [0x00A21C76]
	(No symbol) [0x009F3185]
	(No symbol) [0x009F430D]
	GetHandleVerifier [0x00E4D5B3+3131395]
	GetHandleVerifier [0x00E5DDA4+3198964]
	GetHandleVerifier [0x00E58CC2+3178258]
	GetHandleVerifier [0x00BF3290+664800]
	(No symbol) [0x00AE744D]
	(No symbol) [0x00AE4798]
	(No symbol) [0x00AE4936]
	(No symbol) [0x00AD7030]
	BaseThreadInitThunk [0x75947BA9+25]
	RtlInitializeExceptionChain [0x7771C0CB+107]
	RtlClearBits [0x7771C04F+191]



 19%|█▉        | 136/717 [04:08<28:27,  2.94s/it]

Error procesando la URL https://www.pullandbear.com/es/camiseta-jujutsu-kaisen-l07249926?cS=250&pelement=645405279: Message: 
Stacktrace:
	GetHandleVerifier [0x00B56FB3+25091]
	(No symbol) [0x00ADE5D4]
	(No symbol) [0x009BB353]
	(No symbol) [0x009FF4BC]
	(No symbol) [0x009FF63B]
	(No symbol) [0x00A3D8B2]
	(No symbol) [0x00A21F24]
	(No symbol) [0x00A3B46E]
	(No symbol) [0x00A21C76]
	(No symbol) [0x009F3185]
	(No symbol) [0x009F430D]
	GetHandleVerifier [0x00E4D5B3+3131395]
	GetHandleVerifier [0x00E5DDA4+3198964]
	GetHandleVerifier [0x00E58CC2+3178258]
	GetHandleVerifier [0x00BF3290+664800]
	(No symbol) [0x00AE744D]
	(No symbol) [0x00AE4798]
	(No symbol) [0x00AE4936]
	(No symbol) [0x00AD7030]
	BaseThreadInitThunk [0x75947BA9+25]
	RtlInitializeExceptionChain [0x7771C0CB+107]
	RtlClearBits [0x7771C04F+191]



 61%|██████▏   | 440/717 [13:21<15:54,  3.45s/it]

Error procesando la URL https://www.pullandbear.com/es/camiseta-manga-larga-malla-l07244501?cS=802&pelement=636875458: Message: 
Stacktrace:
	GetHandleVerifier [0x00B56FB3+25091]
	(No symbol) [0x00ADE5D4]
	(No symbol) [0x009BB353]
	(No symbol) [0x009FF4BC]
	(No symbol) [0x009FF63B]
	(No symbol) [0x00A3D8B2]
	(No symbol) [0x00A21F24]
	(No symbol) [0x00A3B46E]
	(No symbol) [0x00A21C76]
	(No symbol) [0x009F3185]
	(No symbol) [0x009F430D]
	GetHandleVerifier [0x00E4D5B3+3131395]
	GetHandleVerifier [0x00E5DDA4+3198964]
	GetHandleVerifier [0x00E58CC2+3178258]
	GetHandleVerifier [0x00BF3290+664800]
	(No symbol) [0x00AE744D]
	(No symbol) [0x00AE4798]
	(No symbol) [0x00AE4936]
	(No symbol) [0x00AD7030]
	BaseThreadInitThunk [0x75947BA9+25]
	RtlInitializeExceptionChain [0x7771C0CB+107]
	RtlClearBits [0x7771C04F+191]



 75%|███████▌  | 541/717 [16:39<08:56,  3.05s/it]

Error procesando la URL https://www.pullandbear.com/es/pantalon-cargo-multibolsillos-l07678524?cS=802&pelement=655096797: Message: 
Stacktrace:
	GetHandleVerifier [0x00B56FB3+25091]
	(No symbol) [0x00ADE5D4]
	(No symbol) [0x009BB353]
	(No symbol) [0x009FF4BC]
	(No symbol) [0x009FF63B]
	(No symbol) [0x00A3D8B2]
	(No symbol) [0x00A21F24]
	(No symbol) [0x00A3B46E]
	(No symbol) [0x00A21C76]
	(No symbol) [0x009F3185]
	(No symbol) [0x009F430D]
	GetHandleVerifier [0x00E4D5B3+3131395]
	GetHandleVerifier [0x00E5DDA4+3198964]
	GetHandleVerifier [0x00E58CC2+3178258]
	GetHandleVerifier [0x00BF3290+664800]
	(No symbol) [0x00AE744D]
	(No symbol) [0x00AE4798]
	(No symbol) [0x00AE4936]
	(No symbol) [0x00AD7030]
	BaseThreadInitThunk [0x75947BA9+25]
	RtlInitializeExceptionChain [0x7771C0CB+107]
	RtlClearBits [0x7771C04F+191]



 79%|███████▉  | 566/717 [17:28<07:14,  2.88s/it]

Error procesando la URL https://www.pullandbear.com/es/pantalon-cargo-lavado-l07678525?cS=802&pelement=662006280: Message: 
Stacktrace:
	GetHandleVerifier [0x00B56FB3+25091]
	(No symbol) [0x00ADE5D4]
	(No symbol) [0x009BB353]
	(No symbol) [0x009FF4BC]
	(No symbol) [0x009FF63B]
	(No symbol) [0x00A3D8B2]
	(No symbol) [0x00A21F24]
	(No symbol) [0x00A3B46E]
	(No symbol) [0x00A21C76]
	(No symbol) [0x009F3185]
	(No symbol) [0x009F430D]
	GetHandleVerifier [0x00E4D5B3+3131395]
	GetHandleVerifier [0x00E5DDA4+3198964]
	GetHandleVerifier [0x00E58CC2+3178258]
	GetHandleVerifier [0x00BF3290+664800]
	(No symbol) [0x00AE744D]
	(No symbol) [0x00AE4798]
	(No symbol) [0x00AE4936]
	(No symbol) [0x00AD7030]
	BaseThreadInitThunk [0x75947BA9+25]
	RtlInitializeExceptionChain [0x7771C0CB+107]
	RtlClearBits [0x7771C04F+191]



100%|██████████| 717/717 [22:11<00:00,  1.86s/it]


Se guardaron 16 filas descartadas en 'descartes.csv'.
Resultados:
(688, 16)
Resultados sin duplicados:
(672, 16)


Unnamed: 0,Product Name,url,Image 1 URL,Image 2 URL,current_price_x,description,sale_price,old_price,original_price,current_price_y,color,image_url,mpn,reference_code,category_id,Stock Status
0,Camiseta Sakamoto Store,https://www.pullandbear.com/es/camiseta-sakamo...,https://static.pullandbear.net/assets/public/8...,https://static.pullandbear.net/assets/public/c...,17.99,"Camiseta blanca licencia Sakamoto con gráfico,...",,,,17.99,Blanco roto,https://static.pullandbear.net/assets/public/0...,3245/539,3245539,1030139501,In stock
1,Camiseta negra Sakamoto,https://www.pullandbear.com/es/camiseta-negra-...,https://static.pullandbear.net/assets/public/5...,https://static.pullandbear.net/assets/public/c...,17.99,Camiseta licencia Sakamoto de color negro con ...,,,,17.99,Negro,https://static.pullandbear.net/assets/public/8...,3245/918,3245918,1030139501,In stock
2,Camiseta Sakamoto Ramen,https://www.pullandbear.com/es/camiseta-sakamo...,https://static.pullandbear.net/assets/public/2...,https://static.pullandbear.net/assets/public/b...,17.99,Camiseta licencia Sakamoto de color blanco con...,,,,17.99,Blanco roto,https://static.pullandbear.net/assets/public/7...,3245/919,3245919,1030139501,In stock
3,Camiseta Honda,https://www.pullandbear.com/es/camiseta-honda-...,https://static.pullandbear.net/assets/public/f...,https://static.pullandbear.net/2/photos//2025/...,15.99,Camiseta de manga corta blanca y cuello redond...,,,,15.99,Blanco,https://static.pullandbear.net/assets/public/7...,3245/519,3245519,1030139501,In stock
4,Camiseta básica muscle fit,https://www.pullandbear.com/es/camiseta-basica...,https://static.pullandbear.net/assets/public/3...,https://static.pullandbear.net/assets/public/5...,7.99,"MUSCLE FIT. Camiseta básica de manga corta, co...",,,,7.99,Negro,https://static.pullandbear.net/assets/public/0...,3245/502,3245502,1030204791,In stock


# HOMOGENEIZAR LOS DATOS

### COLOR

In [4]:
# Función para homogeneizar colores
def homogeneizar_color(color):
    color = str(color).lower()  # Convertir a minúsculas para uniformidad
    if any(c in color for c in ['blanco', 'hueso', 'crema','crudo']):
        return 'Blanco'
    elif any(c in color for c in ['negro', 'vigoré oscuro']):
        return 'Negro'
    elif any(c in color for c in ['gris', 'vigoré', 'plomo']):
        return 'Gris'
    elif any(c in color for c in ['azul claro', 'azul flúor', 'indigo', 'celeste']):
        return 'Azul claro'
    elif any(c in color for c in ['azul', 'marino', 'indigo']):
        return 'Azul'
    elif any(c in color for c in ['verde', 'menta', 'lima', 'botella', 'pistacho']):
        return 'Verde'
    elif any(c in color for c in ['beige', 'caqui', 'hielo', 'natural', 'piedra', 'tostado','arena']):
        return 'Marrón claro'
    elif any(c in color for c in ['marrón', 'caramelo', 'tabaco', 'chocolate', 'topo', 'tierra', 'coñac']):
        return 'Marrón'
    elif any(c in color for c in ['rojo', 'granate', 'coral', 'burgundy', 'teja', 'burdeos']):
        return 'Rojo'
    elif any(c in color for c in ['rosa', 'lila', 'berenjena', 'morado', 'malva']):
        return 'Rosa/Púrpura'
    elif any(c in color for c in ['amarillo', 'mostaza']):
        return 'Amarillo'
    elif any(c in color for c in ['naranja']):
        return 'Naranja'
    elif any(c in color for c in ['varios', 'rayas']):
        return 'Multicolor'
    else:
        return 'Otros'

# Crear una nueva columna con el color homogeneizado
df_sin_duplicados['color_homogeneizado'] = df_sin_duplicados['color'].apply(homogeneizar_color)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_duplicados['color_homogeneizado'] = df_sin_duplicados['color'].apply(homogeneizar_color)


### CATEGORÍA

In [5]:
def categorizar_ropa(product_name):
    product_name = str(product_name).lower()  # Convertir a minúsculas para uniformidad
    if any(p in product_name for p in ['pack']):
        return 'Pack'
    if any(p in product_name for p in ['camiseta','sudadera manga corta']):
        return 'Camiseta'
    if any(p in product_name for p in ['sudadera', 'hoodie']):
        return 'Sudadera'
    if any(p in product_name for p in ['polo']):
        return 'Polo'
    elif any(p in product_name for p in ['sobrecamisa']):
        return 'Sobrecamisa'
    elif any(p in product_name for p in ['camisa']):
        return 'Camisa'
    elif any(p in product_name for p in ['pantalón', 'pantalones', 'jeans', 'vaqueros']):
        return 'Pantalón'
    elif any(p in product_name for p in ['jersey']):
        return 'Jersey'
    elif any(p in product_name for p in ['zapato', 'botas', 'sandalias', 'calzado']):
        return 'Calzado'
    elif any(p in product_name for p in ['accesorio', 'gorra', 'bufanda', 'cinturón', 'bolso']):
        return 'Accesorio'
    else:
        return 'Otros'

# Crear una nueva columna con la categoría de la prenda
df_sin_duplicados['Categoria'] = df_sin_duplicados['Product Name'].apply(categorizar_ropa)

# Mostrar las primeras filas para comprobar
df_sin_duplicados[['Product Name', 'Categoria']].sample(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_duplicados['Categoria'] = df_sin_duplicados['Product Name'].apply(categorizar_ropa)


Unnamed: 0,Product Name,Categoria
413,Sudadera One Piece Nico Robin,Sudadera
213,Camisa vaquera bolsillos delanteros,Camisa
312,Jersey ligero,Jersey
523,Sudadera capucha STWD,Sudadera
721,Jeans relaxed rotos,Pantalón
488,Sudadera verde lavada gráfico STWD,Sudadera
608,Pantalón cargo premium,Pantalón
371,Chaqueta chándal bandas,Otros
48,Sudadera manga corta interlock,Camiseta
648,Pantalón jogger soft knit,Pantalón


### CONTROLAR URLs FALTANTES

In [6]:
# Asignar "out of stock" a la columna 'Stock Status' donde 'image_url' sea vacío o nulo
df_sin_duplicados.loc[(df_sin_duplicados['image_url'].isna()) | (df_sin_duplicados['image_url'] == ''), 'Stock Status'] = 'Out of Stock'

### GUARDAR CSV

In [7]:
# Guardar los cambios en un archivo nuevo si lo necesitas
df_sin_duplicados.to_csv('../results/all_products_info_with_categories.csv', index=False)