In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_parquet(
    r"C:\Users\veter\Desktop\nocountry\equipo_45\doversaddlery_products_listing.parquet"
)

In [3]:
pd.options.display.max_columns = None
df

Unnamed: 0,Item_ID,Name,Stock,Description,Price,Images,URL
0,360064,Troxel¬Æ Sport 2.0‚Ñ¢ Helmet,In stock,Browse our carefully curated categories to dis...,$61.99,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/helm...
1,240084,WeatherBeeta¬Æ Free Standard Neck Turnout Sheet,In stock,Browse our carefully curated categories to dis...,$254.99,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/hors...
2,43229,Rider‚Äôs International‚Ñ¢¬†Laced Rein Dog Collar,In stock,Browse our carefully curated categories to dis...,$19.95,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/farm...
3,381034,Noble Equestrian‚Ñ¢ Ladies' Traditions Paddock B...,In stock,Browse our carefully curated categories to dis...,$139.95,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/ridi...
4,381037,Tredstep‚Ñ¢ Ladies‚Äô Donatello III Dress Boots,10 in stock,Browse our carefully curated categories to dis...,$379.00,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/ridi...
...,...,...,...,...,...,...,...
932,205167,Kastel Denmark Ladies‚Äô Crewneck Long Sleeve Sh...,4 in stock,Browse our carefully curated categories to dis...,$99.00,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/new/...
933,381948,TuffRider¬Æ Children‚Äôs Starter Back-Zip Synthet...,7 in stock,Browse our carefully curated categories to dis...,$124.99,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/kids...
934,204177,Goode Rider‚Ñ¢ Girls‚Äô Elite Polo Shirt,7 in stock,Browse our carefully curated categories to dis...,$75.00,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/clea...
935,205162,Kastel Denmark Ladies‚Äô Quarter-Zip Cap Sleeve ...,9 in stock,Browse our carefully curated categories to dis...,$89.00,[https://www.doversaddlery.com/cdn/shop/files/...,https://www.doversaddlery.com/collections/new/...


In [4]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 937 entries, 0 to 936
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Item_ID      937 non-null    str   
 1   Name         937 non-null    str   
 2   Stock        924 non-null    str   
 3   Description  937 non-null    str   
 4   Price        937 non-null    str   
 5   Images       937 non-null    object
 6   URL          937 non-null    str   
dtypes: object(1), str(6)
memory usage: 281.0+ KB


In [None]:
import numpy as np

# --- üõçÔ∏è CLEANING HORSE PRODUCTS DATABASE (FROM PARQUET) ---

# 1. Usamos el 'df' original
df_prod_clean = df.copy()

print("--- Starting Product Database Cleaning ---")

# 2. LIMPIEZA DE TEXTO (Name, Description, Item_ID)
cols_texto = ['Item_ID', 'Name', 'Description']

for col in cols_texto:
    if col in df_prod_clean.columns:
        df_prod_clean[col] = (df_prod_clean[col]
                              .astype(str)
                              .str.lower()
                              .str.strip()
                              .replace({'nan': 'unknown', 'none': 'unknown', 'sin informaci√≥n': 'unknown'}))

print("‚úÖ Text columns normalized (lowercase & English nans).")

# 3. LIMPIEZA DE PRECIO (De Texto a N√∫mero)
if 'Price' in df_prod_clean.columns:
    # Quitamos s√≠mbolos de moneda y comas
    df_prod_clean['Price'] = df_prod_clean['Price'].astype(str).str.replace(r'[$,]', '', regex=True)
    df_prod_clean['Price'] = pd.to_numeric(df_prod_clean['Price'], errors='coerce')
    # Rellenamos nulos con la mediana
    df_prod_clean['Price'] = df_prod_clean['Price'].fillna(df_prod_clean['Price'].median())

# 4. LIMPIEZA DE STOCK (De Texto a N√∫mero)
if 'Stock' in df_prod_clean.columns:
    # Extraemos el n√∫mero por si dice "10 units" o algo as√≠
    df_prod_clean['Stock'] = df_prod_clean['Stock'].astype(str).str.extract(r'(\d+)').astype(float)
    # Rellenamos los 13 nulos con 0 para ser conservadores
    df_prod_clean['Stock'] = df_prod_clean['Stock'].fillna(0).astype(int)

print("‚úÖ Price and Stock converted to numbers.")

# 5. PROTEGER LINKS (Images y URL) - No usar .lower() aqu√≠
cols_links = ['Images', 'URL']
for col in cols_links:
    if col in df_prod_clean.columns:
        df_prod_clean[col] = (df_prod_clean[col]
                              .astype(str)
                              .str.replace(r"[\[\]\'\"]", "", regex=True) # Quitar corchetes
                              .str.strip()
                              .replace({'nan': 'unknown'}))

print("‚úÖ Links and Images cleaned but case-sensitive preserved.")

# --- VERIFICACI√ìN FINAL ---
print("\n--- FINAL PRODUCT REPORT ---")
print(f"Total rows: {len(df_prod_clean)}")
print(f"Total nulls: {df_prod_clean.isnull().sum().sum()}")
print("\nQuick Preview:")
print(df_prod_clean[['Name', 'Price', 'Stock']].head())

--- Starting Product Database Cleaning ---
‚úÖ Text columns normalized (lowercase & English nans).
‚úÖ Price and Stock converted to numbers.
‚úÖ Links and Images cleaned but case-sensitive preserved.

--- FINAL PRODUCT REPORT ---
Total rows: 937
Total nulls: 0

Quick Preview:
                                                Name   Price  Stock
0                          troxel¬Æ sport 2.0‚Ñ¢ helmet   61.99      0
1     weatherbeeta¬Æ free standard neck turnout sheet  254.99      0
2       rider‚Äôs international‚Ñ¢¬†laced rein dog collar   19.95      0
3  noble equestrian‚Ñ¢ ladies' traditions paddock b...  139.95      0
4        tredstep‚Ñ¢ ladies‚Äô donatello iii dress boots  379.00     10


In [6]:
df_prod_clean.sample(100)

Unnamed: 0,Item_ID,Name,Stock,Description,Price,Images,URL
421,351369,ds ladies‚Äô stay tight leather half chaps,0,browse our carefully curated categories to dis...,89.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/ridi...
656,351543,ladies‚Äô wellesley piped knee-patch breech,2,browse our carefully curated categories to dis...,109.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/clea...
168,38882,tuffrider¬Æ ladies‚Äô starter front-zip paddock b...,0,browse our carefully curated categories to dis...,59.99,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/ridi...
602,241065,dover dog blanket,1,browse our carefully curated categories to dis...,39.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/farm...
413,352131,kerrits¬Æ ladies‚Äô balance coolcore¬Æ lite knee-p...,0,browse our carefully curated categories to dis...,99.00,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/new/...
...,...,...,...,...,...,...,...
55,381135,riding sport‚Ñ¢¬†ladies' essential leather zip pa...,0,browse our carefully curated categories to dis...,89.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/ridi...
419,4850,easyboot¬Æ cloud,0,browse our carefully curated categories to dis...,111.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/hors...
562,1650,bit butter,0,browse our carefully curated categories to dis...,23.95,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/hors...
10,22848,eqyss avocado mist conditioner,0,browse our carefully curated categories to dis...,23.99,https://www.doversaddlery.com/cdn/shop/files/0...,https://www.doversaddlery.com/collections/hors...


In [None]:
#df_prod_clean.to_parquet('products_listing_limpio.parquet', index=False)