In [12]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [13]:
def load_and_inspect_data(csv_path, candidate_delimiters=",;\t|"):
    """Load a delimited text file into a DataFrame and print a quick overview.

    The delimiter is guessed with ``csv.Sniffer`` from the first 2048
    characters of the file. If the sniffer cannot decide, a comma is used.

    Args:
        csv_path: Path of the file to load. It is opened as UTF-8 text for
            delimiter detection and then handed to ``pd.read_csv``.
        candidate_delimiters: Characters the sniffer is allowed to pick from.
            Restricting the candidates prevents the sniffer from choosing an
            ordinary character (a letter, ``_``, ...) that merely happens to
            occur the same number of times on every sampled line. Pass
            ``None`` to let the sniffer consider any character.

    Returns:
        pandas.DataFrame: The parsed contents of ``csv_path``.
    """
    # Local import: this function is the only user of the csv module.
    import csv

    sniff_chars = 2048

    print("=" * 70)
    print("STEP 1: LOADING AND INSPECTING DATA")
    print("=" * 70)

    # Auto-detect delimiter using csv.Sniffer
    with open(csv_path, 'r', encoding='utf-8') as f:
        sample = f.read(sniff_chars)

    # When the read stopped at the size limit the last line is most likely
    # incomplete; a partial row has fewer delimiters than the others and
    # skews the sniffer's per-line consistency check, so drop it.
    if len(sample) == sniff_chars and '\n' in sample:
        sample = sample[:sample.rfind('\n') + 1]

    try:
        sep = csv.Sniffer().sniff(sample, delimiters=candidate_delimiters).delimiter
    except csv.Error:
        sep = ','  # fallback if detection fails

    print(f"Detected delimiter: '{sep}'")

    # Load data
    df = pd.read_csv(csv_path, sep=sep)

    # Basic inspection
    print(f"\nDataset Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"\nColumn Names:\n{df.columns.tolist()}")

    print("\nData Types:")
    print(df.dtypes)

    print("\nFirst 3 rows:")
    print(df.head(3))

    print("\nBasic Statistics:")
    print(df.describe())

    return df

In [14]:
# ==================== STEP 2: Handle Missing Values ====================

def analyze_missing_values(df):
    """Report, per column, how many values are null and what share that is.

    Prints a summary table (or a note that nothing is missing) and returns
    a DataFrame with the columns ``Column``, ``Missing_Count`` and
    ``Missing_Percentage``. Only columns with at least one null are listed,
    ordered from most to fewest missing values.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 2: MISSING VALUES ANALYSIS")
    print(banner)

    # Null count per column and the same figure as a percentage of all rows
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    summary = pd.DataFrame({
        'Column': null_counts.index,
        'Missing_Count': null_counts.values,
        'Missing_Percentage': null_share.values
    })

    # Keep only the affected columns, worst offenders first
    affected = summary['Missing_Count'] > 0
    missing_df = summary[affected].sort_values('Missing_Count', ascending=False)

    if len(missing_df) == 0:
        print("\nNo missing values found!")
    else:
        print("\nColumns with Missing Values:")
        print(missing_df.to_string(index=False))

    return missing_df

def _fill_from_brand_then_overall(frame, column, stat):
    """Fill nulls in ``column`` with the per-brand ``stat``, then the overall ``stat``.

    Only cells that are null are touched. Rows whose ``brand`` is null get
    no per-brand value, so they keep their existing value and fall through
    to the overall statistic only if that value is itself null.
    """
    per_brand = frame.groupby('brand')[column].transform(stat)
    filled = frame[column].fillna(per_brand)
    # Overall statistic is taken after the per-brand fill.
    return filled.fillna(filled.agg(stat))


def handle_missing_values(df):
    """Drop unusable rows and impute missing prices and ratings.

    Steps, in order:
      1. Drop rows with a null ``image_url``.
      2. Drop rows with a null ``title``.
      3. Fill null ``price`` values with the median price of the same
         ``brand``; anything still null gets the overall median.
      4. Fill null ``rating`` values with the mean rating of the same
         ``brand``; anything still null gets the overall mean.

    Existing (non-null) prices and ratings are never altered, including on
    rows whose ``brand`` is null.

    Args:
        df: DataFrame with the columns ``image_url``, ``title``, ``brand``,
            ``price`` and ``rating``. It is copied, not modified.

    Returns:
        pandas.DataFrame: The cleaned copy.
    """
    print("\n" + "=" * 70)
    print("STEP 3: HANDLING MISSING VALUES")
    print("=" * 70)

    df_clean = df.copy()

    # 1. Remove rows without image URLs (critical for CNN)
    before = len(df_clean)
    df_clean = df_clean.dropna(subset=['image_url'])
    print(f"\n1. Removed {before - len(df_clean)} rows without image URLs")

    # 2. Remove rows without product titles (needed for categorization)
    before = len(df_clean)
    df_clean = df_clean.dropna(subset=['title'])
    print(f"2. Removed {before - len(df_clean)} rows without titles")

    # 3. Handle missing prices - fill with median by brand
    missing_prices = df_clean['price'].isnull().sum()
    if missing_prices > 0:
        print(f"3. Found {missing_prices} missing prices")

        # Assign the result back explicitly: calling fillna(inplace=True) on
        # a column selection is chained assignment and does not reliably
        # update the frame.
        df_clean['price'] = _fill_from_brand_then_overall(df_clean, 'price', 'median')
        print("   Filled missing prices with brand/overall median")

    # 4. Handle missing ratings - fill with brand average
    missing_ratings = df_clean['rating'].isnull().sum()
    if missing_ratings > 0:
        print(f"4. Found {missing_ratings} missing ratings")

        df_clean['rating'] = _fill_from_brand_then_overall(df_clean, 'rating', 'mean')
        print("   Filled missing ratings with brand/overall average")

    print(f"\nFinal dataset size: {len(df_clean)} rows")

    return df_clean

In [15]:
def run_preprocessing_pipeline(csv_path):
    """Run the complete preprocessing pipeline on one CSV file.

    Loads and inspects the file, prints the missing-value analysis, then
    cleans the missing values.

    Args:
        csv_path: Path of the CSV file, passed to ``load_and_inspect_data``.

    Returns:
        The cleaned DataFrame produced by ``handle_missing_values``.
    """
    print("\n" + "=" * 70)
    print("FASHION DATASET PREPROCESSING PIPELINE")
    print("=" * 70)

    # Step 1: Load data
    df = load_and_inspect_data(csv_path)

    # Step 2: Report missing values (printed for the reader; the returned
    # table is not needed here)
    analyze_missing_values(df)

    # Step 3: Handle missing values
    df = handle_missing_values(df)

    # Hand the cleaned frame back so callers can actually use the result.
    return df

In [16]:
if __name__ == "__main__":
    # Input CSV; point this at your own file if it lives elsewhere
    csv_file = 'products.csv'

    # Execute every preprocessing stage on the input file
    cleaned_df = run_preprocessing_pipeline(csv_file)

    # Closing banner, emitted in one call (same text as three separate prints)
    print(
        "\n" + "=" * 70,
        "Ready for image download and model training!",
        "=" * 70,
        sep="\n",
    )


FASHION DATASET PREPROCESSING PIPELINE
STEP 1: LOADING AND INSPECTING DATA
Detected delimiter: ','

Dataset Shape: 13156 rows × 8 columns

Column Names:
['product_id', 'brand', 'title', 'price', 'category', 'rating', 'image_url', 'product_url']

Data Types:
product_id      object
brand           object
title           object
price          float64
category        object
rating         float64
image_url       object
product_url     object
dtype: object

First 3 rows:
   product_id     brand                                              title  \
0  B08YRWN3WB  JANSPORT  Big Student Large laptop backpack Black EK0A5B...   
1  B08YRXFZZM  JANSPORT                                Superbreak Day Pack   
2  B09Q2PQ7ZB   BAODINI  Mini Travel Umbrella With Case Small Compact U...   

    price    category  rating  \
0  189.00  New season     4.7   
1  119.00  New season     4.6   
2   17.79  New season     4.2   

                                           image_url  \
0  https://m.media-amazon.