In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_and_inspect_data(csv_path, encoding='utf-8'):
    """Load a CSV file (auto-detecting its delimiter) and print a quick overview.

    The delimiter is guessed with ``csv.Sniffer`` from the first characters of
    the file. Detection is limited to the usual separators (comma, semicolon,
    tab, pipe) so that free-text columns cannot trick the sniffer into picking
    a space or a letter. If no delimiter can be determined, a comma is used.

    Args:
        csv_path: Path of the CSV file to read.
        encoding: Text encoding used both for sniffing and for parsing.
            Defaults to ``'utf-8'``.

    Returns:
        pandas.DataFrame: The parsed file contents.

    Side effects:
        Prints the detected delimiter, shape, column names, dtypes, the first
        three rows and ``DataFrame.describe()`` to stdout.
    """
    print("=" * 70)
    print("STEP 1: LOADING AND INSPECTING DATA")
    print("=" * 70)

    # Auto-detect delimiter using csv.Sniffer
    import csv

    sample_chars = 2048
    with open(csv_path, 'r', encoding=encoding) as f:
        sample = f.read(sample_chars)

    # A full-size sample was probably cut in the middle of a row; drop that
    # partial last line so its short field count does not skew the detection.
    if len(sample) == sample_chars and '\n' in sample:
        sample = sample[:sample.rfind('\n') + 1]

    try:
        # Only consider plausible separators; an unrestricted sniff may settle
        # on any character that happens to occur equally often on each line.
        sep = csv.Sniffer().sniff(sample, delimiters=',;\t|').delimiter
    except csv.Error:
        sep = ','  # fallback if detection fails

    print(f"Detected delimiter: '{sep}'")

    # Load data with the same encoding that was used for sniffing
    df = pd.read_csv(csv_path, sep=sep, encoding=encoding)

    # Basic inspection
    print(f"\nDataset Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"\nColumn Names:\n{df.columns.tolist()}")

    print("\nData Types:")
    print(df.dtypes)

    print("\nFirst 3 rows:")
    print(df.head(3))

    print("\nBasic Statistics:")
    print(df.describe())

    return df

In [None]:
def run_preprocessing_pipeline(csv_path):
    """Run the preprocessing pipeline and return the resulting DataFrame.

    Prints a pipeline banner and then loads the CSV via
    ``load_and_inspect_data``; loading is the only step performed here.

    Args:
        csv_path: Path of the CSV file to process.

    Returns:
        The DataFrame produced by ``load_and_inspect_data(csv_path)``.
    """
    print("\n" + "=" * 70)
    print("FASHION DATASET PREPROCESSING PIPELINE")
    print("=" * 70)

    # Step 1: Load data
    df = load_and_inspect_data(csv_path)

    # Return the frame explicitly; without this the function yields None and
    # a caller that binds the result gets nothing to work with.
    return df

In [None]:
if __name__ == "__main__":
    # Entry point: preprocess the product catalogue, then print a closing banner.
    csv_file = 'products.csv'  # Change to your file path
    cleaned_df = run_preprocessing_pipeline(csv_file)

    # One print call emits: blank line, rule, message, rule.
    print(
        "\n" + "=" * 70,
        "Ready for image download and model training!",
        "=" * 70,
        sep="\n",
    )


FASHION DATASET PREPROCESSING PIPELINE
STEP 1: LOADING AND INSPECTING DATA
Detected delimiter: ','

Dataset Shape: 13156 rows × 8 columns

Column Names:
['product_id', 'brand', 'title', 'price', 'category', 'rating', 'image_url', 'product_url']

Data Types:
product_id      object
brand           object
title           object
price          float64
category        object
rating         float64
image_url       object
product_url     object
dtype: object

First 3 rows:
   product_id     brand                                              title  \
0  B08YRWN3WB  JANSPORT  Big Student Large laptop backpack Black EK0A5B...   
1  B08YRXFZZM  JANSPORT                                Superbreak Day Pack   
2  B09Q2PQ7ZB   BAODINI  Mini Travel Umbrella With Case Small Compact U...   

    price    category  rating  \
0  189.00  New season     4.7   
1  119.00  New season     4.6   
2   17.79  New season     4.2   

                                           image_url  \
0  https://m.media-amazon.