In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Colab-only: mount Google Drive so later cells can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def load_and_inspect_data(csv_path):
    """Read a delimited text file into a DataFrame and print a quick overview.

    The delimiter is guessed by ``csv.Sniffer`` from the first 2048 characters
    of the file; a comma is used when the sniffer cannot decide.

    Args:
        csv_path: Path of the file to load (opened as UTF-8 for sniffing).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with the detected delimiter.
    """
    import csv

    banner = "=" * 70
    print(banner)
    print("STEP 1: LOADING AND INSPECTING DATA")
    print(banner)

    # Guess the delimiter from the head of the file only.
    with open(csv_path, 'r', encoding='utf-8') as handle:
        head = handle.read(2048)
    try:
        sep = csv.Sniffer().sniff(head).delimiter
    except csv.Error:
        # Sniffing failed; assume a plain comma-separated file.
        sep = ','

    print(f"Detected delimiter: '{sep}'")

    df = pd.read_csv(csv_path, sep=sep)

    # Overview: shape, column names, dtypes, a few rows, summary statistics.
    n_rows, n_cols = df.shape
    print(f"\nDataset Shape: {n_rows} rows √ó {n_cols} columns")
    print(f"\nColumn Names:\n{df.columns.tolist()}")

    print("\nData Types:")
    print(df.dtypes)

    print("\nFirst 3 rows:")
    print(df.head(3))

    print("\nBasic Statistics:")
    print(df.describe())

    return df

In [None]:
# ==================== STEPS 2-3: Analyze and Handle Missing Values ====================

def analyze_missing_values(df):
    """Report which columns contain nulls and how many.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame with columns 'Column', 'Missing_Count' and
        'Missing_Percentage', restricted to columns that have at least one
        null and ordered by descending count. Empty when nothing is missing.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("STEP 2: MISSING VALUES ANALYSIS")
    print(rule)

    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing_Count': null_counts.values,
        'Missing_Percentage': null_share.values
    })
    # Keep only the columns that actually have gaps, worst first.
    has_gaps = report['Missing_Count'] > 0
    report = report[has_gaps].sort_values('Missing_Count', ascending=False)

    if len(report) == 0:
        print("\nNo missing values found!")
    else:
        print("\nColumns with Missing Values:")
        print(report.to_string(index=False))

    return report

def handle_missing_values(df):
    """Drop unusable rows and impute missing prices and ratings.

    Rows without an ``image_url`` or ``title`` are removed. Missing ``price``
    values are filled with the median price of the same brand, missing
    ``rating`` values with the mean rating of the same brand; anything still
    missing afterwards gets the overall median / mean.

    Only missing cells are written. Existing prices and ratings are never
    changed, including on rows whose ``brand`` is itself missing (a grouped
    ``transform`` yields NaN for such rows, so it must not overwrite the
    column wholesale).

    Args:
        df: DataFrame with 'image_url', 'title', 'brand', 'price' and
            'rating' columns. It is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    print("\n" + "=" * 70)
    print("STEP 3: HANDLING MISSING VALUES")
    print("=" * 70)

    df_clean = df.copy()

    # 1. Remove rows without image URLs (critical for CNN)
    before = len(df_clean)
    df_clean = df_clean.dropna(subset=['image_url'])
    print(f"\n1. Removed {before - len(df_clean)} rows without image URLs")

    # 2. Remove rows without product titles (needed for categorization)
    before = len(df_clean)
    df_clean = df_clean.dropna(subset=['title'])
    print(f"2. Removed {before - len(df_clean)} rows without titles")

    # 3. Handle missing prices - fill with median by brand
    missing_prices = df_clean['price'].isnull().sum()
    if missing_prices > 0:
        print(f"3. Found {missing_prices} missing prices")

        # Per-row brand median; NaN where the brand is missing or has no prices.
        brand_median = df_clean.groupby('brand')['price'].transform('median')
        df_clean['price'] = df_clean['price'].fillna(brand_median)

        # If still missing, fill with overall median (assignment instead of
        # chained inplace fillna, which is unreliable under Copy-on-Write).
        df_clean['price'] = df_clean['price'].fillna(df_clean['price'].median())
        print(f"   Filled missing prices with brand/overall median")

    # 4. Handle missing ratings - fill with brand average
    missing_ratings = df_clean['rating'].isnull().sum()
    if missing_ratings > 0:
        print(f"4. Found {missing_ratings} missing ratings")

        brand_mean = df_clean.groupby('brand')['rating'].transform('mean')
        df_clean['rating'] = df_clean['rating'].fillna(brand_mean)

        # If still missing, fill with overall mean
        df_clean['rating'] = df_clean['rating'].fillna(df_clean['rating'].mean())
        print(f"   Filled missing ratings with brand/overall average")

    print(f"\nFinal dataset size: {len(df_clean)} rows")

    return df_clean

In [None]:
# ==================== STEPS 4-5: Data Validation and Duplicate Removal ====================

def validate_data(df):
    """Run data-quality checks and report what was found.

    Checks performed: duplicate ``product_id`` values, image URLs that do not
    start with an ``http://`` or ``https://`` scheme, negative prices, price
    outliers (outside 1.5 x IQR, reported only), ratings outside 0-5 and very
    short titles (reported only).

    Args:
        df: DataFrame with 'product_id', 'image_url', 'price', 'rating' and
            'title' columns. It is not modified.

    Returns:
        List of human-readable issue strings; empty when every hard check
        passes. Outlier and short-title counts are printed but not included.
    """
    print("\n" + "=" * 70)
    print("STEP 4: DATA VALIDATION")
    print("=" * 70)

    issues = []

    # 1. Check for duplicates
    duplicates = df.duplicated(subset=['product_id']).sum()
    if duplicates > 0:
        issues.append(f"Found {duplicates} duplicate product_ids")
        print(f"\n‚ö†Ô∏è  WARNING: {duplicates} duplicate product IDs found")
    else:
        print(f"\n‚úì No duplicate product IDs")

    # 2. Validate URLs: require the scheme at the start of the string. A plain
    #    substring test would accept values such as "ftp://host/http.jpg".
    has_http_scheme = df['image_url'].str.match(r'https?://', case=False, na=False)
    invalid_urls = int((~has_http_scheme).sum())
    if invalid_urls > 0:
        issues.append(f"Found {invalid_urls} invalid image URLs")
        print(f"‚ö†Ô∏è  WARNING: {invalid_urls} invalid image URLs")
    else:
        print(f"‚úì All image URLs are valid")

    # 3. Check price range
    negative_prices = int((df['price'] < 0).sum())
    if negative_prices > 0:
        issues.append(f"Found {negative_prices} negative prices")
        print(f"‚ö†Ô∏è  WARNING: {negative_prices} negative prices")
    else:
        print(f"‚úì All prices are non-negative")

    # Identify outliers with the 1.5 x IQR rule (informational only)
    q1 = df['price'].quantile(0.25)
    q3 = df['price'].quantile(0.75)
    iqr = q3 - q1
    is_outlier = (df['price'] < q1 - 1.5*iqr) | (df['price'] > q3 + 1.5*iqr)
    outliers = int(is_outlier.sum())
    print(f"   Found {outliers} price outliers (outside 1.5√óIQR)")

    # 4. Validate ratings
    invalid_ratings = int(((df['rating'] < 0) | (df['rating'] > 5)).sum())
    if invalid_ratings > 0:
        issues.append(f"Found {invalid_ratings} invalid ratings")
        print(f"‚ö†Ô∏è  WARNING: {invalid_ratings} ratings outside 0-5 range")
    else:
        print(f"‚úì All ratings are in valid range (0-5)")

    # 5. Check title length (informational only)
    short_titles = int((df['title'].str.len() < 10).sum())
    if short_titles > 0:
        print(f"   Found {short_titles} very short titles (<10 chars)")

    return issues

def remove_duplicates(df):
    """Keep only the first row seen for each ``product_id``.

    Args:
        df: DataFrame with a 'product_id' column. It is not modified.

    Returns:
        A new DataFrame without repeated product IDs (original index labels
        are kept).
    """
    rule = "=" * 70
    print("\n" + rule)
    print("STEP 5: REMOVING DUPLICATES")
    print(rule)

    n_before = len(df)

    # A row is a repeat when its product_id already appeared further up.
    is_repeat = df.duplicated(subset=['product_id'], keep='first')
    df_clean = df[~is_repeat]
    n_after = len(df_clean)

    print(f"\nRemoved {n_before - n_after} duplicate products")
    print(f"Remaining: {n_after} unique products")

    return df_clean

In [None]:
# ==================== STEP 6: Category Extraction ====================

def extract_categories(df):
    """Derive a ``product_category`` column from product titles.

    Each title is lower-cased and compared against per-category keyword
    lists. The first category (in declaration order) with a keyword contained
    in the title wins; titles matching nothing, and titles that are not
    strings (e.g. NaN), are labelled 'Other'.

    Args:
        df: DataFrame with a 'title' column. It is not modified.

    Returns:
        A copy of ``df`` with the added 'product_category' column.
    """
    print("\n" + "=" * 70)
    print("STEP 6: EXTRACTING PRODUCT CATEGORIES")
    print("=" * 70)

    # Define comprehensive category keywords
    category_patterns = {
        'Footwear': [
            'shoe', 'sneaker', 'boot', 'sandal', 'slipper', 'slide',
            'clog', 'flip-flop', 'flip flop', 'footwear', 'trainer',
            'loafer', 'moccasin', 'runner', 'running shoe'
        ],
        'Bags': [
            'backpack', 'bag', 'duffel', 'tote', 'handbag', 'satchel',
            'crossbody', 'luggage', 'briefcase', 'pouch', 'purse',
            'shoulder bag', 'messenger', 'clutch', 'wallet bag'
        ],
        'Tops': [
            'shirt', 't-shirt', 'tee', 'hoodie', 'sweatshirt', 'blouse',
            'tank', 'polo', 'sweater', 'jacket', 'coat', 'cardigan',
            'blazer', 'vest', 'top'
        ],
        'Bottoms': [
            'pant', 'jean', 'short', 'trouser', 'legging', 'skirt',
            'jogger', 'sweatpant', 'cargo'
        ],
        'Accessories': [
            'watch', 'belt', 'hat', 'cap', 'scarf', 'glove',
            'sunglasses', 'glasses', 'jewelry', 'jewellery', 'earring',
            'necklace', 'bracelet', 'ring', 'umbrella', 'headband',
            'tie', 'bowtie', 'suspender'
        ],
        'Socks': [
            'sock', 'hosiery', 'stocking'
        ],
        'Underwear': [
            'underwear', 'brief', 'boxer', 'bra', 'panties', 'lingerie',
            'undergarment', 'trunk'
        ],
        'Sportswear': [
            'athletic', 'sport', 'gym', 'fitness', 'training',
            'performance', 'active'
        ]
    }

    def categorize_product(title):
        """Return the first category whose keyword occurs in ``title``."""
        # Missing / non-text titles cannot be matched; do not crash on them.
        if not isinstance(title, str):
            return 'Other'

        title_lower = title.lower()

        # Dict order is the priority order (earlier categories win).
        # NOTE(review): this is plain substring matching, so short keywords
        # also hit inside longer words (e.g. 'tee' in 'steel', 'top' in
        # 'laptop'); consider word-boundary matching if labels look noisy.
        for category, keywords in category_patterns.items():
            if any(keyword in title_lower for keyword in keywords):
                return category

        return 'Other'

    # Work on a copy so the caller's frame is left untouched, matching the
    # other pipeline steps.
    df_out = df.copy()
    df_out['product_category'] = df_out['title'].apply(categorize_product)

    # Display results
    print("\nCategory Distribution:")
    category_counts = df_out['product_category'].value_counts()
    total = len(df_out)

    for category, count in category_counts.items():
        percentage = (count / total) * 100
        print(f"  {category:15s}: {count:5d} ({percentage:5.2f}%)")

    print(f"\nTotal Products: {total}")

    return df_out

In [None]:
# ==================== STEP 7: Text Cleaning ====================

def clean_text_fields(df):
    """Normalize product titles and brand names.

    Titles: every character that is not a word character, whitespace, '-' or
    '.' becomes a space, then runs of whitespace are collapsed to one space
    and the ends are trimmed. The collapse runs after the replacement so the
    spaces it introduces do not leave double or trailing blanks.

    Brands: trimmed, upper-cased, and known misspellings mapped to the
    canonical name.

    Args:
        df: DataFrame with 'title' and 'brand' columns. It is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    print("\n" + "=" * 70)
    print("STEP 7: CLEANING TEXT FIELDS")
    print("=" * 70)

    df_clean = df.copy()

    # Clean titles
    print("\n1. Cleaning product titles...")
    df_clean['title'] = (
        df_clean['title']
        .str.replace(r'[^\w\s\-\.]', ' ', regex=True)  # drop punctuation/symbols
        .str.replace(r'\s+', ' ', regex=True)          # collapse whitespace runs
        .str.strip()
    )

    # Clean brand names
    print("2. Standardizing brand names...")
    df_clean['brand'] = df_clean['brand'].str.strip().str.upper()

    # Fix common brand name issues (only real corrections are listed; brands
    # that are already canonical need no entry).
    brand_mapping = {
        'JAN SPORT': 'JANSPORT',
        'SKETCHERS': 'SKECHERS',
    }
    df_clean['brand'] = df_clean['brand'].replace(brand_mapping)

    print(f"   Found {df_clean['brand'].nunique()} unique brands")

    return df_clean

In [None]:
# ==================== STEP 8: Feature Engineering ====================

def add_features(df):
    """Add derived columns used by the later analysis cells.

    Added columns:
        price_category:  price binned at 50 / 100 / 200 / 500
                         (Budget, Economy, Mid-Range, Premium, Luxury).
        rating_category: rating binned at 3.5 / 4.0 / 4.5
                         (Low, Medium, High, Excellent).
        title_length:    number of characters in the title.
        word_count:      number of whitespace-separated tokens in the title.
        is_discounted:   True when ``price % 10 == 9``.

    Both binnings include their lower edge, so a price or rating of exactly 0
    lands in the first bin instead of becoming NaN.

    Args:
        df: DataFrame with 'price', 'rating' and 'title' columns. It is not
            modified.

    Returns:
        A copy of ``df`` with the extra columns.
    """
    print("\n" + "=" * 70)
    print("STEP 8: FEATURE ENGINEERING")
    print("=" * 70)

    df_enhanced = df.copy()

    # 1. Price bins (include_lowest keeps price == 0 in 'Budget')
    df_enhanced['price_category'] = pd.cut(
        df_enhanced['price'],
        bins=[0, 50, 100, 200, 500, float('inf')],
        labels=['Budget', 'Economy', 'Mid-Range', 'Premium', 'Luxury'],
        include_lowest=True
    )
    print("\n1. Added price_category feature")

    # 2. Rating bins (include_lowest keeps rating == 0 in 'Low')
    df_enhanced['rating_category'] = pd.cut(
        df_enhanced['rating'],
        bins=[0, 3.5, 4.0, 4.5, 5.0],
        labels=['Low', 'Medium', 'High', 'Excellent'],
        include_lowest=True
    )
    print("2. Added rating_category feature")

    # 3. Title length
    df_enhanced['title_length'] = df_enhanced['title'].str.len()
    print("3. Added title_length feature")

    # 4. Word count
    df_enhanced['word_count'] = df_enhanced['title'].str.split().str.len()
    print("4. Added word_count feature")

    # 5. Has discount (if price seems like a discount)
    # NOTE(review): this only flags prices whose value modulo 10 is exactly 9
    # (e.g. 189.00); prices such as 19.99 are not flagged. Confirm that this
    # heuristic is what is intended.
    df_enhanced['is_discounted'] = df_enhanced['price'] % 10 == 9
    print("5. Added is_discounted indicator")

    return df_enhanced

In [None]:
# ==================== STEP 9: Export Clean Data ====================

def export_clean_data(df, output_dir='cleaned_data'):
    """Write the cleaned dataset, per-category splits and a text summary.

    Files created under ``output_dir``:
        fashion_products_cleaned.csv        - the full frame
        by_category/<category>_products.csv - one file per product category
        data_summary.txt                    - counts and descriptive statistics

    Args:
        df: DataFrame with 'product_category', 'brand', 'price' and 'rating'
            columns.
        output_dir: Target directory; it and any missing parents are created.

    Returns:
        ``pathlib.Path`` of the output directory.
    """
    print("\n" + "=" * 70)
    print("STEP 9: EXPORTING CLEANED DATA")
    print("=" * 70)

    output_path = Path(output_dir)
    # parents=True so a nested target such as 'out/run1/cleaned' works too.
    output_path.mkdir(parents=True, exist_ok=True)

    # 1. Export full cleaned dataset
    csv_path = output_path / 'fashion_products_cleaned.csv'
    df.to_csv(csv_path, index=False)
    print(f"\n1. Saved cleaned dataset: {csv_path}")
    print(f"   Shape: {df.shape}")

    # 2. Export by category
    category_dir = output_path / 'by_category'
    category_dir.mkdir(parents=True, exist_ok=True)

    # One pass over the groups instead of re-filtering the frame per category.
    for category, cat_df in df.groupby('product_category', sort=False):
        cat_path = category_dir / f'{category.lower()}_products.csv'
        cat_df.to_csv(cat_path, index=False)

    print(f"2. Saved category-specific files to: {category_dir}")

    # 3. Export data summary (explicit encoding: category or brand names may
    #    contain non-ASCII characters and the platform default varies)
    summary_path = output_path / 'data_summary.txt'
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("FASHION DATASET SUMMARY\n")
        f.write("=" * 70 + "\n\n")

        f.write(f"Total Products: {len(df)}\n")
        f.write(f"Unique Brands: {df['brand'].nunique()}\n")
        f.write(f"Categories: {df['product_category'].nunique()}\n\n")

        f.write("Category Distribution:\n")
        f.write(df['product_category'].value_counts().to_string())
        f.write("\n\n")

        f.write("Price Statistics:\n")
        f.write(df['price'].describe().to_string())
        f.write("\n\n")

        f.write("Rating Statistics:\n")
        f.write(df['rating'].describe().to_string())
        f.write("\n\n")

        f.write("Top 10 Brands:\n")
        f.write(df['brand'].value_counts().head(10).to_string())

    print(f"3. Saved data summary: {summary_path}")

    return output_path

In [None]:
def run_preprocessing_pipeline(csv_path):
    """Execute all preprocessing stages in order and return the final frame.

    Stages: load/inspect, missing-value analysis and handling, validation,
    de-duplication, category extraction, text cleaning, feature engineering
    and export. A short summary is printed at the end.

    Args:
        csv_path: Path of the raw product CSV.

    Returns:
        The fully processed DataFrame (the same data that was exported).
    """
    rule = "=" * 70
    print("\n" + rule)
    print("FASHION DATASET PREPROCESSING PIPELINE")
    print(rule)

    # Step 1: load
    raw_df = load_and_inspect_data(csv_path)

    # Steps 2-3: report on, then repair, missing values
    analyze_missing_values(raw_df)
    filled_df = handle_missing_values(raw_df)

    # Steps 4-5: validation report, then drop repeated products
    validate_data(filled_df)
    unique_df = remove_duplicates(filled_df)

    # Steps 6-8: categories, text normalization, derived features
    categorized_df = extract_categories(unique_df)
    tidy_df = clean_text_fields(categorized_df)
    df = add_features(tidy_df)

    # Step 9: write everything to disk
    output_path = export_clean_data(df)

    # Closing summary
    lowest_price = df['price'].min()
    highest_price = df['price'].max()

    print("\n" + rule)
    print("PREPROCESSING COMPLETE!")
    print(rule)
    print("\nFinal Dataset Statistics:")
    print(f"  Total Products: {len(df)}")
    print(f"  Categories: {df['product_category'].nunique()}")
    print(f"  Brands: {df['brand'].nunique()}")
    print(f"  Price Range: {lowest_price:.2f} - {highest_price:.2f} AED")
    print(f"  Average Rating: {df['rating'].mean():.2f}")

    print(f"\n‚úì Cleaned data saved to: {output_path}")
    print("\nNext Step: Run image download script")

    return df

In [None]:
# Entry point for the preprocessing cell: runs every stage on the raw CSV.
if __name__ == "__main__":
    # Run the complete pipeline
    csv_file = 'products.csv'  # Change to your file path

    # The pipeline also exports the result (default directory: cleaned_data/).
    # NOTE(review): later cells read the cleaned CSV from
    # /content/drive/MyDrive/fashion_products_cleaned.csv; presumably the
    # exported file is copied there manually. Confirm.
    cleaned_df = run_preprocessing_pipeline(csv_file)

    print("\n" + "=" * 70)
    print("Ready for image download and model training!")
    print("=" * 70)


FASHION DATASET PREPROCESSING PIPELINE
STEP 1: LOADING AND INSPECTING DATA
Detected delimiter: ','

Dataset Shape: 13156 rows √ó 8 columns

Column Names:
['product_id', 'brand', 'title', 'price', 'category', 'rating', 'image_url', 'product_url']

Data Types:
product_id      object
brand           object
title           object
price          float64
category        object
rating         float64
image_url       object
product_url     object
dtype: object

First 3 rows:
   product_id     brand                                              title  \
0  B08YRWN3WB  JANSPORT  Big Student Large laptop backpack Black EK0A5B...   
1  B08YRXFZZM  JANSPORT                                Superbreak Day Pack   
2  B09Q2PQ7ZB   BAODINI  Mini Travel Umbrella With Case Small Compact U...   

    price    category  rating  \
0  189.00  New season     4.7   
1  119.00  New season     4.6   
2   17.79  New season     4.2   

                                           image_url  \
0  https://m.media-amazon

In [None]:
# Global plot defaults: seaborn "whitegrid" theme and a 12x6 inch default
# figure size (the plotting functions below pass their own figsize).
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# ==================== 1. CATEGORY DISTRIBUTION ====================

def plot_category_distribution(df, save_path='plots'):
    """Save a two-panel figure (horizontal bars and pie) of products per category.

    Args:
        df: DataFrame with a 'product_category' column.
        save_path: Directory for the image; created when missing.

    The figure is written to ``<save_path>/category_distribution.png`` and
    then closed; nothing is returned.
    """
    Path(save_path).mkdir(exist_ok=True)

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 6))
    counts = df['product_category'].value_counts()

    # Left panel: horizontal bars with the count written next to each bar.
    bar_ax.barh(counts.index, counts.values, color='steelblue')
    bar_ax.set_xlabel('Number of Products', fontsize=12, fontweight='bold')
    bar_ax.set_title('Product Count by Category', fontsize=14, fontweight='bold')
    bar_ax.grid(axis='x', alpha=0.3)

    for row, n_products in enumerate(counts.values):
        bar_ax.text(n_products + 50, row, str(n_products), va='center', fontweight='bold')

    # Right panel: share of each category.
    palette = plt.cm.Set3(range(len(counts)))
    _, _, pct_labels = pie_ax.pie(
        counts.values,
        labels=counts.index,
        autopct='%1.1f%%',
        colors=palette,
        startangle=90
    )
    pie_ax.set_title('Category Distribution', fontsize=14, fontweight='bold')

    for pct_label in pct_labels:
        pct_label.set_color('white')
        pct_label.set_fontweight('bold')

    plt.tight_layout()
    plt.savefig(f'{save_path}/category_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()

# ==================== 2. CLASS IMBALANCE ====================

def plot_class_balance(df, save_path='plots'):
    """Plot per-class sample counts and imbalance ratios, then print a report.

    The imbalance ratio of a category is the size of the largest category
    divided by its own size. Ratios below 2 are drawn green, below 5 orange,
    otherwise red; categories above 5x are listed in the printed report.

    Args:
        df: DataFrame with a 'product_category' column.
        save_path: Directory for the image; created when missing.

    The figure is written to ``<save_path>/class_balance.png`` and closed.
    """
    Path(save_path).mkdir(exist_ok=True)

    fig, (count_ax, ratio_ax) = plt.subplots(1, 2, figsize=(16, 6))

    category_counts = df['product_category'].value_counts()
    positions = range(len(category_counts))

    # Panel 1: samples per class with a minimum-size guide line.
    count_ax.bar(positions, category_counts.values, color='steelblue')
    count_ax.set_xticks(positions)
    count_ax.set_xticklabels(category_counts.index, rotation=45, ha='right')
    count_ax.set_ylabel('Number of Samples', fontsize=11, fontweight='bold')
    count_ax.set_title('Class Balance Check', fontsize=13, fontweight='bold')
    count_ax.grid(axis='y', alpha=0.3)

    min_recommended = 300
    count_ax.axhline(y=min_recommended, color='r', linestyle='--',
                     linewidth=2, label=f'Min Recommended: {min_recommended}')
    count_ax.legend()

    for pos, n_samples in enumerate(category_counts.values):
        count_ax.text(pos, n_samples + 50, str(n_samples), ha='center', fontweight='bold')

    # Panel 2: how many times smaller each class is than the largest one.
    imbalance_ratios = category_counts.max() / category_counts

    def severity_color(ratio):
        """Traffic-light colour for an imbalance ratio."""
        if ratio < 2:
            return 'green'
        if ratio < 5:
            return 'orange'
        return 'red'

    bar_colors = [severity_color(ratio) for ratio in imbalance_ratios]

    ratio_ax.barh(category_counts.index, imbalance_ratios.values, color=bar_colors)
    ratio_ax.set_xlabel('Imbalance Ratio (Max/Current)', fontsize=11, fontweight='bold')
    ratio_ax.set_title('Class Imbalance Analysis', fontsize=13, fontweight='bold')
    ratio_ax.axvline(x=2, color='orange', linestyle='--', linewidth=1, alpha=0.5, label='Moderate (2x)')
    ratio_ax.axvline(x=5, color='red', linestyle='--', linewidth=1, alpha=0.5, label='Severe (5x)')
    ratio_ax.legend()
    ratio_ax.grid(axis='x', alpha=0.3)
    ratio_ax.invert_yaxis()

    for pos, ratio in enumerate(imbalance_ratios.values):
        ratio_ax.text(ratio + 0.1, pos, f'{ratio:.2f}x', va='center', fontweight='bold')

    plt.tight_layout()
    plt.savefig(f'{save_path}/class_balance.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Text report of the severely under-represented classes.
    print("\n" + "=" * 70)
    print("Class Balance Report:")
    print("-" * 70)
    severe_imbalance = imbalance_ratios[imbalance_ratios > 5]
    if len(severe_imbalance) == 0:
        print("‚úì Class balance is acceptable (all classes within 5x of largest)")
    else:
        print("Categories with severe imbalance (>5x):")
        for cat, ratio in severe_imbalance.items():
            print(f"  - {cat}: {category_counts[cat]} samples ({ratio:.2f}x)")
    print("=" * 70)

# ==================== 3. CORRELATION HEATMAP ====================

def plot_correlation_heatmap(df, save_path='plots'):
    """Save a correlation heatmap of the numeric features present in ``df``.

    Considers 'price', 'rating', 'title_length' and 'word_count'; columns
    missing from ``df`` are ignored. With fewer than two of them available a
    message is printed and no file is written.

    Args:
        df: DataFrame holding some of the numeric columns listed above.
        save_path: Directory for the image; created when missing.
    """
    Path(save_path).mkdir(exist_ok=True)

    # Keep only the candidate columns that exist in this frame.
    candidates = ['price', 'rating', 'title_length', 'word_count']
    available = [name for name in candidates if name in df.columns]

    if len(available) < 2:
        print("Not enough numerical features for correlation analysis")
        return

    correlations = df[available].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlations,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        fmt='.3f',
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(f'{save_path}/correlation_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()

# ==================== MAIN EXECUTION ====================

def run_essential_visualizations(csv_path, plot_dir='plots'):
    """Load the cleaned CSV and generate the three diagnostic plots.

    Args:
        csv_path: Path of the cleaned dataset (must contain
            'product_category').
        plot_dir: Directory that receives the PNG files.
    """
    # Load cleaned data
    print("Loading dataset...")
    df = pd.read_csv(csv_path)
    print(f"‚úì Loaded {len(df):,} records")
    print(f"‚úì Found {df['product_category'].nunique()} categories\n")

    print("Generating visualizations...\n")

    # Each plotter takes (frame, directory) and saves one image.
    plotters = (
        plot_category_distribution,
        plot_class_balance,
        plot_correlation_heatmap,
    )
    for make_plot in plotters:
        make_plot(df, plot_dir)

    rule = "=" * 70
    print("\n" + rule)
    print("VISUALIZATION COMPLETE!")
    print(rule)
    print(f"\nAll plots saved to: {plot_dir}/")
    for image_name in ('category_distribution.png', 'class_balance.png', 'correlation_heatmap.png'):
        print(f"  - {image_name}")

# ==================== USAGE ====================

# Entry point for the visualization cell.
if __name__ == "__main__":
    # Run on cleaned data (the file written by export_clean_data with its
    # default output directory)
    csv_file = 'cleaned_data/fashion_products_cleaned.csv'

    run_essential_visualizations(csv_file, plot_dir='plots')

Loading dataset...
‚úì Loaded 11,368 records
‚úì Found 9 categories

Generating visualizations...


Class Balance Report:
----------------------------------------------------------------------
Categories with severe imbalance (>5x):
  - Socks: 297 samples (8.10x)
  - Underwear: 159 samples (15.13x)
  - Sportswear: 101 samples (23.81x)

VISUALIZATION COMPLETE!

All plots saved to: plots/
  - category_distribution.png
  - class_balance.png
  - correlation_heatmap.png


In [None]:
import pandas as pd
import requests
from pathlib import Path
from PIL import Image
from io import BytesIO
import time
import warnings
warnings.filterwarnings('ignore')

# ==================== DOWNLOAD FUNCTION ====================

def _download_image_with_retries(image_url, image_path, headers, max_retries=3, retry_delay=1):
    """Fetch one image and store it as an RGB JPEG, retrying on failure.

    Returns None on success, otherwise a short description of the error from
    the last attempt ('HTTP <status>' or the first 100 characters of the
    exception text).
    """
    last_error = 'no download attempt was made'

    for attempt in range(max_retries):
        try:
            response = requests.get(image_url, timeout=15, headers=headers)

            if response.status_code == 200:
                # Decode and normalize to RGB so every file is a plain JPEG.
                img = Image.open(BytesIO(response.content)).convert('RGB')
                img.save(image_path, 'JPEG', quality=95)
                return None

            last_error = f'HTTP {response.status_code}'
        except Exception as e:
            # Best effort: network, decoding and disk errors are all recorded
            # and retried rather than aborting the whole run.
            last_error = str(e)[:100]

        # Wait before retry
        if attempt < max_retries - 1:
            time.sleep(retry_delay)

    return last_error


def download_images(csv_path, output_dir='/content/drive/MyDrive/fashion_cnn/data/images',
                    failed_log_path=None):
    """Download images from URLs and organize by category - SAVED TO GOOGLE DRIVE

    Reads the cleaned CSV and stores each product image as
    ``<output_dir>/<product_category>/<product_id>.jpg``. Images that already
    exist are skipped, so the function can be re-run to resume. Each download
    is attempted up to three times; progress is printed every 100 rows and the
    loop pauses for 2 seconds after every 100th row that was not skipped.

    Args:
        csv_path: CSV with 'product_category', 'product_id' and 'image_url'.
        output_dir: Root directory for the images (created if missing).
        failed_log_path: Where to write the CSV of failed downloads. Defaults
            to '/content/drive/MyDrive/fashion_cnn/failed_downloads.csv'.

    Returns:
        List of dicts ('product_id', 'category', 'url', 'error'), one per
        image that could not be downloaded.
    """

    print("=" * 70)
    print("DOWNLOADING IMAGES TO GOOGLE DRIVE")
    print("=" * 70)

    # Load cleaned data
    df = pd.read_csv(csv_path)
    total = len(df)
    print(f"\nTotal products to download: {total}")
    print(f"Categories: {df['product_category'].nunique()}")

    # Show category breakdown
    print("\nImages per category:")
    category_counts = df['product_category'].value_counts()
    for cat, count in category_counts.items():
        print(f"  {cat:15s}: {count}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"\nüìÅ Saving to Google Drive: {output_dir}")
    print("   (Images will persist even after Colab disconnects)")

    failed_downloads = []
    success_count = 0
    skipped_count = 0

    print(f"\nStarting download...")
    print("-" * 70)

    # Set headers to mimic browser (loop-invariant, so built once)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    def report_progress(done):
        """Print a status line on every 100th processed row."""
        if done % 100 == 0:
            print(f"Progress: {done}/{total} ({done/total*100:.1f}%) - Downloaded: {success_count}, Skipped: {skipped_count}, Failed: {len(failed_downloads)}")

    # Count rows by position rather than by index label so progress stays
    # correct even if the frame does not carry a default 0..n-1 index.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        category = row['product_category']
        product_id = row['product_id']
        image_url = row['image_url']

        # Create category directory
        category_dir = output_path / category
        category_dir.mkdir(parents=True, exist_ok=True)

        image_path = category_dir / f"{product_id}.jpg"

        # Skip if already downloaded
        if image_path.exists():
            skipped_count += 1
            report_progress(position)
            continue

        error = _download_image_with_retries(image_url, image_path, headers, max_retries=3)
        if error is None:
            success_count += 1
        else:
            failed_downloads.append({
                'product_id': product_id,
                'category': category,
                'url': image_url,
                'error': error
            })

        report_progress(position)

        # Rate limiting - pause every 100 images
        if position % 100 == 0:
            time.sleep(2)

    # Final summary
    print("\n" + "=" * 70)
    print("DOWNLOAD COMPLETE!")
    print("=" * 70)
    print(f"\n‚úì Successfully downloaded: {success_count}")
    print(f"‚äô Already existed (skipped): {skipped_count}")
    print(f"‚úó Failed: {len(failed_downloads)}")
    print(f"\nTotal processed: {success_count + skipped_count + len(failed_downloads)}/{total}")

    # Guard against an empty CSV (nothing to divide by).
    success_rate = ((success_count + skipped_count) / total) * 100 if total else 0.0
    print(f"Success rate: {success_rate:.2f}%")

    # Save failed downloads to Google Drive
    if len(failed_downloads) > 0:
        if failed_log_path is None:
            failed_path = '/content/drive/MyDrive/fashion_cnn/failed_downloads.csv'
        else:
            failed_path = failed_log_path
        # Make sure the target folder exists so a long run cannot crash here.
        Path(failed_path).parent.mkdir(parents=True, exist_ok=True)
        print(f"\n‚ö†Ô∏è  Some downloads failed. Details saved to Google Drive:")
        print(f"   {failed_path}")
        failed_df = pd.DataFrame(failed_downloads)
        failed_df.to_csv(failed_path, index=False)

    # Verify directory structure
    print(f"\n" + "=" * 70)
    print("VERIFICATION - FILES PER CATEGORY")
    print("=" * 70)

    for category in sorted(df['product_category'].unique()):
        category_path = output_path / category
        if category_path.exists():
            file_count = len(list(category_path.glob('*.jpg')))
            expected = len(df[df['product_category'] == category])
            # A category passes when at least 95% of its images are on disk.
            status = "‚úì" if file_count >= expected * 0.95 else "‚ö†Ô∏è"
            print(f"  {status} {category:15s}: {file_count:5d} / {expected:5d} images")
        else:
            print(f"  ‚úó {category:15s}: Directory not found!")

    print("\n" + "=" * 70)
    print(f"üìÅ All images saved to Google Drive!")
    print(f"   Location: {output_dir}")
    print("=" * 70)

    return failed_downloads

# ==================== RUN DOWNLOAD ====================

# Driver for the download cell: runs at cell execution time (no __main__ guard).
print("Starting image download script...")
print("Images will be saved to Google Drive (permanent storage)")
print("This will take approximately 2-3 hours for 11,000+ images\n")

# Update paths to use Google Drive
# NOTE(review): the preprocessing cell exports to cleaned_data/ by default;
# presumably the cleaned CSV was copied to this Drive location. Confirm.
csv_file = '/content/drive/MyDrive/fashion_products_cleaned.csv'
output_folder = '/content/drive/MyDrive/fashion_cnn/data/images'

# Run download; `failed` is the list of per-image failure records
failed = download_images(csv_file, output_dir=output_folder)

print("\n‚úÖ Download process complete!")
print(f"‚úÖ {len(failed)} images failed to download")
print("\nüìÅ Check your Google Drive: MyDrive/fashion_cnn/data/images/")
print("\nNext step: Update training script to use Google Drive paths")

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# ==================== CONFIGURATION ====================

class Config:
    """Central settings for the training cells: paths, hyperparameters, split ratios, device and seed."""
    # Google Drive Paths
    CSV_PATH = '/content/drive/MyDrive/fashion_products_cleaned.csv'  # cleaned dataset
    IMAGE_DIR = '/content/drive/MyDrive/fashion_cnn/data/images'  # <category>/<product_id>.jpg files
    MODEL_SAVE_PATH = '/content/drive/MyDrive/fashion_cnn/models'  # created below if missing
    RESULTS_PATH = '/content/drive/MyDrive/fashion_cnn/results'  # created below if missing

    # Training parameters
    BATCH_SIZE = 32
    LEARNING_RATE = 0.001
    NUM_EPOCHS = 25
    EARLY_STOPPING_PATIENCE = 5

    # Data split (fractions of the full dataset)
    # NOTE(review): TRAIN_SPLIT is not read by prepare_data, which derives the
    # training share from VAL_SPLIT + TEST_SPLIT; keep the three values
    # summing to 1.
    TRAIN_SPLIT = 0.70
    VAL_SPLIT = 0.15
    TEST_SPLIT = 0.15

    # Model
    IMG_SIZE = 224  # side length, in pixels, of the square images fed to the model
    NUM_WORKERS = 2  # Reduced for Google Drive

    # Device: use the GPU when one is available, otherwise the CPU
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Random seed
    RANDOM_SEED = 42

# Seed PyTorch and NumPy so runs are repeatable
torch.manual_seed(Config.RANDOM_SEED)
np.random.seed(Config.RANDOM_SEED)

# Create the model and results directories (including parents) up front
Path(Config.MODEL_SAVE_PATH).mkdir(parents=True, exist_ok=True)
Path(Config.RESULTS_PATH).mkdir(parents=True, exist_ok=True)

# ==================== DATASET CLASS ====================

class FashionDataset(Dataset):
    """Custom Dataset for fashion images

    Each item is ``(image, label)``, where the image is read from
    ``<img_dir>/<product_category>/<product_id>.jpg`` and the label is the
    integer index of the row's category in ``self.categories``.

    Args:
        dataframe: Rows with 'product_category' and 'product_id' columns.
        img_dir: Root directory of the downloaded images.
        transform: Optional callable applied to the PIL image.
        categories: Optional explicit list of category names that defines the
            label indices. Pass the same list to the train, validation and
            test datasets so that a label index always means the same
            category, even if a split lacks one of them. When omitted, the
            sorted categories found in ``dataframe`` are used.

    Raises:
        ValueError: if ``categories`` is given and ``dataframe`` contains a
            category that is not in it.
    """
    def __init__(self, dataframe, img_dir, transform=None, categories=None):
        self.df = dataframe.reset_index(drop=True)
        self.img_dir = Path(img_dir)
        self.transform = transform

        # Create label mapping
        present = dataframe['product_category'].unique()
        if categories is None:
            self.categories = sorted(present)
        else:
            self.categories = list(categories)
            unknown = sorted(set(present) - set(self.categories))
            if unknown:
                raise ValueError(f"Categories missing from label list: {unknown}")
        self.label_map = {cat: idx for idx, cat in enumerate(self.categories)}
        self.idx_to_label = {idx: cat for cat, idx in self.label_map.items()}

    def __len__(self):
        """Number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]

        # Load image
        img_path = self.img_dir / row['product_category'] / f"{row['product_id']}.jpg"

        try:
            image = Image.open(img_path).convert('RGB')
        except Exception:
            # Best effort: a missing or unreadable file yields a blank white
            # image so the batch can still be built.
            # NOTE(review): such samples keep their real label, so images that
            # failed to download silently become white training examples;
            # consider filtering the dataframe to existing files beforehand.
            image = Image.new('RGB', (Config.IMG_SIZE, Config.IMG_SIZE), color='white')

        # Apply transforms
        if self.transform:
            image = self.transform(image)

        # Get label
        label = self.label_map[row['product_category']]

        return image, label

# ==================== DATA TRANSFORMS ====================

def get_transforms(train=True):
    """Build the image preprocessing pipeline for one data split.

    Args:
        train: When truthy, return the augmenting pipeline (resize to 256x256,
            random crop to ``Config.IMG_SIZE``, random flip, rotation and
            colour jitter). Otherwise return the plain resize pipeline.

    Returns:
        A ``transforms.Compose`` ending in ``ToTensor`` and ``Normalize``.
    """
    # Steps shared by both pipelines; the mean/std values look like the usual
    # ImageNet statistics (assumption, not stated in this file)
    closing_steps = [
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]

    if not train:
        plain_resize = [transforms.Resize((Config.IMG_SIZE, Config.IMG_SIZE))]
        return transforms.Compose(plain_resize + closing_steps)

    augmentations = [
        transforms.Resize((256, 256)),
        transforms.RandomCrop(Config.IMG_SIZE),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ]
    return transforms.Compose(augmentations + closing_steps)

# ==================== DATA PREPARATION ====================

def prepare_data(csv_path):
    """Load the product CSV and split it into train/val/test frames.

    The split is stratified on ``product_category`` and seeded with
    ``Config.RANDOM_SEED``. The sizes of the validation and test parts follow
    ``Config.VAL_SPLIT`` and ``Config.TEST_SPLIT``; whatever remains is used
    for training.

    Args:
        csv_path: Path of the CSV file; it must contain a
            ``product_category`` column.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of DataFrames.
    """
    print("=" * 70)
    print("PREPARING DATA")
    print("=" * 70)

    # Load data
    df = pd.read_csv(csv_path)
    print(f"\nTotal products: {len(df)}")
    print(f"Categories: {df['product_category'].nunique()}")

    # Category distribution
    print("\nCategory distribution:")
    print(df['product_category'].value_counts())

    # First cut: training rows versus a combined validation+test hold-out
    holdout_fraction = Config.VAL_SPLIT + Config.TEST_SPLIT
    train_df, temp_df = train_test_split(
        df,
        test_size=holdout_fraction,
        stratify=df['product_category'],
        random_state=Config.RANDOM_SEED
    )

    # Second cut: divide the hold-out according to the configured ratio
    # instead of a fixed 50/50, so unequal VAL_SPLIT/TEST_SPLIT are honoured
    test_share_of_holdout = Config.TEST_SPLIT / holdout_fraction
    val_df, test_df = train_test_split(
        temp_df,
        test_size=test_share_of_holdout,
        stratify=temp_df['product_category'],
        random_state=Config.RANDOM_SEED
    )

    print(f"\nData split:")
    print(f"  Train: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
    print(f"  Val:   {len(val_df)} ({len(val_df)/len(df)*100:.1f}%)")
    print(f"  Test:  {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")

    return train_df, val_df, test_df

def create_dataloaders(train_df, val_df, test_df):
    """Build datasets, class weights and DataLoaders for the three splits.

    Args:
        train_df: Training rows (augmenting transforms are applied).
        val_df: Validation rows (plain resize transforms).
        test_df: Test rows (plain resize transforms).

    Returns:
        Tuple ``(train_loader, val_loader, test_loader, class_weights,
        categories)`` where ``class_weights`` is a float tensor on
        ``Config.DEVICE`` holding ``len(train_df) / count`` per class, ordered
        like ``categories`` (the training dataset's category list).
    """
    print("\n" + "=" * 70)
    print("CREATING DATALOADERS")
    print("=" * 70)

    # Create datasets
    train_dataset = FashionDataset(train_df, Config.IMAGE_DIR, transform=get_transforms(train=True))
    val_dataset = FashionDataset(val_df, Config.IMAGE_DIR, transform=get_transforms(train=False))
    test_dataset = FashionDataset(test_df, Config.IMAGE_DIR, transform=get_transforms(train=False))

    # Each dataset derives its label mapping from its own rows; reuse the
    # training mapping for the held-out splits so that a class index means
    # the same category everywhere, even if a split lacks some category
    categories = train_dataset.categories
    for held_out in (val_dataset, test_dataset):
        held_out.categories = train_dataset.categories
        held_out.label_map = train_dataset.label_map
        held_out.idx_to_label = train_dataset.idx_to_label

    # Inverse-frequency class weights, looked up by category name so their
    # order is guaranteed to match the label indices
    class_counts = train_df['product_category'].value_counts()
    total_samples = len(train_df)
    class_weights = torch.FloatTensor(
        [total_samples / class_counts.loc[cat] for cat in categories]
    )
    class_weights = class_weights.to(Config.DEVICE)

    print(f"\nClass weights (for imbalance):")
    for cat, weight in zip(categories, class_weights):
        print(f"  {cat:15s}: {weight:.3f}")

    # Options shared by all three loaders; pinned host memory only helps
    # when batches are copied to a CUDA device
    loader_options = dict(
        batch_size=Config.BATCH_SIZE,
        num_workers=Config.NUM_WORKERS,
        pin_memory=(Config.DEVICE.type == 'cuda'),
    )

    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    print(f"\nDataLoader info:")
    print(f"  Batches per epoch (train): {len(train_loader)}")
    print(f"  Batches per epoch (val):   {len(val_loader)}")
    print(f"  Batches per epoch (test):  {len(test_loader)}")

    return train_loader, val_loader, test_loader, class_weights, categories

# ==================== MODEL DEFINITION ====================

class FashionClassifier(nn.Module):
    """ResNet50 backbone with a custom two-layer classification head.

    Args:
        num_classes: Size of the final linear layer's output.
        pretrained: Forwarded to ``models.resnet50`` as ``pretrained``.
    """
    def __init__(self, num_classes, pretrained=True):
        super().__init__()

        backbone = models.resnet50(pretrained=pretrained)

        # Freeze every parameter tensor except the last 30 of the stock network
        # (this happens before the head below replaces the original fc layer)
        stock_params = list(backbone.parameters())
        for frozen in stock_params[:-30]:
            frozen.requires_grad = False

        # Swap the original fully connected layer for a new head
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(head_inputs, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )

        self.resnet = backbone

    def forward(self, x):
        """Run ``x`` through the backbone and head, returning its output."""
        return self.resnet(x)

def create_model(num_classes):
    """Instantiate the classifier on ``Config.DEVICE`` and print its size.

    Args:
        num_classes: Number of output classes for the classifier head.

    Returns:
        The ``FashionClassifier`` instance, already moved to ``Config.DEVICE``.
    """
    divider = "=" * 70
    print("\n" + divider)
    print("CREATING MODEL")
    print(divider)

    model = FashionClassifier(num_classes, pretrained=True).to(Config.DEVICE)

    # Tally parameter counts in a single pass over the model
    n_total = 0
    n_trainable = 0
    for tensor in model.parameters():
        size = tensor.numel()
        n_total += size
        if tensor.requires_grad:
            n_trainable += size
    n_frozen = n_total - n_trainable

    print(f"\nModel: ResNet50 with Transfer Learning")
    print(f"Total parameters: {n_total:,}")
    print(f"Trainable parameters: {n_trainable:,}")
    print(f"Frozen parameters: {n_frozen:,}")
    print(f"Device: {Config.DEVICE}")

    return model

# ==================== TRAINING FUNCTIONS ====================

def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Iterable of ``(images, labels)`` batches supporting ``len``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)``; the loss is averaged
        over the number of batches, the accuracy over the number of samples.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        top_class = logits.max(1)[1]
        n_seen += batch_labels.size(0)
        n_correct += (top_class == batch_labels).sum().item()

    mean_loss = loss_sum / len(train_loader)
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy

def validate(model, val_loader, criterion, device):
    """Measure loss and accuracy on ``val_loader`` without updating weights.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        val_loader: Iterable of ``(images, labels)`` batches supporting ``len``.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)``; the loss is averaged
        over the number of batches, the accuracy over the number of samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    # Gradient tracking is disabled for the whole pass
    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            loss_sum += criterion(logits, batch_labels).item()

            top_class = logits.max(1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (top_class == batch_labels).sum().item()

    mean_loss = loss_sum / len(val_loader)
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy

# ==================== MAIN TRAINING LOOP ====================

def train_model(model, train_loader, val_loader, class_weights):
    """Train ``model`` with checkpointing, LR scheduling and early stopping.

    Uses class-weighted cross-entropy and Adam at ``Config.LEARNING_RATE``.
    After every epoch the validation loss is fed to a ``ReduceLROnPlateau``
    scheduler; whenever validation accuracy beats the best value so far a
    checkpoint is written to ``<Config.MODEL_SAVE_PATH>/best_model.pth``.
    Training stops once ``Config.EARLY_STOPPING_PATIENCE`` epochs in a row
    bring no accuracy improvement, or after ``Config.NUM_EPOCHS`` epochs.

    Args:
        model: Network to train (expected to be on ``Config.DEVICE`` already).
        train_loader: Batches used for optimisation.
        val_loader: Batches used for validation.
        class_weights: Per-class weight tensor for the loss.

    Returns:
        Tuple ``(history, best_val_acc)`` where ``history`` maps
        ``train_loss``/``train_acc``/``val_loss``/``val_acc`` to per-epoch lists.
    """
    print("\n" + "=" * 70)
    print("TRAINING MODEL")
    print("=" * 70)

    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    adam = optim.Adam(model.parameters(), lr=Config.LEARNING_RATE)
    plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        adam, mode='min', patience=3, factor=0.5
    )

    history = {name: [] for name in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}

    best_val_acc = 0.0
    stale_epochs = 0  # consecutive epochs without a new best accuracy

    print(f"\nStarting training for {Config.NUM_EPOCHS} epochs")
    print(f"Batch size: {Config.BATCH_SIZE}")
    print(f"Learning rate: {Config.LEARNING_RATE}\n")

    started_at = time.time()

    for epoch in range(Config.NUM_EPOCHS):
        print(f"\nEpoch {epoch+1}/{Config.NUM_EPOCHS}")
        print("-" * 70)

        # One optimisation pass followed by one validation pass
        train_loss, train_acc = train_epoch(
            model, train_loader, loss_fn, adam, Config.DEVICE
        )
        val_loss, val_acc = validate(model, val_loader, loss_fn, Config.DEVICE)

        # The scheduler watches validation loss, not accuracy
        plateau_scheduler.step(val_loss)

        epoch_metrics = (
            ('train_loss', train_loss),
            ('train_acc', train_acc),
            ('val_loss', val_loss),
            ('val_acc', val_acc),
        )
        for name, value in epoch_metrics:
            history[name].append(value)

        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.2f}%")

        # Checkpoint only on a strictly better validation accuracy
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            stale_epochs = 0
            snapshot = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': adam.state_dict(),
                'val_acc': val_acc,
            }
            torch.save(snapshot, f'{Config.MODEL_SAVE_PATH}/best_model.pth')
            print(f"‚úì Saved best model (Val Acc: {val_acc:.2f}%)")
        else:
            stale_epochs += 1

        if stale_epochs >= Config.EARLY_STOPPING_PATIENCE:
            print(f"\nEarly stopping after {epoch+1} epochs")
            break

    elapsed = time.time() - started_at
    print(f"\nTraining complete! Time: {elapsed/60:.2f} minutes")
    print(f"Best validation accuracy: {best_val_acc:.2f}%")

    return history, best_val_acc

# ==================== EVALUATION ====================

def evaluate_model(model, test_loader, categories, device):
    """Evaluate ``model`` on the test set and print a classification report.

    Args:
        model: Trained network; switched to evaluation mode.
        test_loader: Iterable of ``(images, labels)`` batches.
        categories: Class names ordered by label index (index ``i`` names
            class ``i``).
        device: Device the image batches are moved to.

    Returns:
        Tuple ``(test_acc, cm, report)``: accuracy in percent, the confusion
        matrix with one row/column per entry of ``categories``, and the
        text classification report.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    print("\n" + "=" * 70)
    print("EVALUATING ON TEST SET")
    print("=" * 70)

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            outputs = model(images)
            _, predicted = outputs.max(1)

            all_preds.extend(predicted.cpu().numpy())
            # .cpu() is a no-op for CPU tensors and keeps this safe otherwise
            all_labels.extend(labels.cpu().numpy())

    if not all_labels:
        raise ValueError("test_loader produced no samples to evaluate")

    n_correct = np.sum(np.array(all_preds) == np.array(all_labels))
    test_acc = 100. * n_correct / len(all_labels)
    print(f"\nTest Accuracy: {test_acc:.2f}%")

    # Pin the label set explicitly: without it, a class that never shows up
    # in the labels or predictions makes `target_names` mismatch and shrinks
    # the confusion matrix, misaligning it with `categories`
    label_ids = list(range(len(categories)))

    # Classification report
    report = classification_report(
        all_labels, all_preds, labels=label_ids, target_names=categories, digits=4
    )
    print("\n" + report)

    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)

    return test_acc, cm, report

# ==================== VISUALIZATION ====================

def plot_training_history(history):
    """Save side-by-side loss and accuracy curves for train and validation.

    Args:
        history: Mapping with ``train_loss``, ``val_loss``, ``train_acc`` and
            ``val_acc`` sequences (one value per epoch).

    The figure is written to ``<Config.RESULTS_PATH>/training_history.png``
    and then closed.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (title, train key, val key, optional y-axis label) for each panel
    panel_specs = (
        ('Loss', 'train_loss', 'val_loss', None),
        ('Accuracy', 'train_acc', 'val_acc', '%'),
    )
    for ax, (title, train_key, val_key, y_label) in zip(axes, panel_specs):
        ax.plot(history[train_key], label='Train', marker='o')
        ax.plot(history[val_key], label='Val', marker='s')
        ax.set_title(title, fontweight='bold')
        ax.set_xlabel('Epoch')
        if y_label is not None:
            ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(alpha=0.3)

    target_file = f'{Config.RESULTS_PATH}/training_history.png'
    plt.tight_layout()
    plt.savefig(target_file, dpi=300)
    print(f"‚úì Saved: {target_file}")
    plt.close()

def plot_confusion_matrix(cm, categories):
    """Render the confusion matrix as an annotated heatmap and save it.

    Args:
        cm: Confusion matrix of integer counts.
        categories: Tick labels used on both axes.

    The figure is written to ``<Config.RESULTS_PATH>/confusion_matrix.png``
    and then closed.
    """
    target_file = f'{Config.RESULTS_PATH}/confusion_matrix.png'

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=categories,
        yticklabels=categories,
    )
    plt.title('Confusion Matrix', fontweight='bold')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()

    plt.savefig(target_file, dpi=300)
    print(f"‚úì Saved: {target_file}")
    plt.close()

# ==================== MAIN ====================

def main():
    """Run the whole pipeline: data, model, training, evaluation, plots.

    Reads its settings from ``Config``, reloads the best checkpoint written
    during training before evaluating on the test split, and prints a final
    summary.
    """
    divider = "=" * 70
    best_model_file = f'{Config.MODEL_SAVE_PATH}/best_model.pth'

    print("\n" + divider)
    print("FASHION CLASSIFICATION - TRAINING PIPELINE")
    print(divider)
    print(f"Device: {Config.DEVICE}\n")

    # Warn early when no GPU is visible
    gpu_ready = torch.cuda.is_available()
    if not gpu_ready:
        print("‚ö†Ô∏è  WARNING: GPU not available! Training will be VERY slow.")
        print("   Enable GPU: Runtime ‚Üí Change runtime type ‚Üí T4 GPU\n")

    # Data: split the CSV, then wrap the three frames in loaders
    splits = prepare_data(Config.CSV_PATH)
    train_loader, val_loader, test_loader, class_weights, categories = create_dataloaders(*splits)

    # Model and training
    model = create_model(len(categories))
    history, best_val_acc = train_model(model, train_loader, val_loader, class_weights)

    # Evaluate with the weights of the best checkpoint, not the last epoch
    best_state = torch.load(best_model_file)
    model.load_state_dict(best_state['model_state_dict'])
    test_acc, cm, report = evaluate_model(model, test_loader, categories, Config.DEVICE)

    # Figures
    plot_training_history(history)
    plot_confusion_matrix(cm, categories)

    print("\n" + divider)
    print("‚úÖ TRAINING COMPLETE!")
    print(divider)
    print(f"Best Val Acc: {best_val_acc:.2f}%")
    print(f"Test Acc: {test_acc:.2f}%")
    print(f"\nüìÅ Model saved: {best_model_file}")
    print(f"üìÅ Results saved: {Config.RESULTS_PATH}/")

# Start the pipeline only when this code runs as the main program
# (not when it is imported as a module)
if __name__ == "__main__":
    main()