# Fashion Product Images Dataset - Data Preprocessing

## 1. Dataset Overview
- **Goal**: The dataset is designed for **fashion product classification and recommendation**.
- **Source**: [Kaggle Dataset](https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset)
- **Data Files**:
  - `styles.csv`: Contains product metadata.
  - `images.csv`: Maps product IDs to image filenames.
  - `styles/`: Contains JSON metadata.
  - `images/`: Contains `.jpg` images of products.
  - `preprocessed/`: Contains all the data after preprocessing.

In [1]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

In [2]:
# Root folder of the downloaded dataset; the other paths in this notebook are built from it.
dataset_path = '../FashionRecommendationProject/Dataset/'

# Product metadata table. Lines that pandas cannot parse are skipped instead of raising.
styles_file = os.path.join(dataset_path, 'styles.csv')
df_styles = pd.read_csv(
    styles_file,
    encoding='utf-8',
    on_bad_lines='skip',
)

print("Columns in styles.csv:", list(df_styles.columns))

Columns in styles.csv: ['id', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'year', 'usage', 'productDisplayName']


In [20]:
# Directory holding the per-product JSON metadata files
styles_json_dir = os.path.join(dataset_path, "styles/")
# Directory holding the resized product images
resized_images_dir = os.path.join(dataset_path, "preprocessed/resized_images/")

## 2. Extracting Required Fields & Renaming Columns

In [28]:
# Fields copied as-is from the "data" object of every product JSON file.
# A field that is absent from a file is stored as None.
metadata_fields = (
    "price",
    "discountedPrice",
    "brandName",
    "ageGroup",
    "gender",
    "baseColour",
    "season",
)

json_data = []
json_files = os.listdir(styles_json_dir)

for json_file in json_files:
    # Only the *.json entries are product metadata
    if not json_file.endswith(".json"):
        continue

    with open(os.path.join(styles_json_dir, json_file), 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    data = raw_data.get("data", {})

    # "<id>.json" describes the image "<id>.jpg"; the filename is the join key used later
    record = {"image_filename": json_file.replace(".json", ".jpg")}
    record.update((field, data.get(field)) for field in metadata_fields)
    json_data.append(record)

df_json = pd.DataFrame(json_data)

## 3. Handling Missing Values

In [29]:
# Start from the .jpg files that are present in the resized-images folder
resized_image_filenames = [
    name for name in os.listdir(resized_images_dir) if name.endswith(".jpg")
]
df_images = pd.DataFrame({"image_filename": resized_image_filenames})

# Left join: every resized image keeps a row, even when no JSON record matches it
df_selected = df_images.merge(df_json, how="left", on="image_filename")

print("Extracted columns:", df_selected.columns.tolist())

# Text attributes: a missing entry becomes the literal label "Unknown"
for col in ["brandName", "ageGroup", "gender", "baseColour", "season"]:
    if col not in df_selected.columns:
        continue
    df_selected[col] = df_selected[col].fillna("Unknown")

# Price columns: coerce to numbers (unparseable values become NaN), then fill
# the gaps with the column median, or with 0 when the column holds no value at all
for col in ["price", "discountedPrice"]:
    if col not in df_selected.columns:
        continue
    numeric = pd.to_numeric(df_selected[col], errors="coerce")
    fill_value = numeric.median() if numeric.notnull().any() else 0
    df_selected[col] = numeric.fillna(fill_value)

print("Missing values after preprocessing:")
print(df_selected.isna().sum())


Extracted columns: ['image_filename', 'price', 'discountedPrice', 'brandName', 'ageGroup', 'gender', 'baseColour', 'season']
Missing values after preprocessing:
image_filename     0
price              0
discountedPrice    0
brandName          0
ageGroup           0
gender             0
baseColour         0
season             0
dtype: int64


## 4. Normalization of Prices

In [30]:
# Min-max scale each price column into [0, 1]; the result is stored as "<column>_norm".
for col in ["price", "discountedPrice"]:
    # Skip columns that are absent or contain no value at all
    if col not in df_selected.columns or df_selected[col].notnull().sum() == 0:
        continue

    col_min = df_selected[col].min()
    col_range = df_selected[col].max() - col_min
    shifted = df_selected[col] - col_min

    # A constant column has zero range, so dividing would yield 0/0 = NaN for
    # every row. In that case every present value maps to 0.0 instead
    # (missing values stay NaN because `shifted` is NaN for them).
    if col_range == 0:
        df_selected[col + "_norm"] = shifted.astype(float)
    else:
        df_selected[col + "_norm"] = shifted / col_range

print(df_selected.head())

  image_filename   price  discountedPrice  brandName      ageGroup gender  \
0      10000.jpg   649.0            324.0  Palm Tree    Kids-Girls  Women   
1      10001.jpg   549.0            274.0  Palm Tree    Kids-Girls  Women   
2      10002.jpg   549.0            274.0  Palm Tree    Kids-Girls  Women   
3      10003.jpg  2695.0           2695.0       Nike  Adults-Women  Women   
4      10004.jpg  1995.0           1995.0       Nike    Adults-Men    Men   

  baseColour  season  price_norm  discountedPrice_norm  
0      White  Summer    0.022418              0.011192  
1       Blue  Summer    0.018964              0.009465  
2       Blue  Summer    0.018964              0.009465  
3      White    Fall    0.093092              0.093092  
4       Grey    Fall    0.068912              0.068912  


## 5. Image Preprocessing (Resizing)

In [10]:
# Source folder with the original product images
image_dir = os.path.join(dataset_path, 'images/')
# Destination folder for the resized copies
output_dir = os.path.join(dataset_path, 'preprocessed/resized_images/')
# Create the destination (including missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

def resize_images(input_dir, output_dir, size=(1048, 1048)):
    """Resize every file in ``input_dir`` and save it under the same name in ``output_dir``.

    Args:
        input_dir: Directory the source images are read from.
        output_dir: Directory the resized images are written to. It is not
            created here, so it has to exist before the call.
        size: Size tuple forwarded unchanged to ``Image.resize``.

    Returns:
        None. A failure on a single image is printed and the loop continues
        with the next file.
    """
    for img_name in os.listdir(input_dir):
        img_path = os.path.join(input_dir, img_name)

        # Sub-directories and other non-file entries are not images; skip them
        # instead of printing a spurious error for each one.
        if not os.path.isfile(img_path):
            continue

        output_path = os.path.join(output_dir, img_name)

        try:
            # The context manager closes the source file even when resizing
            # or saving raises, so handles are not leaked over a large batch.
            with Image.open(img_path) as img:
                img.resize(size).save(output_path)
        except Exception as e:
            # Deliberately broad: one unreadable or unsavable file must not
            # abort the whole batch; it is reported and skipped.
            print(f"Error processing image {img_name}: {e}")

# Resize everything in image_dir into output_dir using the default target size.
# The message below is printed even if some images failed: resize_images
# reports per-image errors itself and keeps going.
resize_images(image_dir, output_dir)
print("Images resized and saved")

Images resized and saved


## 6. Save Preprocessed Data

In [31]:
# Persist the cleaned table twice: as CSV (without the index column) and as a
# JSON array with one object per row.
cleaned_csv_path = os.path.join(dataset_path, 'preprocessed/fashion_products_cleaned.csv')
cleaned_json_path = os.path.join(dataset_path, 'preprocessed/fashion_products_cleaned.json')

df_selected.to_csv(cleaned_csv_path, index=False)
df_selected.to_json(cleaned_json_path, orient='records', indent=4)

print('Preprocessed dataset saved')

Preprocessed dataset saved


## 7. Update on the Preprocessed Data
After the preprocessing, we realised that we do not need the records with unknown values, so we dropped them. Instead of re-running the whole process, we loaded the preprocessed data and removed the rows that contain `Unknown`.
The final files are `fashion_products_cleaned_final.csv` and `fashion_products_cleaned_final.json`.

In [33]:
preprocessed_path = os.path.join(dataset_path, "preprocessed")
styles_json_dir = os.path.join(dataset_path, "styles/")
resized_images_dir = os.path.join(preprocessed_path, "resized_images/")
df_selected = pd.read_csv(os.path.join(preprocessed_path, "fashion_products_cleaned.csv"))
# NOTE(review): read_csv's default NA parsing turns strings such as "NA" or
# "N/A" into NaN. Such values would not match "Unknown" below and would be
# kept - confirm that none occur in the filtered columns.

# Remove price_norm and discountedPrice_norm
df_selected.drop(columns=["price_norm", "discountedPrice_norm"], errors="ignore", inplace=True)

# Remove rows where any value in brandName, ageGroup, gender, baseColour, or season is "Unknown"
unknown_columns = ["brandName", "ageGroup", "gender", "baseColour", "season"]
has_unknown = df_selected[unknown_columns].eq("Unknown").any(axis=1)
filtered_df = df_selected[~has_unknown].copy()

# Write both output files BEFORE deleting any image: if a write fails, the
# image folder is still complete and the cell can simply be run again.
filtered_df.to_csv(os.path.join(preprocessed_path, "fashion_products_cleaned_final.csv"), index=False)

# Save JSON without "Unknown" values
filtered_json = filtered_df.to_dict(orient="records")
clean_json = [{k: v for k, v in record.items() if v != "Unknown"} for record in filtered_json]

with open(os.path.join(preprocessed_path, "fashion_products_cleaned_final.json"), "w", encoding="utf-8") as f:
    json.dump(clean_json, f, indent=4)

# Delete the images whose rows were removed
deleted_images = set(df_selected["image_filename"]) - set(filtered_df["image_filename"])

for img in deleted_images:
    img_path = os.path.join(resized_images_dir, img)
    try:
        os.remove(img_path)  # Delete the image file
    except FileNotFoundError:
        # Already gone, e.g. because this cell was executed before
        continue
    print(f"Deleted: {img_path}")

print("Final dataset saved! Records with 'Unknown' removed.")

Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/5924.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/6742.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/58944.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/5614.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/7350.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/3867.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/42842.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/42982.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/39336.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/5152.jpg
Deleted: ../FashionRecommendationProject/Dataset/preprocessed\resized_images/42263.jpg
Deleted: ../FashionRecommendationProject/Dataset/