In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the product metadata table. Lines that cannot be parsed are skipped
# (on_bad_lines='skip') instead of aborting the load.
styles = pd.read_csv(
    "styles.csv",
    on_bad_lines='skip',
)
# Read the image table; its `filename` column is turned into an id in a later cell.
images = pd.read_csv("images.csv")

In [3]:
# Derive an integer `id` for every image by removing the literal ".jpg" text
# from its filename (plain substring replacement, not a regex) and casting.
images['id'] = (
    images['filename']
    .str.replace('.jpg', '', regex=False)
    .astype(int)
)

In [4]:
# Join product metadata with image rows on the shared `id` column.
# No `how` is given, so pandas performs its default inner join.
df = styles.merge(images, on='id')

In [5]:

# Discard, in place, every row that is missing any field used by the later cells.
# NOTE: dropna does not renumber the index, so surviving rows keep their
# original labels and the index may contain gaps afterwards.
required_fields = [
    'productDisplayName', 'gender', 'masterCategory', 'subCategory',
    'articleType', 'baseColour', 'season', 'usage', 'link',
]
df.dropna(subset=required_fields, inplace=True)

In [6]:
# Expose the `link` column under the name `image_url`. The values are copied
# unchanged; nothing is prepended or rewritten here.
df['image_url'] = df.loc[:, 'link']

In [7]:
# Build one lowercase, space-separated text string per row for vectorization.
# Field order: gender, masterCategory, subCategory, articleType, baseColour,
# season, usage, productDisplayName.
df['combined'] = df['gender'].str.lower().str.cat(
    [
        df[field].str.lower()
        for field in (
            'masterCategory', 'subCategory', 'articleType',
            'baseColour', 'season', 'usage', 'productDisplayName',
        )
    ],
    sep=' ',
)

In [8]:
# --- TF-IDF features ---
# stop_words='english' selects scikit-learn's built-in English stop-word list;
# max_features=5000 caps the vocabulary size at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
# Learn the vocabulary from the combined text and vectorize it in one pass.
# Row i of the result corresponds to the i-th row of df by position.
tfidf_matrix = vectorizer.fit_transform(df['combined'])

In [9]:
# Persist the cleaned frame, the fitted vectorizer and the TF-IDF matrix.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, instead of relying on garbage collection of an anonymous
# file object.
with open("df.pkl", "wb") as fh:
    pickle.dump(df, fh)

with open("tfidf.pkl", "wb") as fh:
    pickle.dump(vectorizer, fh)

with open("tfidf_matrix.pkl", "wb") as fh:
    pickle.dump(tfidf_matrix, fh)