In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import re
from IPython.display import Image, display
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
styles = pd.read_csv('styles.csv', on_bad_lines='skip')
images = pd.read_csv('images.csv')
images['id'] = images['filename'].str.replace('.jpg', '', regex=False).astype(int)
df = pd.merge(styles, images, on='id')
df.dropna(subset=['productDisplayName', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage', 'link'], inplace=True)
df['image_url'] = df['link']
df.reset_index(drop=True, inplace=True)
df.head(2)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link,image_url
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg,http://assets.myntassets.com/v1/images/style/p...,http://assets.myntassets.com/v1/images/style/p...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,39386.jpg,http://assets.myntassets.com/v1/images/style/p...,http://assets.myntassets.com/v1/images/style/p...


In [3]:
# Text preprocessing
def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    text = ' '.join(text.split())
    return text
for col in ['productDisplayName', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage']:
    df[col + '_proc'] = df[col].apply(preprocess_text)
df.head(2)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,...,link,image_url,productDisplayName_proc,gender_proc,masterCategory_proc,subCategory_proc,articleType_proc,baseColour_proc,season_proc,usage_proc
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,...,http://assets.myntassets.com/v1/images/style/p...,http://assets.myntassets.com/v1/images/style/p...,turtle check men navy blue shirt,men,apparel,topwear,shirts,navy blue,fall,casual
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,...,http://assets.myntassets.com/v1/images/style/p...,http://assets.myntassets.com/v1/images/style/p...,peter england men party blue jeans,men,apparel,bottomwear,jeans,blue,summer,casual


In [4]:
# Weighted feature combination
def build_weighted_text(row):
    return (
        (row['productDisplayName_proc'] + ' ') * 3 +
        (row['articleType_proc'] + ' ') * 2 +
        (row['baseColour_proc'] + ' ') * 2 +
        (row['gender_proc'] + ' ') +
        (row['masterCategory_proc'] + ' ') +
        (row['subCategory_proc'] + ' ') +
        (row['season_proc'] + ' ') +
        (row['usage_proc'] + ' ')
    ).strip()
df['combined'] = df.apply(build_weighted_text, axis=1)
df.head(2)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,...,image_url,productDisplayName_proc,gender_proc,masterCategory_proc,subCategory_proc,articleType_proc,baseColour_proc,season_proc,usage_proc,combined
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,...,http://assets.myntassets.com/v1/images/style/p...,turtle check men navy blue shirt,men,apparel,topwear,shirts,navy blue,fall,casual,turtle check men navy blue shirt turtle check ...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,...,http://assets.myntassets.com/v1/images/style/p...,peter england men party blue jeans,men,apparel,bottomwear,jeans,blue,summer,casual,peter england men party blue jeans peter engla...


In [5]:
# Advanced TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform(df['combined'])
print('TF-IDF shape:', tfidf_matrix.shape)

TF-IDF shape: (44077, 5000)


In [6]:
# Dimensionality reduction
svd = TruncatedSVD(n_components=200, random_state=42)
reduced_matrix = svd.fit_transform(tfidf_matrix)
print('Reduced matrix shape:', reduced_matrix.shape)

Reduced matrix shape: (44077, 200)


In [15]:
def smart_filter(df, product):
    filtered = df[(df['gender'] == product['gender']) & (df['articleType'] == product['articleType'])]
    if len(filtered) < 5:
        filtered = df[df['gender'] == product['gender']]
    return filtered

def extract_gender_and_type(query):
    query = query.lower()
    gender = None
    for g in ['men', 'women', 'boys', 'girls', 'unisex']:
        if g in query:
            gender = g.capitalize()
            break
    article_type = None
    for at in df['articleType_proc'].unique():
        if at in query:
            article_type = at
            break
    return gender, article_type

def find_product(name):
    name_proc = preprocess_text(name)
    gender, article_type = extract_gender_and_type(name_proc)
    filtered = df
    if gender:
        filtered = filtered[filtered['gender'].str.lower() == gender.lower()]
    if article_type:
        filtered = filtered[filtered['articleType_proc'] == article_type]
    match = filtered[filtered['productDisplayName_proc'].str.contains(name_proc)]
    if not match.empty:
        return match.iloc[0]
    name_words = set(name_proc.split())
    best_score = 0
    best_row = None
    for _, row in filtered.iterrows():
        prod_words = set(row['productDisplayName_proc'].split())
        score = len(name_words & prod_words) / max(1, len(name_words | prod_words))
        if score > best_score:
            best_score = score
            best_row = row
    if best_score > 0.3:
        return best_row
    return None

def recommend(query, top_n=5):
    # Preprocess query
    query_proc = preprocess_text(query)
    # Extract gender and articleType from query
    gender = None
    for g in ['men', 'women', 'boys', 'girls', 'unisex']:
        if g in query_proc:
            gender = g.capitalize()
            break
    article_type = None
    for at in df['articleType_proc'].unique():
        if at in query_proc:
            article_type = at
            break

    # Strictly filter by gender and articleType
    filtered = df.copy()
    if gender:
        filtered = filtered[filtered['gender'].str.lower() == gender.lower()]
    if article_type:
        filtered = filtered[filtered['articleType_proc'] == article_type]

    if len(filtered) == 0:
        print("❌ No products found for your query.")
        return None

    # Fuzzy match for product name in filtered set
    match = filtered[filtered['productDisplayName_proc'].str.contains(query_proc)]
    if not match.empty:
        product = match.iloc[0]
    else:
        # Fallback: just use the first in filtered
        product = filtered.iloc[0]

    # Recommendation logic
    idx = filtered.index.get_loc(product.name)
    filtered_idx = filtered.index.tolist()
    product_vec = reduced_matrix[product.name].reshape(1, -1)
    filtered_vecs = reduced_matrix[filtered_idx]
    sims = cosine_similarity(product_vec, filtered_vecs).flatten()
    top_idx = sims.argsort()[-top_n-1:-1][::-1]
    results = filtered.iloc[top_idx]
    for _, row in results.iterrows():
        print(f"\n🛍️ {row['productDisplayName']} ({row['articleType']} - {row['baseColour']})")
        display(Image(url=row['image_url'], width=200))
    return results[['productDisplayName', 'articleType', 'baseColour', 'image_url']]

In [22]:
# Example usage
recommend('atx jeans', top_n=5)


🛍️ Peter England Men Blue Jeans (Jeans - Blue)



🛍️ Peter England Men Blue Jeans (Jeans - Blue)



🛍️ Peter England Men Blue Jeans (Jeans - Blue)



🛍️ Peter England Men Blue Jeans (Jeans - Blue)



🛍️ Peter England Men Blue Jeans (Jeans - Blue)


Unnamed: 0,productDisplayName,articleType,baseColour,image_url
21082,Peter England Men Blue Jeans,Jeans,Blue,http://assets.myntassets.com/v1/images/style/p...
30687,Peter England Men Blue Jeans,Jeans,Blue,http://assets.myntassets.com/v1/images/style/p...
29627,Peter England Men Blue Jeans,Jeans,Blue,http://assets.myntassets.com/v1/images/style/p...
3858,Peter England Men Blue Jeans,Jeans,Blue,http://assets.myntassets.com/v1/images/style/p...
27814,Peter England Men Blue Jeans,Jeans,Blue,http://assets.myntassets.com/v1/images/style/p...


In [23]:
import pickle

In [24]:
# Bundle all necessary objects into a dictionary
model = {
    'df': df,
    'vectorizer': vectorizer,
    'svd': svd,
    'reduced_matrix': reduced_matrix
}

In [25]:
# Save to a .pkl file
with open('recommendation_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("✅ Model saved as recommendation_model.pkl")

✅ Model saved as recommendation_model.pkl
