In [33]:
import numpy as np
import pandas as pd

In [34]:
styles = pd.read_csv("styles.csv", on_bad_lines='skip')
images = pd.read_csv("images.csv")

In [35]:
images['id'] = images['filename'].str.replace('.jpg', '', regex=False).astype(int)

In [36]:
images.head()

Unnamed: 0,filename,link,id
0,15970.jpg,http://assets.myntassets.com/v1/images/style/p...,15970
1,39386.jpg,http://assets.myntassets.com/v1/images/style/p...,39386
2,59263.jpg,http://assets.myntassets.com/v1/images/style/p...,59263
3,21379.jpg,http://assets.myntassets.com/v1/images/style/p...,21379
4,53759.jpg,http://assets.myntassets.com/v1/images/style/p...,53759


In [37]:
df = pd.merge(styles, images, on='id')

In [38]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,59263.jpg,http://assets.myntassets.com/v1/images/style/p...
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,21379.jpg,http://assets.myntassets.com/v1/images/style/p...
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,53759.jpg,http://assets.myntassets.com/v1/images/style/p...


In [39]:
df.shape

(44424, 12)

In [40]:
df.isnull().sum()

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
filename                0
link                    0
dtype: int64

In [41]:
# Drop rows with missing critical info
df.dropna(subset=[
    'productDisplayName', 'gender', 'masterCategory', 'subCategory',
    'articleType', 'baseColour', 'season', 'usage', 'link'
], inplace=True)

In [42]:
df.isnull().sum()

id                    0
gender                0
masterCategory        0
subCategory           0
articleType           0
baseColour            0
season                0
year                  0
usage                 0
productDisplayName    0
filename              0
link                  0
dtype: int64

In [43]:
# ✅ Use working image URL directly
df['image_url'] = df['link']

In [45]:
df['image_url'][1]

'http://assets.myntassets.com/v1/images/style/properties/4850873d0c417e6480a26059f83aac29_images.jpg'

In [46]:
# --- STEP 3: Combine Features ---
df['combined'] = (
    df['gender'].str.lower() + ' ' +
    df['masterCategory'].str.lower() + ' ' +
    df['subCategory'].str.lower() + ' ' +
    df['articleType'].str.lower() + ' ' +
    df['baseColour'].str.lower() + ' ' +
    df['season'].str.lower() + ' ' +
    df['usage'].str.lower() + ' ' +
    df['productDisplayName'].str.lower()
)

In [47]:
df['combined']

0        men apparel topwear shirts navy blue fall casu...
1        men apparel bottomwear jeans blue summer casua...
2        women accessories watches watches silver winte...
3        men apparel bottomwear track pants black fall ...
4        men apparel topwear tshirts grey summer casual...
                               ...                        
44419    men footwear shoes casual shoes white summer c...
44420    men footwear flip flops flip flops red summer ...
44421    men apparel topwear tshirts blue fall casual p...
44422    women personal care fragrance perfume and body...
44423    women accessories watches watches pink winter ...
Name: combined, Length: 44077, dtype: object

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# memory error
from sklearn.neighbors import NearestNeighbors

In [49]:
# --- STEP 4: Vectorize Text ---
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = vectorizer.fit_transform(df['combined'])

In [50]:
# --- Use NearestNeighbors for memory-efficient similarity search ---
nn = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

In [26]:
# Map product names to index
# product_to_index = pd.Series(df.index, index=df['productDisplayName'].str.lower()).to_dict()

In [57]:
# product_to_index

In [51]:
# Helper to match partial product name
def find_best_match(name):
    name = name.lower()
    matches = df[df['productDisplayName'].str.lower().str.contains(name)]
    if not matches.empty:
        return matches.iloc[0]['productDisplayName']
    return None

In [52]:
# Recommend by name (with fuzzy match)
def recommend_by_name(product_name, top_n=5):
    matched_name = find_best_match(product_name)
    if not matched_name:
        print("❌ No close match found for:", product_name)
        return None

    print(f"🔍 Closest match: {matched_name}")
    idx = df[df['productDisplayName'] == matched_name].index[0]
    distances, indices = nn.kneighbors(tfidf_matrix[idx], n_neighbors=top_n + 1)

    recommended = df.iloc[indices[0][1:]][[
        'productDisplayName', 'articleType', 'baseColour', 'image_url'
    ]].reset_index(drop=True)

    return recommended

In [53]:
results = recommend_by_name("Men Navy Blue jeans")
print(results)

🔍 Closest match: Denizen Women Navy Blue Jeans
             productDisplayName articleType baseColour  \
0  Nike Men's Blue Polo T-shirt     Tshirts       Blue   
1    Nike Men Blue Polo T-shirt     Tshirts       Blue   
2  Nike Men's Blue Polo T-shirt     Tshirts      White   
3         Nike Men Blue T-shirt     Tshirts       Blue   
4    Nike Men Blue Polo Tshirts     Tshirts       Blue   

                                           image_url  
0  http://assets.myntassets.com/v1/images/style/p...  
1  http://assets.myntassets.com/v1/images/style/p...  
2  http://assets.myntassets.com/v1/images/style/p...  
3  http://assets.myntassets.com/v1/images/style/p...  
4  http://assets.myntassets.com/v1/images/style/p...  


In [32]:
results = recommend_by_name("Men Navy Blue jeans")
print(results)

❌ Product not found.
None


In [54]:
from IPython.display import Image, display

In [55]:
results = recommend_by_name("Men Navy Blue jeans")

🔍 Closest match: Denizen Women Navy Blue Jeans


In [56]:
# Show table and images
if results is not None:
    for _, row in results.iterrows():
        print(f"\n🛍️ {row['productDisplayName']} ({row['articleType']} - {row['baseColour']})")
        display(Image(url=row['image_url'], width=200))


🛍️ Nike Men's Blue Polo T-shirt (Tshirts - Blue)



🛍️ Nike Men Blue Polo T-shirt (Tshirts - Blue)



🛍️ Nike Men's Blue Polo T-shirt (Tshirts - White)



🛍️ Nike Men Blue T-shirt (Tshirts - Blue)



🛍️ Nike Men Blue Polo Tshirts (Tshirts - Blue)


In [None]:
# solved v1 problems
# improve accuracy