In [1]:
import numpy as np
import pandas as pd

In [2]:
images = pd.read_csv("images.csv")

In [3]:
images.head()

Unnamed: 0,filename,link
0,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263.jpg,http://assets.myntassets.com/v1/images/style/p...
3,21379.jpg,http://assets.myntassets.com/v1/images/style/p...
4,53759.jpg,http://assets.myntassets.com/v1/images/style/p...


In [9]:
images['link'][0]

'http://assets.myntassets.com/v1/images/style/properties/7a5b82d1372a7a5c6de67ae7a314fd91_images.jpg'

In [14]:
images['filename'][0]

'15970.jpg'

In [7]:
df = pd.read_csv("styles.csv", on_bad_lines='skip')  # Skip malformed lines

In [8]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44424 entries, 0 to 44423
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44424 non-null  int64  
 1   gender              44424 non-null  object 
 2   masterCategory      44424 non-null  object 
 3   subCategory         44424 non-null  object 
 4   articleType         44424 non-null  object 
 5   baseColour          44409 non-null  object 
 6   season              44403 non-null  object 
 7   year                44423 non-null  float64
 8   usage               44107 non-null  object 
 9   productDisplayName  44417 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB


In [11]:
df.describe()

Unnamed: 0,id,year
count,44424.0,44423.0
mean,29696.334301,2012.806497
std,17049.490518,2.12648
min,1163.0,2007.0
25%,14768.75,2011.0
50%,28618.5,2012.0
75%,44683.25,2015.0
max,60000.0,2019.0


In [12]:
df.isnull().sum()

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
dtype: int64

In [13]:
images.isnull().sum()

filename    0
link        0
dtype: int64

In [15]:
# Remove missing image links if any
# images = images.dropna()

In [16]:
# Remove file extension to match with styles `id`
images['id'] = images['filename'].str.replace('.jpg', '').astype(int)

In [18]:
# Merge
data = pd.merge(df, images, on='id')

In [19]:
data.head(3)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,filename,link
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,15970.jpg,http://assets.myntassets.com/v1/images/style/p...
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,39386.jpg,http://assets.myntassets.com/v1/images/style/p...
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,59263.jpg,http://assets.myntassets.com/v1/images/style/p...


In [21]:
data.shape

(44424, 12)

In [22]:
columns_to_use = ['id', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage', 'productDisplayName', 'link']
df = data[columns_to_use].dropna()

In [23]:
for col in ['gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage']:
    df[col] = df[col].str.lower()

In [24]:
df.head(3)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,link
0,15970,men,apparel,topwear,shirts,navy blue,fall,casual,Turtle Check Men Navy Blue Shirt,http://assets.myntassets.com/v1/images/style/p...
1,39386,men,apparel,bottomwear,jeans,blue,summer,casual,Peter England Men Party Blue Jeans,http://assets.myntassets.com/v1/images/style/p...
2,59263,women,accessories,watches,watches,silver,winter,casual,Titan Women Silver Watch,http://assets.myntassets.com/v1/images/style/p...


In [25]:
df['combined'] = df['gender'] + ' ' + df['masterCategory'] + ' ' + df['subCategory'] + ' ' + df['articleType'] + ' ' + df['baseColour'] + ' ' + df['season'] + ' ' + df['usage'] + ' ' + df['productDisplayName']

In [26]:
df.head(3)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,link,combined
0,15970,men,apparel,topwear,shirts,navy blue,fall,casual,Turtle Check Men Navy Blue Shirt,http://assets.myntassets.com/v1/images/style/p...,men apparel topwear shirts navy blue fall casu...
1,39386,men,apparel,bottomwear,jeans,blue,summer,casual,Peter England Men Party Blue Jeans,http://assets.myntassets.com/v1/images/style/p...,men apparel bottomwear jeans blue summer casua...
2,59263,women,accessories,watches,watches,silver,winter,casual,Titan Women Silver Watch,http://assets.myntassets.com/v1/images/style/p...,women accessories watches watches silver winte...


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined'])

In [29]:
# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [30]:
def recommend_product(product_index, top_n=5):
    sim_scores = list(enumerate(cosine_sim[product_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommendations = [df.iloc[i[0]] for i in sim_scores]
    return pd.DataFrame(recommendations)

In [31]:
product_index = 0  # First product
recommendations = recommend_product(product_index)
print(recommendations[['productDisplayName', 'articleType', 'baseColour', 'link']])

                       productDisplayName articleType baseColour  \
26654    Turtle Men Check Navy Blue Shirt      shirts  navy blue   
19066         Turtle Check Men Blue Shirt      shirts       blue   
39893         Turtle Check Men Blue Shirt      shirts       blue   
41909         Turtle Check Men Blue Shirt      shirts       blue   
19115  Scullers Men Navy Blue Check Shirt      shirts  navy blue   

                                                    link  
26654  http://assets.myntassets.com/v1/images/style/p...  
19066  http://assets.myntassets.com/v1/images/style/p...  
39893  http://assets.myntassets.com/v1/images/style/p...  
41909  http://assets.myntassets.com/v1/images/style/p...  
19115  http://assets.myntassets.com/v1/images/style/p...  


In [32]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,link,combined
0,15970,men,apparel,topwear,shirts,navy blue,fall,casual,Turtle Check Men Navy Blue Shirt,http://assets.myntassets.com/v1/images/style/p...,men apparel topwear shirts navy blue fall casu...
1,39386,men,apparel,bottomwear,jeans,blue,summer,casual,Peter England Men Party Blue Jeans,http://assets.myntassets.com/v1/images/style/p...,men apparel bottomwear jeans blue summer casua...
2,59263,women,accessories,watches,watches,silver,winter,casual,Titan Women Silver Watch,http://assets.myntassets.com/v1/images/style/p...,women accessories watches watches silver winte...
3,21379,men,apparel,bottomwear,track pants,black,fall,casual,Manchester United Men Solid Black Track Pants,http://assets.myntassets.com/v1/images/style/p...,men apparel bottomwear track pants black fall ...
4,53759,men,apparel,topwear,tshirts,grey,summer,casual,Puma Men Grey T-shirt,http://assets.myntassets.com/v1/images/style/p...,men apparel topwear tshirts grey summer casual...


In [34]:
product_index = 2  # First product
recommendations = recommend_product(product_index)
print(recommendations[['productDisplayName', 'articleType', 'baseColour', 'link']])

                  productDisplayName articleType baseColour  \
3896        Titan Women Silver Watch     watches     silver   
20481       Titan Women Silver Watch     watches     silver   
23713       Titan Women Silver Watch     watches     silver   
25833       Titan Women Silver Watch     watches     silver   
4860   Titan Women Silver Dial Watch     watches     silver   

                                                    link  
3896   http://assets.myntassets.com/v1/images/style/p...  
20481  http://assets.myntassets.com/v1/images/style/p...  
23713  http://assets.myntassets.com/v1/images/style/p...  
25833  http://assets.myntassets.com/v1/images/style/p...  
4860   http://assets.myntassets.com/v1/images/style/p...  


In [36]:
# problem 
# 1 image url concate
# 2. recomended by index