In [49]:
import pandas as pd
import numpy as np
import warnings
from bs4 import BeautifulSoup
from text_preprocessing import clean_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

In [50]:
df = pd.read_csv("Fashion Dataset.csv")

In [51]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,p_id,name,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
0,0,17048614.0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."


In [52]:
df.isnull().sum()

Unnamed: 0         0
p_id              18
name              18
price             18
colour            21
brand             18
img               18
ratingCount     7749
avg_rating      7749
description       18
p_attributes      18
dtype: int64

In [53]:
df.shape

(14330, 11)

In [54]:
df.drop(["ratingCount", "avg_rating", "p_id", "Unnamed: 0"], axis=1, inplace=True)

In [55]:
df.dropna(inplace=True)

In [56]:
df.isnull().sum()

name            0
price           0
colour          0
brand           0
img             0
description     0
p_attributes    0
dtype: int64

In [57]:
df.shape

(14309, 7)

In [58]:
df.duplicated().sum()

44

In [59]:
df.drop_duplicates(inplace=True)

In [60]:
df.duplicated().sum()

0

In [61]:
df.sample()

Unnamed: 0,name,price,colour,brand,img,description,p_attributes
2677,Miss Poem Women Grey Light Fade Stretchable Jeans,2799.0,Grey,Miss Poem,http://assets.myntassets.com/assets/images/178...,"<ul> <li> Medium shade, light fade grey jeans...","{'Add-Ons': 'NA', 'Brand Fit Name': 'NA', 'Cha..."


In [62]:
df["colour"] = df["colour"].apply(lambda x: x.replace(" ", "").lower().split())

In [63]:
df.sample()

Unnamed: 0,name,price,colour,brand,img,description,p_attributes
9882,Anouk Pink & Blue Pure Cotton Printed Ready to...,4599.0,[pink],Anouk,http://assets.myntassets.com/assets/images/183...,Pink and blue printed lehenga choli with dupat...,"{'Blouse Closure': 'NA', 'Blouse Fabric': 'Cot..."


In [64]:
df["brand"] = df["brand"].apply(lambda x: x.replace(" ", "").lower().split())

In [65]:
df.sample()

Unnamed: 0,name,price,colour,brand,img,description,p_attributes
5108,V-Mart Girls Navy Blue & Gold Printed Skirts,700.0,[navyblue],[v-mart],http://assets.myntassets.com/assets/images/192...,<p>Navy Blue coloured printed flared midi skir...,"{'Add-Ons': 'NA', 'Body or Garment Size': 'To-..."


In [66]:
def formatting_description(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    return text.lower().split()

In [67]:
df["description"] = df["description"].apply(formatting_description)

In [68]:
df.sample()

Unnamed: 0,name,price,colour,brand,img,description,p_attributes
5358,Juniper Women Navy Blue Printed Flared Palazzos,1799.0,[navyblue],[juniper],http://assets.myntassets.com/assets/images/123...,"[a, pair, of, navy, blue, printed, woven, flar...",{'Body or Garment Size': 'To-Fit Denotes Body ...


In [69]:
def formatting_p_attributes(text):
    attributes = eval(text)
    extracted_attributes = []
    if "Top Fabric" in attributes:
        extracted_attributes.append(attributes["Top Fabric"].replace(" ", "").lower())
    if "Occasion" in attributes:
        extracted_attributes.append(attributes["Occasion"].replace(" ", "").lower())
    if "Sustainable" in attributes:
        extracted_attributes.append(attributes["Sustainable"].replace(" ", "").lower())
    if "Wash Care" in attributes:
        extracted_attributes.append(attributes["Wash Care"].replace(" ", "").lower())
    if "Top Pattern" in attributes:
        extracted_attributes.append(attributes["Top Pattern"].replace(" ", "").lower())
    if "Top Shape" in attributes:
        extracted_attributes.append(attributes["Top Shape"].replace(" ", "").lower())
    if "Top Type" in attributes:
        extracted_attributes.append(attributes["Top Type"].replace(" ", "").lower())
    if "Bottom Pattern" in attributes:
        extracted_attributes.append(
            attributes["Bottom Pattern"].replace(" ", "").lower()
        )

    if "Bottom Type" in attributes:
        extracted_attributes.append(attributes["Bottom Type"].replace(" ", "").lower())
    if "Bottom Closure" in attributes:
        extracted_attributes.append(
            attributes["Bottom Closure"].replace(" ", "").lower()
        )

    return extracted_attributes

In [70]:
df['p_attributes']=df['p_attributes'].apply(formatting_p_attributes)

In [71]:
df.sample(1)

Unnamed: 0,name,price,colour,brand,img,description,p_attributes
7389,Yufta Women White & Grey Printed Tunic with Pa...,1699.0,[white],[yufta],http://assets.myntassets.com/assets/images/709...,"[this, co-ords, set, consists, of, tunic, and,...","[purecotton, casual, regular, machinewash, sol..."


In [72]:
def formatting_price(text):
    price_class = []
    if text <= 1000:
        price_class.append('priceveryaffordable')
    elif 1000 < text <= 3000:
        price_class.append('priceaffordable')
    elif 3000 < text <= 6000:
        price_class.append('pricemoderate')
    elif 6000 < text <= 9000:
        price_class.append('priceexpensive')
    else:
        price_class.append('priceveryexpensive')
    return price_class


In [73]:
df['price']=df['price'].apply(formatting_price)

Creating Tag

In [74]:
df['tag']=df['brand']+df['description']+df['colour']+df['p_attributes']+df['price']

In [75]:
df.head(1)

Unnamed: 0,name,price,colour,brand,img,description,p_attributes,tag
0,Khushal K Women Black Ethnic Motifs Printed Ku...,[pricemoderate],[black],[khushalk],http://assets.myntassets.com/assets/images/170...,"[black, printed, kurta, with, palazzos, with, ...","[viscoserayon, festive, regular, machinewash, ...","[khushalk, black, printed, kurta, with, palazz..."


In [76]:
df['tag']=df['tag'].apply(lambda x:" ".join(x))

In [77]:
new_df = df[['name', 'tag']]

In [78]:
new_df.sample()

Unnamed: 0,name,tag
10734,Xenilla Pink & Blue Embroidered Semi-Stitched ...,xenilla pink and blue embroidered lehenga chol...


In [79]:
new_df['tag'][0]

"khushalk black printed kurta with palazzos with dupatta kurta design: ethnic motifs printed anarkali shape regular style mandarin collar, three-quarter regular sleeves calf length with flared hem viscose rayon machine weave fabric palazzos design: printed palazzos elasticated waistband slip-on closure dupatta length 2.43 meters width: 88 cmthe model (height 5'8) is wearing a size s100% rayonmachine wash black viscoserayon festive regular machinewash printed anarkali kurta printed palazzos slip-on pricemoderate"

In [80]:
new_df['tag']=clean_text(new_df,'tag')['tag']


=== Cleaning Process ===

⬇️ Removing HTML Tags ⬇️

⬇️ Lowercasing Text ⬇️

⬇️ Removing Punctuation ⬇️

⬇️ Removing Numbers ⬇️

⬇️ Removing Whitespace ⬇️

⬇️ Removing Stopwords ⬇️

⬇️ Stemming Words ⬇️

=== Cleaning Completed ===



In [81]:
new_df['tag'].iloc[0]

'khushalk black print kurta with palazzo with dupatta kurta design ethnic motif print anarkali shape regular style mandarin collar threequart regular sleev calf length with flare hem viscos rayon machin weav fabric palazzo design print palazzo elast waistband slipon closur dupatta length meter width cmthe model height is wear a size s rayonmachin wash black viscoserayon festiv regular machinewash print anarkali kurta print palazzo slipon pricemoder'

Word to Vector using TF-IDF (Term Frequency-Inverse Document Frequency)

In [82]:
tfidf = TfidfVectorizer(max_features=5000)

In [83]:
tfidf_matrix = tfidf.fit_transform(new_df['tag'])

In [84]:
tfidf_matrix.shape

(14265, 5000)

In [85]:
tfidf.get_feature_names_out()

array(['aahwan', 'aari', 'aarika', ..., 'zivam', 'ziyaa', 'zola'],
      dtype=object)

In [86]:
similarity = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [87]:
similarity.shape

(14265, 14265)

In [93]:
def recommend(name):
    index = df[df["name"] == name].index[0]
    similar = sorted(
        list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1]
    )[1:6]
    recommend_list = []
    for s in similar:
        recommend_list.append([df["name"].iloc[s[0]], df["img"].iloc[s[0]]])

    return recommend_list


In [94]:
df['name'].iloc[0]

'Khushal K Women Black Ethnic Motifs Printed Kurta with Palazzos & With Dupatta'

In [95]:
recommend('Khushal K Women Black Ethnic Motifs Printed Kurta with Palazzos & With Dupatta')

[['Khushal K Women White Ethnic Motifs Printed Gotta Patti Kurta with Palazzos & With Dupatta',
  'http://assets.myntassets.com/assets/images/17447640/2022/3/9/32d8a19f-bbbc-4938-b05c-612b760831c51646798656702KhushalKWomenWhiteEmbroideredLayeredGottaPattiKurtiwithPalaz1.jpg'],
 ['mirari Women Green Floral Printed Pure Cotton Kurta with Palazzos',
  'http://assets.myntassets.com/assets/images/18765612/2022/6/17/fe143871-ed3f-4e70-9cb7-3cd683202dc51655464222268MIRARIGreenPrintedKurtaWithPalazzoSet1.jpg'],
 ['Ishin Women Red Floral Embroidered Regular Gotta Patti Pure Cotton Kurta with Palazzos & With Dupatta',
  'http://assets.myntassets.com/assets/images/15535314/2021/11/26/d32d9fc9-bcb9-4065-bc4b-453d6c799a2c1637929961158IshinWomensRedEmbroideredGottaPattiAnarkaliKurtaWithPalazzoD1.jpg'],
 ['Vishudh Women Blue Floral Printed Kurta with Palazzos & With Dupatta',
  'http://assets.myntassets.com/assets/images/18261878/2022/5/16/daa3c330-ef9d-4470-858e-ad9968076d621652657917110VishudhWomen

In [91]:
import pickle

In [92]:
with open('df.pkl', 'wb') as f:
    pickle.dump(df, f)

with open('similarity.pkl', 'wb') as f:
    pickle.dump(similarity, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)