In [249]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [250]:
df_train = pd.read_csv("/content/train.csv")

In [251]:
df_train.head()

Unnamed: 0,id,title,Rating,maincateg,platform,price1,actprice1,Offer %,norating1,noreviews1,star_5f,star_4f,star_3f,star_2f,star_1f,fulfilled1
0,16695,Fashionable & Comfortable Bellies For Women (...,3.9,Women,Flipkart,698,999,30.13%,38.0,7.0,17.0,9.0,6.0,3,3,0
1,5120,Combo Pack of 4 Casual Shoes Sneakers For Men ...,3.8,Men,Flipkart,999,1999,50.03%,531.0,69.0,264.0,92.0,73.0,29,73,1
2,18391,Cilia Mode Leo Sneakers For Women (White),4.4,Women,Flipkart,2749,4999,45.01%,17.0,4.0,11.0,3.0,2.0,1,0,1
3,495,Men Black Sports Sandal,4.2,Men,Flipkart,518,724,15.85%,46413.0,6229.0,1045.0,12416.0,5352.0,701,4595,1
4,16408,Men Green Sports Sandal,3.9,Men,Flipkart,1379,2299,40.02%,77.0,3.0,35.0,21.0,7.0,7,7,1


In [252]:
df_train.shape

(15730, 16)

In [253]:
df_train.isnull().sum()

id              0
title           0
Rating          0
maincateg     526
platform        0
price1          0
actprice1       0
Offer %         0
norating1     678
noreviews1    578
star_5f       588
star_4f       539
star_3f       231
star_2f         0
star_1f         0
fulfilled1      0
dtype: int64

In [254]:
# Forward fill null values in 'maincateg' column
df_train['maincateg'].fillna(method='ffill', inplace=True)

In [255]:
remove = ['norating1', 'noreviews1', 'star_5f', 'star_4f', 'star_3f']
# Drop specified columns
df_train.drop(columns = remove, inplace=True)

In [256]:
df_train.isnull().sum()

id            0
title         0
Rating        0
maincateg     0
platform      0
price1        0
actprice1     0
Offer %       0
star_2f       0
star_1f       0
fulfilled1    0
dtype: int64

In [257]:
from nltk.stem.snowball import SnowballStemmer
def tokenize_stem(text):
    ps = SnowballStemmer('english')
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return stemmed_tokens

In [258]:
nltk.download('punkt')
df_train['stemmed_tokens'] = df_train['title'].apply(tokenize_stem)
#df_train['stemmed_tokens'] = df_train.apply(lambda row:tokenize_stem(row['title']))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [259]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidfv = TfidfVectorizer(tokenizer=tokenize_stem)

def cosine_sim(txt1,txt2):
    matrix = tfidfv.fit_transform([txt1,txt2])
    return cosine_similarity(matrix)

In [260]:
def search_product(query):
    stemmed_query = tokenize_stem(query)
     # Compute cosine similarity between query and each product title
    df_train['similarity'] = df_train['stemmed_tokens'].apply(lambda x: cosine_sim(' '.join(stemmed_query), ' '.join(x))[0][0])
     # Weight the similarity by ratings
    df_train['weighted_similarity'] = df_train['similarity'] * df_train['Rating']
    res = df_train.sort_values(by=['similarity'], ascending=False).head(10)[['title', 'Rating']]
    return res

In [261]:
df_train.head()

Unnamed: 0,id,title,Rating,maincateg,platform,price1,actprice1,Offer %,star_2f,star_1f,fulfilled1,stemmed_tokens
0,16695,Fashionable & Comfortable Bellies For Women (...,3.9,Women,Flipkart,698,999,30.13%,3,3,0,"[fashion, &, comfort, belli, for, women, (, br..."
1,5120,Combo Pack of 4 Casual Shoes Sneakers For Men ...,3.8,Men,Flipkart,999,1999,50.03%,29,73,1,"[combo, pack, of, 4, casual, shoe, sneaker, fo..."
2,18391,Cilia Mode Leo Sneakers For Women (White),4.4,Women,Flipkart,2749,4999,45.01%,1,0,1,"[cilia, mode, leo, sneaker, for, women, (, whi..."
3,495,Men Black Sports Sandal,4.2,Men,Flipkart,518,724,15.85%,701,4595,1,"[men, black, sport, sandal]"
4,16408,Men Green Sports Sandal,3.9,Men,Flipkart,1379,2299,40.02%,7,7,1,"[men, green, sport, sandal]"


In [262]:
df_train.columns

Index(['id', 'title', 'Rating', 'maincateg', 'platform', 'price1', 'actprice1',
       'Offer %', 'star_2f', 'star_1f', 'fulfilled1', 'stemmed_tokens'],
      dtype='object')

In [263]:
df_train['title'][5]

'Women Pink Flats Sandal'

In [264]:
search_product('Women Pink Flats Sandal')



Unnamed: 0,title,Rating
0,Fashionable & Comfortable Bellies For Women (...,3.9
4880,Women SL-152 Black Red Walking Shoes For Women...,4.3
12133,Stylish Latest and Casual Boots For Women (Blue),4.2
4909,Girls Stylish Cut Work Design trending shoes S...,3.8
12137,Boots For Women (Blue),4.3
4906,"punjabi jutti , nagra , nagra shoes , jutti , ...",4.0
4905,Perfect Stylish Girls Casual Shoes Sneakers Fo...,4.0
12139,Perfect Stylish Casual Sneakers For Women & Gi...,4.1
12141,"Sneakers For Women (White, Black)",4.4
12142,"Latest Collection, Comfortable & Fashionable B...",3.9
