In [37]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
import os
import mysql.connector
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel,linear_kernel,cosine_similarity 
import pickle

In [38]:
# Load just the three columns the recommender uses from the items CSV.
# NOTE(review): the 'ΓÇ¥' / 'ΓÇ£' fragments stripped from `description` further down look
# like UTF-8 bytes decoded as cp437 — confirm the file's real encoding.
fields = ['item_id', 'item_name', 'description']
new_df = pd.read_csv(
    "frqbitems.csv.csv",
    usecols=fields,
    index_col=False,
    skipinitialspace=True,
    encoding="cp437",
)
new_df.head(15)

Unnamed: 0,item_id,item_name,description
0,6703,Chicken Boneless,Chicken Boneless
1,6757,Anchovy/ Nethli,Anchovy/ Nethli
2,6810,Chicken Buffalo Cut Wingettes,Chicken Buffalo Cut Wingettes
3,15902,Popular Essential White Peas,Popular Essentials is a brand supplying high-q...
4,13549,Fresh Water Rohu,Fresh Water Rohu
5,17287,Haldiram Boondi,Khara boondi is a favourite snack to munch on ...
6,15871,Popular Essential Red Lobia,Popular Essentials is a brand supplying high-q...
7,15890,Popular Essentials Premium Ground Nut,Popular Essentials is a brand supplying high-q...
8,15835,Aura White Thill,Aura White Thill
9,15827,24 Mantra Organic Brown Chana,Buying 24 Mantra from Sresta means you are a v...


In [39]:
# Check the size of the data set as (rows, columns)
new_df.shape

(645, 3)

In [40]:
#new_df.isnull().sum()
# Drop, in place, every row that has a missing value in any of the loaded columns
new_df.dropna(inplace=True)

In [41]:
# Inspect the description stored under index label 4
new_df.loc[4, 'description']

'Fresh Water Rohu'

In [42]:
# Drop duplicate rows, then renumber the index 0..n-1.
# The earlier dropna and this de-duplication leave gaps in the index labels, while the
# TF-IDF and similarity matrices built later are addressed by row position. A contiguous
# index keeps label-based lookups such as `.index[0]` aligned with those row positions.
new_df.drop_duplicates(inplace=True)
new_df.reset_index(drop=True, inplace=True)

In [43]:
#new_df.duplicated().sum()

In [44]:
# Strip separator characters and mojibake fragments from the description column.
# Every pattern is a literal substring, so regex=False is passed explicitly: the default of
# `regex` in Series.str.replace differs between pandas versions (True before 2.0, False
# from 2.0 on), and a literal match is what is intended here.
# The patterns are applied one after another in the order listed.
for artifact in ('/', 'ΓÇ¥', '-', 'ΓÇ£', 'crÃ¨', 'â€™s'):
    new_df['description'] = new_df['description'].str.replace(artifact, '', regex=False)


In [45]:
# Preview the first 30 rows after the description clean-up
new_df.head(30)

Unnamed: 0,item_id,item_name,description
0,6703,Chicken Boneless,Chicken Boneless
1,6757,Anchovy/ Nethli,Anchovy Nethli
2,6810,Chicken Buffalo Cut Wingettes,Chicken Buffalo Cut Wingettes
3,15902,Popular Essential White Peas,Popular Essentials is a brand supplying highqu...
4,13549,Fresh Water Rohu,Fresh Water Rohu
5,17287,Haldiram Boondi,Khara boondi is a favourite snack to munch on ...
6,15871,Popular Essential Red Lobia,Popular Essentials is a brand supplying highqu...
7,15890,Popular Essentials Premium Ground Nut,Popular Essentials is a brand supplying highqu...
8,15835,Aura White Thill,Aura White Thill
9,15827,24 Mantra Organic Brown Chana,Buying 24 Mantra from Sresta means you are a v...


In [46]:
# Inspect the full description stored under index label 5
new_df.loc[5, 'description']

'Khara boondi is a favourite snack to munch on at almost any moment. Enjoy the amazing taste of tangy and spicy Khara Boondi Namkeen. Eat it as a side dish with your meals or just eat it on its own.'

In [76]:
from nltk.stem.porter import PorterStemmer

# One shared Porter stemmer instance, reused for every description
ps = PorterStemmer()


def stem(text):
    """Return `text` with each whitespace-separated token replaced by its Porter stem.

    The stemmed tokens are re-joined with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())


# Replace every description with its stemmed form
new_df['description'] = new_df['description'].apply(stem)

In [77]:
# Build the TF-IDF vectorizer used to score each description term by term.
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level features over 1- to 4-grams, lowercased and accent-stripped, with the
# built-in English stop-word list removed.
tfv = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 4),
    stop_words='english',
    strip_accents='unicode',
    lowercase=True,
    encoding="cp437",
    use_idf=True,
    max_features=None,
    min_df=0.003,  # ignore terms that appear in fewer than 0.3% of the documents
    max_df=0.50,   # ignore terms that appear in more than 50% of the documents
)


In [78]:
# Fit the vectorizer on the descriptions and convert the resulting matrix to a dense array
vectors = tfv.fit_transform(new_df['description']).toarray()
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [79]:
#vectors[0]

In [80]:
from sklearn.metrics.pairwise import sigmoid_kernel,linear_kernel,cosine_similarity

# Cosine similarity between every pair of rows in `vectors`; the result is a square matrix
similarity = cosine_similarity(vectors)
similarity.shape

(447, 447)

In [81]:
# Rank all (row position, score) pairs for row 4 by score, highest first, then take
# the five entries that follow the first-ranked one
sorted(enumerate(similarity[4]), key=lambda pair: pair[1], reverse=True)[1:6]

[(138, 1.0),
 (347, 1.0),
 (281, 0.3226417583699119),
 (28, 0.21147984892826413),
 (241, 0.09681669018686968)]

In [82]:
# Take an item name and print the item_ids of the items most similar to it
def recommend(item_name, top_n=5, items=None, scores=None):
    """Print the item_ids of the `top_n` items most similar to `item_name`.

    Parameters
    ----------
    item_name : str
        Exact value to look up in the ``item_name`` column.
    top_n : int, default 5
        Number of recommendations to print.
    items : pandas.DataFrame, optional
        Frame with ``item_name`` and ``item_id`` columns. Defaults to the
        notebook-level ``new_df``.
    scores : 2-D array, optional
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of `items`. Defaults to the notebook-level
        ``similarity``.

    Raises
    ------
    IndexError
        If no row has the given `item_name`.
    """
    items = new_df if items is None else items
    scores = similarity if scores is None else scores

    # The similarity matrix is addressed by row position, not by index label. After rows
    # have been dropped the labels have gaps, so resolve the position explicitly.
    matches = np.flatnonzero((items['item_name'] == item_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"item_name {item_name!r} not found")
    item_pos = int(matches[0])

    ranked = sorted(enumerate(scores[item_pos]), reverse=True, key=lambda x: x[1])

    # Exclude the query item itself by position instead of assuming it ranks first:
    # items with an identical description tie with it at the top score.
    picks = [pos for pos, _ in ranked if pos != item_pos][:top_n]

    for pos in picks:
        print(items.iloc[pos].item_id)

In [83]:
# Print the item_ids recommended for 'Chicken Boneless'
recommend('Chicken Boneless')

17546
74787
6720
6714
13525
