### Case Study 2: Five Similar products using CosineSimilarity  ###

In [94]:
#Basic imports for data handling and text proccessing
import pandas as pd
import numpy as np
import nltk
from unidecode import unidecode

In [95]:
#Reading the products.csv data file and storing it in Dataframe 
prod_data = pd.read_csv("prods.csv")
data = prod_data.copy()#Text with original names before text cleaning

In [96]:
#Data information for the number of records under each features
prod_data.info()
#There are 1000 products with product_id,department_id, aisle_id and product_name

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
product_id       1000 non-null int64
product_name     1000 non-null object
aisle_id         1000 non-null int64
department_id    1000 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB


In [97]:
#Products data 
#Deliverable : 5 Similar products should be found for each of the product using product name
prod_data.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [98]:
#To change all the non- English letters to english letters
#Example:"Héllô Càèùverâêt Jîôûç ïîäüë" -->  Hello Caeuveraet Jiouc iiaue
prod_data['product_name'] = pd.Series(map(lambda text: unidecode(text), prod_data['product_name']))
prod_data['product_name'].head()

0                           Chocolate Sandwich Cookies
1                                     All-Seasons Salt
2                 Robust Golden Unsweetened Oolong Tea
3    Smart Ones Classic Favorites Mini Rigatoni Wit...
4                            Green Chile Anytime Sauce
Name: product_name, dtype: object

In [99]:
#Function to removev all numericals from product name
#If some quantities represented as '1000mcg' only 1000 is removed keeping 'mcg'
#hence similar tablets with units has high similarity and low cosine angle between them

from nltk.tokenize import word_tokenize

def removeDigits(sent):
    words = word_tokenize(sent)
    without_dig = []
    for word in words:
        if not word.isdigit():
            without_dig.append(word)
        Product_name = ' '.join(without_dig)
    return Product_name
prod_data['product_name'] = pd.Series(map(lambda name: removeDigits(name), prod_data['product_name']))

In [100]:
#Product Name - Text Cleaning -Preproccessing before tfidf conversion
prod_data = prod_data[pd.notnull(prod_data['product_name'])]
#Text data cleaning
prod_data['product_name'] = prod_data['product_name'].str.replace('[^\w\s]','')
print("Text Cleaned by removing escape characters")

prod_data['product_name'] = prod_data['product_name'].str.replace("\r","")
print("Text Cleaned by removing  line feeds")

prod_data['product_name'] = prod_data['product_name'].str.replace(r'\s+[a-zA-Z]\s+', ' ')   
print("Text Cleaned by removing  special characters")

prod_data['product_name'] = prod_data['product_name'].str.lower()
print("Text lower cased")


Text Cleaned by removing escape characters
Text Cleaned by removing  line feeds
Text Cleaned by removing  special characters
Text lower cased


In [101]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

#Stemming for further text proccessing
SnowballStemmer=SnowballStemmer('english')
def stemSentence(sentence):
    stem_sentence=[]
    for word in sentence:
        stem_sentence.append(SnowballStemmer.stem(word))
        stem_sentence.append("")
    return "".join(stem_sentence)

#Stemming product description
prod_data['product_name'] = prod_data['product_name'].apply(stemSentence)

In [102]:
prod_data['product_name'].head()#After text cleaning

0                           chocolate sandwich cookies
1                                      allseasons salt
2                 robust golden unsweetened oolong tea
3    smart ones classic favorites mini rigatoni wit...
4                            green chile anytime sauce
Name: product_name, dtype: object

In [103]:
#Product name is represented as Tf-Idf matrix 
from sklearn.feature_extraction import text
product_names=prod_data['product_name'].tolist()
tfidf=text.TfidfVectorizer(input=product_names,stop_words="english",ngram_range=(1,4),lowercase=True)
product_name_matrix=tfidf.fit_transform(product_names)

In [104]:
product_name_matrix.shape

(1000, 8448)

In [105]:
#Forming a cocsine similarity matrix with each product name
from sklearn.metrics.pairwise import cosine_similarity
sim_prod_matrix=cosine_similarity(product_name_matrix)

In [106]:
sim_prod_matrix.shape

(1000, 1000)

In [107]:
#Creating a dictionary of similar product suggetsion details with their respective information
def get_similar_products(x):
    products =  ",".join(data['product_name'].loc[x.argsort()[-6:-1]])
    dept_ids =  list(data['department_id'].loc[x.argsort()[-6:-1]])
    product_ids =  list(data['product_id'].loc[x.argsort()[-6:-1]])
    aisle_ids =  list(data['aisle_id'].loc[x.argsort()[-6:-1]])
    return dict({'products':products,'dept_ids':dept_ids,'product_ids':product_ids,'aisle_ids':aisle_ids})

prod_data['similar_products']=[get_similar_products(x) for x in sim_prod_matrix]

In [114]:
prod_data['similar_products'][10]# type ->Dictionary

{'products': '100% Cranberry Juice Blend,Premium Apple Juice,All Natural 100% Apple Juice,Peach Aloe Vera Drink,Organic Mango Peach Carrot Kickstart Smoothie',
 'dept_ids': [7, 7, 7, 7, 1],
 'product_ids': [806, 935, 82, 677, 552],
 'aisle_ids': [98, 98, 98, 98, 116]}

#### Each row of this similar_product column contains a dictionary of similar items , hence appending lin seperate Columns into prod_data

In [135]:
#Suggestion list for similar product based on cosine similarity
suggested_prod_ids = []
suggested_prod_names = []
suggested_dept_ids = []
suggested_aisle_ids = []
#Taking each dictionary formed and extracting product_ids,aisle_ids,department_ids and product_names
for product in prod_data['similar_products']:
    suggested_prod_ids.append((' ,'.join(str(id) for id in product['product_ids'])))    
    suggested_prod_names.append(product['products'])
    suggested_dept_ids.append((' ,'.join(str(id) for id in product['dept_ids'])))
    suggested_aisle_ids.append((' ,'.join(str(id) for id in product['aisle_ids'])))


In [136]:
#Storing all the three columns to their respective products 
data['Suggested_products'] = pd.Series(suggested_prod_names)
data['Suggested_pro_ids'] = pd.Series(suggested_prod_ids)
data['Suggested_dept_ids'] = pd.Series(suggested_dept_ids)
data['suggested_aisle_id'] = pd.Series(suggested_aisle_ids)


In [137]:
data.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,Suggested_products,Suggested_pro_ids,Suggested_dept_ids,suggested_aisle_id
0,1,Chocolate Sandwich Cookies,61,19,"Vegetable Pie In A Pocket Sandwich,Gluten Free...","932 ,172 ,102 ,576 ,559","1 ,19 ,19 ,19 ,19","38 ,61 ,61 ,61 ,61"
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Wild Alb...","921 ,79 ,273 ,425 ,463","19 ,15 ,19 ,6 ,13","45 ,95 ,107 ,33 ,104"
2,3,Robust Golden Unsweetened Oolong Tea,94,7,"Brambleberry Herbal Tea,Chai Tea Bags,Unsweete...","237 ,281 ,862 ,560 ,569","7 ,7 ,16 ,7 ,19","94 ,94 ,91 ,94 ,61"
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Caramel Sauce,Whipping Cream,Ice Cream, Cookie...","275 ,469 ,431 ,649 ,774","19 ,16 ,1 ,7 ,9","103 ,53 ,37 ,77 ,131"
4,5,Green Chile Anytime Sauce,5,13,"Green Peas, Organic, Petite,Petite Green Peas,...","327 ,969 ,846 ,851 ,275","1 ,1 ,17 ,19 ,19","116 ,116 ,111 ,107 ,103"


In [138]:
#Final output for similar product for finding out for a given product what are top 5 most similar ones 
#and which Aisle and Department they belong to
data.to_csv("Product_matches.csv",index = False)
#Converted to csv file and downloaded

In [139]:
#For 5th product the similar product suggested is:
data.iloc[999]

product_id                                                         1000
product_name                                                   Apricots
aisle_id                                                             18
department_id                                                        10
Suggested_products    Creamline Yogurt Wild Blueberry,Eco-Pac Kamut ...
Suggested_pro_ids                               339 ,340 ,341 ,342 ,329
Suggested_dept_ids                                     16 ,14 ,9 ,8 ,16
suggested_aisle_id                                  120 ,121 ,9 ,41 ,84
Name: 999, dtype: object