In [1]:
# import required libraries
# Read the feature-engineering part first; otherwise this notebook will not make much sense
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the product data from prods.csv into a DataFrame
df = pd.read_csv('prods.csv')

In [3]:
# Preview the first five rows of the data
df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [4]:
# DataFrame shape: (number of rows, number of columns)
df.shape

(1000, 4)

In [5]:
# Summary of the frame: column names, their dtypes, and non-null counts per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
product_id       1000 non-null int64
product_name     1000 non-null object
aisle_id         1000 non-null int64
department_id    1000 non-null int64
dtypes: int64(3), object(1)
memory usage: 27.4+ KB


In [6]:
# Count of distinct aisle ids present in the data
df['aisle_id'].nunique()

128

In [7]:
# Count of distinct department ids present in the data
df['department_id'].nunique()

21

In [8]:
# Normalise product names: trim surrounding whitespace, then replace commas
# with spaces. Later cells join recommended names with ',' and split on ','
# again, so a comma inside a name would tamper with that round trip.
df.product_name = df.product_name.str.strip()
# regex=False: treat "," as a literal character regardless of the pandas default
df.product_name = df.product_name.str.replace(",", " ", regex=False)
# Plain Python list of the cleaned names, used below as the TF-IDF corpus
data = df.product_name.tolist()

In [9]:
# TF-IDF represents every product name as a vector of per-word weights.
# stop_words='english' removes common English stop words before weighting.
# NOTE: `input` is deliberately left at its default ('content'). That parameter
# selects HOW the corpus is supplied ('filename' / 'file' / 'content'); it is
# not the corpus itself, and newer scikit-learn rejects a list passed there.
tf = text.TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary from `data` and returns the sparse
# document-term matrix (one row per product name, one column per term)
mat = tf.fit_transform(data)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
feature_names = (tf.get_feature_names_out()
                 if hasattr(tf, 'get_feature_names_out')
                 else tf.get_feature_names())
# Densify only the first rows: the dense form is needed for display only
pd.DataFrame(mat[:10].toarray(), columns=feature_names)

Unnamed: 0,000,000mcg,013021,10,100,1000,1000mg,118,12,13,...,yogurt,yokids,yukon,yummy,zero,zesty,zfruit,zipper,zita,ziti
0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
5,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
6,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
8,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.479578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [10]:
# Pairwise cosine similarity between the rows of the TF-IDF matrix:
# sim_unigram[i][j] is the similarity between product i and product j
sim_unigram=cosine_similarity(mat)

In [11]:
# Top 5 most similar products for one example row, by cosine similarity.
query_idx = 3
# argsort is ascending, so the most similar products sit at the end
ranked = sim_unigram[query_idx].argsort()
# Drop the query product explicitly: with ties (or an all-zero vector) it is
# not guaranteed to be the very last entry, so slicing it off is unreliable
ranked = ranked[ranked != query_idx]
# argsort yields row positions, so select with .iloc rather than label-based .loc
df['product_name'].iloc[ranked[-5:]]

543                 Classic Original Hummus
648                            Classic coke
430              Ice Cream  Cookies & Cream
996    Mini Double Chocolate Ice Cream Bars
773       Organic 100% Whole Wheat Rigatoni
Name: product_name, dtype: object

In [17]:
def get(x, self_idx=None):
    """Return the names of the 5 products most similar to one product.

    Parameters
    ----------
    x : numpy.ndarray
        One row of the similarity matrix: similarity of a single product
        to every product in ``df`` (by row position).
    self_idx : int, optional
        Row position of the product ``x`` belongs to. When given, that
        product is removed from the candidates explicitly. When omitted,
        the highest-ranked entry is dropped instead, which assumes the
        product is always its own best match.

    Returns
    -------
    str
        Up to five product names joined by ',', ordered from least to most
        similar among the selected ones.
    """
    # argsort is ascending, so the most similar products sit at the end
    ranked = x.argsort()
    if self_idx is None:
        ranked = ranked[:-1]
    else:
        # Ties or an all-zero vector can place the product itself anywhere
        ranked = ranked[ranked != self_idx]
    # argsort yields row positions, so select with .iloc rather than .loc
    return ','.join(df['product_name'].iloc[ranked[-5:]])

# New column: the top 5 similar product names for the product in each row
df['recommended_products'] = [get(row, idx) for idx, row in enumerate(sim_unigram)]

In [23]:

# Look-up table: product name -> list of aisle ids of the rows carrying that
# name. Built once, instead of filtering the whole frame for every single name.
_aisle_ids_by_name = {}
for _name, _aisle_id in zip(df['product_name'], df['aisle_id']):
    _aisle_ids_by_name.setdefault(_name, []).append(_aisle_id)

def get_aisle(x):
    """Map a comma-separated string of product names to their aisle ids.

    Parameters
    ----------
    x : str
        Product names joined by ',' (the format of ``recommended_products``).

    Returns
    -------
    str
        The aisle id of each named product, in the same order, joined by ','.

    Raises
    ------
    ValueError
        If a name does not match exactly one row of ``df``.
    """
    aisle_ids = []
    for name in x.split(','):
        matches = _aisle_ids_by_name.get(name, [])
        if len(matches) != 1:
            raise ValueError(
                'expected exactly one product named %r, found %d' % (name, len(matches)))
        aisle_ids.append(str(matches[0]))
    return ','.join(aisle_ids)

# New column: aisle id of each of the products listed in recommended_products
df['recommended_pro_ids'] = df.recommended_products.apply(get_aisle)

In [24]:
# Preview the frame, now including the recommended_products and recommended_pro_ids columns
df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,recommended_products,recommended_pro_ids
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...",6137616161
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...",4510733104104
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,8191919461
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream...",67773737131
4,5,Green Chile Anytime Sauce,5,13,"Green Peas Organic Petite,Petite Green Peas,...",116116103111107


In [25]:
# Look-up table: product name -> list of department ids of the rows carrying
# that name. Built once, instead of filtering the whole frame for every name.
_dept_ids_by_name = {}
for _name, _dept_id in zip(df['product_name'], df['department_id']):
    _dept_ids_by_name.setdefault(_name, []).append(_dept_id)

def get_dept(x):
    """Map a comma-separated string of product names to their department ids.

    Parameters
    ----------
    x : str
        Product names joined by ',' (the format of ``recommended_products``).

    Returns
    -------
    str
        The department id of each named product, in the same order, joined
        by ','.

    Raises
    ------
    ValueError
        If a name does not match exactly one row of ``df``.
    """
    dept_ids = []
    for name in x.split(','):
        matches = _dept_ids_by_name.get(name, [])
        if len(matches) != 1:
            raise ValueError(
                'expected exactly one product named %r, found %d' % (name, len(matches)))
        dept_ids.append(str(matches[0]))
    return ','.join(dept_ids)

# New column: department id of each of the products listed in recommended_products
df['recommended_depts_ids'] = df.recommended_products.apply(get_dept)

In [26]:
# Preview the frame, now including the recommended_depts_ids column
df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,recommended_products,recommended_pro_ids,recommended_depts_ids
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...",6137616161,191191919
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...",4510733104104,191961313
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,8191919461,151616719
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream...",67773737131,207119
4,5,Green Chile Anytime Sauce,5,13,"Green Peas Organic Petite,Petite Green Peas,...",116116103111107,11191719


In [27]:
# Persist the final frame to CSV: column names as the header row, no index column
df.to_csv('Capston_Result.csv', index=False)

In [29]:
# Read the saved CSV back in and preview it to check what was written
df1=pd.read_csv('Capston_Result.csv')
df1.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,recommended_products,recommended_pro_ids,recommended_depts_ids
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...",6137616161,191191919
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...",4510733104104,191961313
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,8191919461,151616719
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream...",67773737131,207119
4,5,Green Chile Anytime Sauce,5,13,"Green Peas Organic Petite,Petite Green Peas,...",116116103111107,11191719
