# Case Study: Finding Similar Products

In [36]:
#Import necessary libraries
import pandas as pd
import numpy as np

In [37]:
# read the data
df1=pd.read_csv("prods.csv")

In [38]:
#check the data first 5 elements
df1.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [39]:
# Processing the Product name column before finding cosine similarity
from sklearn.feature_extraction import text
Text=df1['product_name'].tolist()
tfidf=text.TfidfVectorizer(input=Text,stop_words="english") # removing Stop words
matrix=tfidf.fit_transform(Text) #getting the TFIDF matrix
print(matrix.shape)

(1000, 1685)


In [40]:
### Get Similarity Scores using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
sim_unigram=cosine_similarity(matrix)

In [41]:
#sim_unigram

In [42]:
# Function to get the similar products
def get_similar_products(x):
    return ",".join(df1['product_name'].loc[x.argsort()[-6:-1]])
df1['similar_products']=[get_similar_products(x) for x in sim_unigram]

In [43]:
# first product name
df1['product_name'][0]

'Chocolate Sandwich Cookies'

In [44]:
# its similar products
df1['similar_products'].str.split("\n")[0]

['Danish Butter Cookies,Oreo Cookies and Cream Chocolate Frozen Dairy Dessert,Vanilla Sugar Cookies,Gluten Free All Natural Chocolate Chip Cookies,Cookie Chips Crunchy Dark Chocolate Chocolate Chip Cookies']

In [45]:
df1['similar_products'].str.split("\n")[1]

["Black Salt Caramel Dark Chocolate Bar,Thin Stackers Brown Rice  Salt Free,Sardines in Water Salt Added,Salt Free Seasoning,Nature's Seasons Seasoning Blend"]

# Data with Similar Products

In [46]:
df1.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,similar_products
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C..."
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta..."
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream..."
4,5,Green Chile Anytime Sauce,5,13,"Green Peas, Organic, Petite,Petite Green Peas,..."


In [47]:
#Export to CSV

In [48]:
df1.to_csv(r'D:\CapstoneProjecct\Similarproducts.csv')

In [13]:
# get the corresponding AisleID and department ID

In [49]:
def get_similar_aisle_id(x):
    return list(df1['aisle_id'][0:5])
df1['similar_products_aisle_id']=[get_similar_aisle_id(x) for x in sim_unigram]

In [50]:
df1.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,similar_products,similar_products_aisle_id
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...","[61, 104, 94, 38, 5]"
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...","[61, 104, 94, 38, 5]"
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,"[61, 104, 94, 38, 5]"
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream...","[61, 104, 94, 38, 5]"
4,5,Green Chile Anytime Sauce,5,13,"Green Peas, Organic, Petite,Petite Green Peas,...","[61, 104, 94, 38, 5]"


In [51]:
def get_similar_aisle_id(x):
    return ",".join(df1['product_name'].loc[x.argsort()[-6:-1]])
df1['similar_products_unigram']=[get_similar_aisle_id(x) for x in sim_unigram]

In [52]:
df1["similar_products_unigram"][0].split(",")

['Danish Butter Cookies',
 'Oreo Cookies and Cream Chocolate Frozen Dairy Dessert',
 'Vanilla Sugar Cookies',
 'Gluten Free All Natural Chocolate Chip Cookies',
 'Cookie Chips Crunchy Dark Chocolate Chocolate Chip Cookies']

In [53]:
df1.shape

(1000, 7)

In [54]:
a=df1["similar_products_unigram"][0].split(",")
print(a)

['Danish Butter Cookies', 'Oreo Cookies and Cream Chocolate Frozen Dairy Dessert', 'Vanilla Sugar Cookies', 'Gluten Free All Natural Chocolate Chip Cookies', 'Cookie Chips Crunchy Dark Chocolate Chocolate Chip Cookies']


In [55]:
import numpy as np


In [56]:
df1['temp'] = np.where(df1['aisle_id']>100, df1["product_name"], df1["department_id"])
df1.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,similar_products,similar_products_aisle_id,similar_products_unigram,temp
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...","[61, 104, 94, 38, 5]","Danish Butter Cookies,Oreo Cookies and Cream C...",19
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...","[61, 104, 94, 38, 5]","Black Salt Caramel Dark Chocolate Bar,Thin Sta...",All-Seasons Salt
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,"[61, 104, 94, 38, 5]",Fresh Cut Golden Sweet No Salt Added Whole Ker...,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream...","[61, 104, 94, 38, 5]","Classic Original Hummus,Classic coke,Ice Cream...",1
4,5,Green Chile Anytime Sauce,5,13,"Green Peas, Organic, Petite,Petite Green Peas,...","[61, 104, 94, 38, 5]","Green Peas, Organic, Petite,Petite Green Peas,...",13


In [57]:
for x in df1["similar_products_unigram"]:
    for y in x.split(","):
        df1['temp1'] = np.where(df1['product_name']==y, df1["aisle_id"], False )
        #print(y)

        
df1.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,similar_products,similar_products_aisle_id,similar_products_unigram,temp,temp1
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...","[61, 104, 94, 38, 5]","Danish Butter Cookies,Oreo Cookies and Cream C...",19,0
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...","[61, 104, 94, 38, 5]","Black Salt Caramel Dark Chocolate Bar,Thin Sta...",All-Seasons Salt,0
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,"[61, 104, 94, 38, 5]",Fresh Cut Golden Sweet No Salt Added Whole Ker...,7,0
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,"Classic Original Hummus,Classic coke,Ice Cream...","[61, 104, 94, 38, 5]","Classic Original Hummus,Classic coke,Ice Cream...",1,0
4,5,Green Chile Anytime Sauce,5,13,"Green Peas, Organic, Petite,Petite Green Peas,...","[61, 104, 94, 38, 5]","Green Peas, Organic, Petite,Petite Green Peas,...",13,0


In [58]:
df1.drop(["similar_products_aisle_id","temp","temp1"],axis=1,inplace=True)
df1.head(3)

Unnamed: 0,product_id,product_name,aisle_id,department_id,similar_products,similar_products_unigram
0,1,Chocolate Sandwich Cookies,61,19,"Danish Butter Cookies,Oreo Cookies and Cream C...","Danish Butter Cookies,Oreo Cookies and Cream C..."
1,2,All-Seasons Salt,104,13,"Black Salt Caramel Dark Chocolate Bar,Thin Sta...","Black Salt Caramel Dark Chocolate Bar,Thin Sta..."
2,3,Robust Golden Unsweetened Oolong Tea,94,7,Fresh Cut Golden Sweet No Salt Added Whole Ker...,Fresh Cut Golden Sweet No Salt Added Whole Ker...


In [59]:
a=df1["similar_products_unigram"][0].split(",")
a

['Danish Butter Cookies',
 'Oreo Cookies and Cream Chocolate Frozen Dairy Dessert',
 'Vanilla Sugar Cookies',
 'Gluten Free All Natural Chocolate Chip Cookies',
 'Cookie Chips Crunchy Dark Chocolate Chocolate Chip Cookies']

# similar Aisle ID

In [60]:
df1.dtypes

product_id                   int64
product_name                object
aisle_id                     int64
department_id                int64
similar_products            object
similar_products_unigram    object
dtype: object

In [61]:
a=list(df1["similar_products_unigram"][0].split(","))
a

['Danish Butter Cookies',
 'Oreo Cookies and Cream Chocolate Frozen Dairy Dessert',
 'Vanilla Sugar Cookies',
 'Gluten Free All Natural Chocolate Chip Cookies',
 'Cookie Chips Crunchy Dark Chocolate Chocolate Chip Cookies']

In [62]:
#emty=[]
for i in a:
    #print(i)
    print((df1[df1["product_name"]==i].index.values).astype(int))
    #emty.append(b)
    


[101]
[590]
[575]
[171]
[558]


In [63]:
for i in a:
    z=df1[df1["product_name"]==i].index.values
    z=z.astype(int)
    print(type(z))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [64]:
print(df1.loc[101,"aisle_id"])
print(df1.loc[590,"aisle_id"])
print(df1.loc[575,"aisle_id"])
print(df1.loc[171,"aisle_id"])
print(df1.loc[558,"aisle_id"])

61
37
61
61
61


In [65]:
df1.loc[171,:]

product_id                                                                172
product_name                   Gluten Free All Natural Chocolate Chip Cookies
aisle_id                                                                   61
department_id                                                              19
similar_products            Peanut Butter Chocolate Chip Bar,Gluten Free S...
similar_products_unigram    Peanut Butter Chocolate Chip Bar,Gluten Free S...
Name: 171, dtype: object

In [66]:
print(df1.loc[101,"department_id"])
print(df1.loc[590,"department_id"])
print(df1.loc[575,"department_id"])
print(df1.loc[171,"department_id"])
print(df1.loc[558,"department_id"])

19
1
19
19
19


In [67]:
df1.dtypes

product_id                   int64
product_name                object
aisle_id                     int64
department_id                int64
similar_products            object
similar_products_unigram    object
dtype: object

In [68]:
df1['aisle_id_new'] = df1['aisle_id'].apply(str)

In [69]:
df1.dtypes

product_id                   int64
product_name                object
aisle_id                     int64
department_id                int64
similar_products            object
similar_products_unigram    object
aisle_id_new                object
dtype: object