In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the output of every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn —
# consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [2]:
# Load the raw customer ratings.
# The original machine-specific path is kept as the default so existing setups
# keep working unchanged; on any other machine set the CUSTOMERS_RATING_CSV
# environment variable to the location of customers_rating.csv.
import os

DEFAULT_RATINGS_CSV = r"C:\Users\Admin\Downloads\Product-Recommendation-System-main\Product-Recommendation-System-main\customers_rating.csv"
ratings_csv_path = os.environ.get("CUSTOMERS_RATING_CSV", DEFAULT_RATINGS_CSV)
df = pd.read_csv(ratings_csv_path)

In [3]:
# Peek at the first five rows of the raw ratings data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [4]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [6]:
# Remove the 'Unnamed: 0' column (it looks like a saved row index — its values
# mirror the row numbers in the preview above).
# Re-running this cell raises KeyError because the column is already gone.
df_copy = df_copy.drop(columns='Unnamed: 0')

In [7]:
# The rating timestamp is not used by any of the recommenders below.
df_copy = df_copy.drop(columns='Timestamp')

In [8]:
# Confirm that only UserId, ProductId and Rating remain.
df_copy.head(n=5)

Unnamed: 0,UserId,ProductId,Rating
0,A39HTATAQ9V7YF,205616461,5.0
1,A3JM6GV9MNOF9X,558925278,3.0
2,A1Z513UWSAAO0F,558925278,5.0
3,A1WMRR494NWEWV,733001998,4.0
4,A3IAAVS479H7M7,737104473,1.0


## Popularity Based Recommendation System 

In [9]:
# Per product: how many non-null UserId and Rating entries exist,
# i.e. how many ratings the product has received.
df_copy.groupby('ProductId').agg('count')

Unnamed: 0_level_0,UserId,Rating
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1
0205616461,1,1
0558925278,2,2
0733001998,1,1
0737104473,1,1
0762451459,1,1
...,...,...
B0002VNKRM,17,17
B0002VNKSG,5,5
B0002VNLG2,2,2
B0002VNZS6,2,2


In [10]:
# Number of ratings received by each product, as a two-column frame
# (ProductId, num_rating).
num_rating_df = (
    df_copy.groupby('ProductId')['Rating']
    .count()
    .rename('num_rating')
    .reset_index()
)
num_rating_df

Unnamed: 0,ProductId,num_rating
0,0205616461,1
1,0558925278,2
2,0733001998,1
3,0737104473,1
4,0762451459,1
...,...,...
6379,B0002VNKRM,17
6380,B0002VNKSG,5
6381,B0002VNLG2,2
6382,B0002VNZS6,2


In [11]:
# Average rating of each product, as a two-column frame (ProductId, avg_rating).
# 'Rating' is selected *before* aggregating: calling .mean() on the whole group
# would also try to average the string column 'UserId', which raises a
# TypeError on pandas >= 2.0 (numeric_only now defaults to False) and does
# needless work on older versions.
avg_rating_df = (
    df_copy.groupby('ProductId')['Rating']
    .mean()
    .rename('avg_rating')
    .reset_index()
)
avg_rating_df

Unnamed: 0,ProductId,avg_rating
0,0205616461,5.000000
1,0558925278,4.000000
2,0733001998,4.000000
3,0737104473,1.000000
4,0762451459,5.000000
...,...,...
6379,B0002VNKRM,4.294118
6380,B0002VNKSG,5.000000
6381,B0002VNLG2,4.000000
6382,B0002VNZS6,3.000000


In [12]:
# Combine rating count and average rating per product (inner join on ProductId).
popular_df = pd.merge(num_rating_df, avg_rating_df, on='ProductId')
popular_df

Unnamed: 0,ProductId,num_rating,avg_rating
0,0205616461,1,5.000000
1,0558925278,2,4.000000
2,0733001998,1,4.000000
3,0737104473,1,1.000000
4,0762451459,1,5.000000
...,...,...,...
6379,B0002VNKRM,17,4.294118
6380,B0002VNKSG,5,5.000000
6381,B0002VNLG2,2,4.000000
6382,B0002VNZS6,2,3.000000


In [13]:
# Only products that have received at least 5 ratings are eligible for recommendation.
popular_df.loc[popular_df['num_rating'].ge(5)]

Unnamed: 0,ProductId,num_rating,avg_rating
19,1304351475,16,3.562500
22,1304482685,6,3.833333
50,1403790965,11,4.909091
51,1412759676,6,4.833333
62,3227001381,25,4.560000
...,...,...,...
6377,B0002VNKPY,17,3.882353
6378,B0002VNKQS,9,3.222222
6379,B0002VNKRM,17,4.294118
6380,B0002VNKSG,5,5.000000


In [14]:
# Eligible products (at least 5 ratings), ordered from highest to lowest average rating.
(
    popular_df
    .loc[lambda frame: frame['num_rating'] >= 5]
    .sort_values(by='avg_rating', ascending=False)
)

Unnamed: 0,ProductId,num_rating,avg_rating
1646,B000066B1B,14,5.000000
4663,B00021VU8S,5,5.000000
2967,B00014H8L4,5,5.000000
6212,B0002SH5E4,5,5.000000
2983,B00014IXAY,5,5.000000
...,...,...,...
5705,B0002FCA92,6,1.833333
1579,B00005YU0D,5,1.800000
5911,B0002JBU1M,5,1.800000
1393,B00005CDRP,6,1.166667


In [15]:
# Keep the five best-rated products among those with at least 5 ratings.
# NOTE(review): several products tie at the top average; which of them land in
# the top five depends on the sort's handling of equal keys.
popular_df = (
    popular_df
    .loc[popular_df['num_rating'] >= 5]
    .sort_values(by='avg_rating', ascending=False)
    .head(5)
)
popular_df

Unnamed: 0,ProductId,num_rating,avg_rating
1646,B000066B1B,14,5.0
4663,B00021VU8S,5,5.0
2967,B00014H8L4,5,5.0
6212,B0002SH5E4,5,5.0
2983,B00014IXAY,5,5.0


## Collaborative filtering (item-based) recommendation system

In [16]:
# Number of ratings submitted by each user, as a two-column frame
# (UserId, unum_rating).
unum_rating_df = (
    df_copy.groupby('UserId')['Rating']
    .count()
    .rename('unum_rating')
    .reset_index()
)
unum_rating_df

Unnamed: 0,UserId,unum_rating
0,A00205921JHJK5X9LNP42,1
1,A00473363TJ8YSZ3YAGG9,2
2,A00700212KB3K0MVESPIY,1
3,A0081289HG0BXFQJQUWW,1
4,A01247753D6GFZD87MUV8,1
...,...,...
91651,AZZTJQ7CQZUD8,1
91652,AZZVCBG5G4EV8,1
91653,AZZWJ3LICUEKJ,1
91654,AZZWPNME0GQZ2,1


In [17]:
# Keep only the users who have given at least 2 ratings.
x = df_copy.groupby('UserId')['Rating'].count().gt(1)
good_users = x.loc[x].index
good_users

Index(['A00473363TJ8YSZ3YAGG9', 'A0733801U845JJGV4H8V', 'A0908131Z7BWYSMRQ16T',
       'A099766128UI0NCS98N1E', 'A1008GFLTBL76H', 'A100V5X97DUOXV',
       'A100VQNP6I54HS', 'A1016DX7QKYBDC', 'A103D23F4SGCJY', 'A103U3KR4L2ZXT',
       ...
       'AZTX0RHQ1O9CB', 'AZTX1196S2WR9', 'AZU3L6HCEFG7B', 'AZUI6YY673GW5',
       'AZVYUXWLR63AZ', 'AZW9UD7Z7J0WL', 'AZWXG6KBXXC2N', 'AZZHB6U54UDYW',
       'AZZNK89PXD006', 'AZZQXL8VDCFTV'],
      dtype='object', name='UserId', length=6014)

In [18]:
# Ratings made by the retained users only.
filtered_rating = df_copy.loc[df_copy['UserId'].isin(good_users)]
filtered_rating

Unnamed: 0,UserId,ProductId,Rating
4,A3IAAVS479H7M7,0737104473,1.0
5,AKJHHD5VEH7VG,0762451459,5.0
6,A1BG8QW55XHN6U,1304139212,5.0
7,A22VW0P4VZHDE3,1304139220,5.0
10,A1BG8QW55XHN6U,130414643X,5.0
...,...,...,...
99742,A2P5CDZF56NLVL,B0002VNKRM,5.0
99746,A1ASSKIYJ4Z9K8,B0002VNKSG,5.0
99828,A6WV0OPKDYP2F,B0002VQ0WO,4.0
99848,A70G84Z95RU3N,B0002VQ0WO,5.0


In [19]:
# Number of ratings each product has left after the user filter.
filtered_rating.groupby('ProductId')['Rating'].count()

ProductId
0737104473    1
0762451459    1
1304139212    1
1304139220    1
130414643X    1
             ..
B0002VNKPY    3
B0002VNKQS    1
B0002VNKRM    7
B0002VNKSG    1
B0002VQ0WO    3
Name: Rating, Length: 3091, dtype: int64

In [20]:
# Products that have at least 1 rating in the filtered data.
# NOTE(review): every product that appears in `filtered_rating` with a non-null
# rating already satisfies this, so the threshold removes nothing here (the
# index length equals the number of products counted in the previous cell).
# Raise it (e.g. `.gt(1)`) if rarely rated products are meant to be dropped.
y = filtered_rating.groupby('ProductId')['Rating'].count().ge(1)
famous_resto = y.loc[y].index
famous_resto

Index(['0737104473', '0762451459', '1304139212', '1304139220', '130414643X',
       '1304146537', '130414674X', '1304168522', '1304174778', '1304174867',
       ...
       'B0002VHBTU', 'B0002VHOBK', 'B0002VJIH8', 'B0002VJTGS', 'B0002VNKPO',
       'B0002VNKPY', 'B0002VNKQS', 'B0002VNKRM', 'B0002VNKSG', 'B0002VQ0WO'],
      dtype='object', name='ProductId', length=3091)

In [21]:
# Restrict the ratings to the retained products.
final_rating = filtered_rating.loc[filtered_rating['ProductId'].isin(famous_resto)]

In [22]:
# drop_duplicates() returns a new frame instead of modifying the original, so
# the result has to be assigned for the de-duplication to reach later cells.
final_rating = final_rating.drop_duplicates()
final_rating

Unnamed: 0,UserId,ProductId,Rating
4,A3IAAVS479H7M7,0737104473,1.0
5,AKJHHD5VEH7VG,0762451459,5.0
6,A1BG8QW55XHN6U,1304139212,5.0
7,A22VW0P4VZHDE3,1304139220,5.0
10,A1BG8QW55XHN6U,130414643X,5.0
...,...,...,...
99742,A2P5CDZF56NLVL,B0002VNKRM,5.0
99746,A1ASSKIYJ4Z9K8,B0002VNKSG,5.0
99828,A6WV0OPKDYP2F,B0002VQ0WO,4.0
99848,A70G84Z95RU3N,B0002VQ0WO,5.0


In [23]:
# Build the product x user rating matrix: one row per ProductId, one column per
# UserId. A cell holds the rating (the mean if the pair occurs more than once)
# and is NaN where the user has not rated the product.
pt = final_rating.pivot_table(
    values='Rating',
    index='ProductId',
    columns='UserId',
    aggfunc='mean',
)
pt

UserId,A00473363TJ8YSZ3YAGG9,A0733801U845JJGV4H8V,A0908131Z7BWYSMRQ16T,A099766128UI0NCS98N1E,A1008GFLTBL76H,A100V5X97DUOXV,A100VQNP6I54HS,A1016DX7QKYBDC,A103D23F4SGCJY,A103U3KR4L2ZXT,...,AZTX0RHQ1O9CB,AZTX1196S2WR9,AZU3L6HCEFG7B,AZUI6YY673GW5,AZVYUXWLR63AZ,AZW9UD7Z7J0WL,AZWXG6KBXXC2N,AZZHB6U54UDYW,AZZNK89PXD006,AZZQXL8VDCFTV
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0737104473,,,,,,,,,,,...,,,,,,,,,,
0762451459,,,,,,,,,,,...,,,,,,,,,,
1304139212,,,,,,,,,,,...,,,,,,,,,,
1304139220,,,,,,,,,,,...,,,,,,,,,,
130414643X,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0002VNKPY,,,,,,,,,,,...,,,,,,,,,,
B0002VNKQS,,,,,,,,,,,...,,,,,,,,,,
B0002VNKRM,,,,,,,,,,,...,,,,,,,,,,
B0002VNKSG,,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Treat "not rated" as 0 so the matrix contains no NaN before it is passed to
# the pairwise similarity/distance functions.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that an
# earlier cell has already displayed; the resulting `pt` is the same.
pt = pt.fillna(0)
pt

UserId,A00473363TJ8YSZ3YAGG9,A0733801U845JJGV4H8V,A0908131Z7BWYSMRQ16T,A099766128UI0NCS98N1E,A1008GFLTBL76H,A100V5X97DUOXV,A100VQNP6I54HS,A1016DX7QKYBDC,A103D23F4SGCJY,A103U3KR4L2ZXT,...,AZTX0RHQ1O9CB,AZTX1196S2WR9,AZU3L6HCEFG7B,AZUI6YY673GW5,AZVYUXWLR63AZ,AZW9UD7Z7J0WL,AZWXG6KBXXC2N,AZZHB6U54UDYW,AZZNK89PXD006,AZZQXL8VDCFTV
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0737104473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0762451459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1304139212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1304139220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130414643X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B0002VNKPY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0002VNKQS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0002VNKRM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0002VNKSG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Pairwise cosine similarity between the product rating vectors (rows of `pt`).
# `df2` wraps the resulting square matrix for display; its row/column labels are
# positions 0..n-1 in the row order of `pt`, not ProductIds.
from sklearn.metrics.pairwise import cosine_similarity
similarity_score = cosine_similarity(X=pt)
df2 = pd.DataFrame(data=similarity_score)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3081,3082,3083,3084,3085,3086,3087,3088,3089,3090
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.707107,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.707107,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.707107,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.131306,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
# Pairwise Euclidean distance between the product rating vectors (rows of `pt`).
# Despite the variable name this is a distance: 0 on the diagonal, and smaller
# values mean the two products were rated more alike.
from sklearn.metrics.pairwise import euclidean_distances
similarity_score_2 = euclidean_distances(X=pt)
df3 = pd.DataFrame(data=similarity_score_2)
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3081,3082,3083,3084,3085,3086,3087,3088,3089,3090
0,0.000000,5.099020,5.099020,5.099020,5.099020,5.099020,5.099020,7.141428,5.099020,7.141428,...,7.681146,3.162278,12.845233,4.123106,4.123106,5.000000,1.414214,11.313708,5.099020,8.185353
1,5.099020,0.000000,7.071068,7.071068,7.071068,7.071068,7.071068,8.660254,7.071068,8.660254,...,9.110434,5.830952,13.747727,6.403124,6.403124,7.000000,5.099020,12.328828,7.071068,9.539392
2,5.099020,7.071068,0.000000,7.071068,0.000000,7.071068,7.071068,8.660254,7.071068,5.000000,...,9.110434,5.830952,13.747727,6.403124,6.403124,7.000000,5.099020,12.328828,7.071068,9.539392
3,5.099020,7.071068,7.071068,0.000000,7.071068,7.071068,0.000000,8.660254,0.000000,5.000000,...,9.110434,5.830952,13.747727,6.403124,6.403124,7.000000,5.099020,12.328828,7.071068,9.539392
4,5.099020,7.071068,0.000000,7.071068,0.000000,7.071068,7.071068,8.660254,7.071068,5.000000,...,9.110434,5.830952,13.747727,6.403124,6.403124,7.000000,5.099020,12.328828,7.071068,9.539392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3086,5.000000,7.000000,7.000000,7.000000,7.000000,7.000000,7.000000,8.602325,7.000000,8.602325,...,9.055385,5.744563,13.711309,6.324555,6.324555,0.000000,5.000000,12.288206,7.000000,9.486833
3087,1.414214,5.099020,5.099020,5.099020,5.099020,5.099020,5.099020,7.141428,5.099020,7.141428,...,7.549834,3.162278,12.845233,4.123106,4.123106,5.000000,0.000000,11.313708,5.099020,8.185353
3088,11.313708,12.328828,12.328828,12.328828,12.328828,12.328828,12.328828,13.304135,12.328828,13.304135,...,13.601471,11.661904,17.058722,11.958261,11.958261,12.288206,11.313708,0.000000,12.328828,13.892444
3089,5.099020,7.071068,7.071068,7.071068,7.071068,7.071068,7.071068,8.660254,7.071068,8.660254,...,9.110434,5.830952,13.747727,6.403124,6.403124,7.000000,5.099020,12.328828,0.000000,9.539392


In [27]:
#Recommend a Product