# Problem Statement

This notebook explores two different collaborative filtering mechanism:
1. User-User collaborative filtering
2. Item-Item collaborative filtering

In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
import pickle


In [16]:
df = pd.read_csv("sample30.csv")
df.head()

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment
0,AV13O1A8GV-KLJ3akUyj,Universal Music,"Movies, Music & Books,Music,R&b,Movies & TV,Mo...",Universal Music Group / Cash Money,Pink Friday: Roman Reloaded Re-Up (w/dvd),2012-11-30T06:21:45.000Z,,,5,i love this album. it's very good. more to the...,Just Awesome,Los Angeles,,joshua,Positive
1,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor. This review was collected as part...,Good,,,dorothy w,Positive
2,AV14LG0R-jtxr-f38QfS,Lundberg,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co...",Lundberg,Lundberg Organic Cinnamon Toast Rice Cakes,2017-07-09T00:00:00.000Z,True,,5,Good flavor.,Good,,,dorothy w,Positive
3,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,K-Y Love Sensuality Pleasure Gel,2016-01-06T00:00:00.000Z,False,False,1,I read through the reviews on here before look...,Disappointed,,,rebecca,Negative
4,AV16khLE-jtxr-f38VFn,K-Y,"Personal Care,Medicine Cabinet,Lubricant/Sperm...",K-Y,K-Y Love Sensuality Pleasure Gel,2016-12-21T00:00:00.000Z,False,False,1,My husband bought this gel for us. The gel cau...,Irritation,,,walker557,Negative


In [18]:
# removing duplicates
df =df.dropna(axis=0,subset=['reviews_username','reviews_rating'])
df=df.drop_duplicates(subset=['reviews_username'])

In [19]:
df['reviews_rating']=df['reviews_rating'].astype(float)

In [6]:
train,test = train_test_split(df,test_size=0.3,random_state=42)

In [7]:
train.head()

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment
14499,AVpf3VOfilAPnD_xjpun,Clorox,"Household Essentials,Cleaning Supplies,Kitchen...",Clorox,Clorox Disinfecting Wipes Value Pack Scented 1...,2014-12-04T00:00:00.000Z,False,True,5.0,this clorox wipe cleans good. it can wipe ever...,good,,,lilxkrnxqt,Positive
16857,AVpf9pzn1cnluZ0-uNTM,Lundberg,"Food,Packaged Foods,Packaged Grains,Rice,Brown...",Lundberg Family Farms,"Lundberg Wehani Rice, 25lb",2017-03-28T00:00:00.000Z,,,5.0,fabulous item,Five Stars,,,byjean l'heureux,Positive
15730,AVpf5Z1zLJeJML43FpB-,Lysol,"Household Chemicals,Household Cleaners,Disinfe...",Reckitt Benckiser,"Lysol Concentrate Deodorizing Cleaner, Origina...",2012-05-02T00:00:00.000Z,False,True,5.0,"clean with this in bathrooms, kitchen, and was...",product is the best,,,grammy,Positive
27135,AVpfRoqcLJeJML436FqG,Vicks,"Personal Care,Medicine Cabinet,Cough/Cold,Heal...",P&G,"Vicks Vaporub, Regular, 3.53oz",2017-04-01T00:00:00.000Z,False,False,1.0,I purchased this and realized upon opening tha...,Jar was not sealed.,,,phantompunk1317,Negative
2704,AVpe59io1cnluZ0-ZgDU,Universal Home Video,"Movies, Music & Books,Movies,Comedy,Movies & T...",Universal,My Big Fat Greek Wedding 2 (blu-Ray + Dvd + Di...,2016-08-06T00:00:00.000Z,,True,4.0,"Its not as good as 1 st, but still enjoyed it....",Cute,,,trisha,Positive


## User-User colloborative filtering

In [8]:
# Creating a pivot table
df_pivot = train.pivot(
    index='reviews_username',
    columns='id',
    values='reviews_rating'
).fillna(0)

df_pivot.head(3)

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfpoUCLJeJML43BLXv,AVpfqW4WilAPnD_xf7a_,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01impala,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02dakota,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# normalizing the pivot table by subtracting from the mean
mean = np.nanmean(df_pivot,axis=1)
mean.shape

(17439,)

In [10]:
# normalizing the pivot table as most of users have not rated
df_subtracted=df_pivot-mean[:,None]

In [11]:
df_subtracted

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfpoUCLJeJML43BLXv,AVpfqW4WilAPnD_xf7a_,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,...,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260
01impala,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,...,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195,-0.012195
02dakota,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,...,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325
02deuce,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,...,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260,-0.016260
0325home,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,...,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxcsdfd,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,...,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325
zxjki,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,...,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130,-0.008130
zyiah4,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,...,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325
zzdiane,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,...,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325,-0.020325


In [12]:
# finding user correlation using cosine similarity
user_correlation =1- (pairwise_distances(df_subtracted.fillna(0),metric='cosine'))
#user_correlation[np.isnan(user_correlation)]=0
user_correlation.shape

(17439, 17439)

In [13]:
user_correlation

array([[ 1.        , -0.00408163, -0.00408163, ..., -0.00408163,
        -0.00408163, -0.00408163],
       [-0.00408163,  1.        ,  1.        , ..., -0.00408163,
        -0.00408163, -0.00408163],
       [-0.00408163,  1.        ,  1.        , ..., -0.00408163,
        -0.00408163, -0.00408163],
       ...,
       [-0.00408163, -0.00408163, -0.00408163, ...,  1.        ,
        -0.00408163,  1.        ],
       [-0.00408163, -0.00408163, -0.00408163, ..., -0.00408163,
         1.        , -0.00408163],
       [-0.00408163, -0.00408163, -0.00408163, ...,  1.        ,
        -0.00408163,  1.        ]])

In [14]:
# to predict the ratings we need to create a dummy train where 1 is assigned to rating not given by user
dummy_train = train.copy()
dummy_train['reviews_rating'] = dummy_train['reviews_rating'].apply(lambda x:0 if x>=1 else 1)

In [15]:
dummy_train = dummy_train.pivot(index='reviews_username',columns='id',values='reviews_rating').fillna(1)
dummy_train.head()

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfpoUCLJeJML43BLXv,AVpfqW4WilAPnD_xf7a_,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
01impala,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
02dakota,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
02deuce,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0325home,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# predict user rating  - dot product of user correlation and pivot table
user_predicted_ratings = np.dot(user_correlation,df_subtracted.fillna(0))

In [17]:
# final ratings
final_ratings = np.multiply(user_predicted_ratings,dummy_train)

In [18]:
final_ratings

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfpoUCLJeJML43BLXv,AVpfqW4WilAPnD_xf7a_,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,1.025452,1.025452,0.792799,-2.929650,1.025452,0.841779,0.841779,0.041779,0.539738,0.719330,...,1.029534,0.678513,0.837697,0.674432,0.666268,0.890758,0.984636,0.976473,-1.978630,0.890758
01impala,-34.288834,-34.288834,-34.521487,-38.243936,-34.288834,-34.472507,-34.472507,-35.272507,-34.774548,-34.594956,...,-34.284752,-34.635772,-34.476589,-34.639854,-34.648017,-34.423527,-34.329650,-34.337813,-37.292915,-34.423527
02dakota,-34.288834,-34.288834,-34.521487,-38.243936,-34.288834,-34.472507,-34.472507,-35.272507,-34.774548,-34.594956,...,-34.284752,-34.635772,-34.476589,-34.639854,-34.648017,-34.423527,-34.329650,-34.337813,-37.292915,-34.423527
02deuce,-34.288834,-34.288834,-34.521487,-38.243936,-34.288834,-34.472507,-34.472507,-35.272507,-34.774548,-34.594956,...,-34.284752,-34.635772,-34.476589,-34.639854,-34.648017,-34.423527,-34.329650,-34.337813,-37.292915,-34.423527
0325home,-104.362303,-104.362303,-104.594956,-108.317405,-104.362303,-104.545976,-104.545976,-105.345976,-104.848017,-104.668425,...,-104.358221,-104.709242,-104.550058,-104.713323,-104.721487,-104.496997,-104.403119,-104.411283,-107.366385,-104.496997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxcsdfd,-104.362303,-104.362303,-104.594956,-108.317405,-104.362303,-104.545976,-104.545976,-105.345976,-104.848017,-104.668425,...,-104.358221,-104.709242,-104.550058,-104.713323,-104.721487,-104.496997,-104.403119,-104.411283,-107.366385,-104.496997
zxjki,-104.362303,-104.362303,-104.594956,-108.317405,-104.362303,-104.545976,-104.545976,-105.345976,-104.848017,-104.668425,...,-104.358221,-104.709242,-104.550058,-104.713323,-104.721487,-104.496997,-104.403119,-104.411283,-107.366385,-104.496997
zyiah4,-104.362303,-104.362303,-104.594956,-108.317405,-104.362303,-104.545976,-104.545976,-105.345976,-104.848017,-104.668425,...,-104.358221,-104.709242,-104.550058,-104.713323,-104.721487,-104.496997,-104.403119,-104.411283,-107.366385,-104.496997
zzdiane,-5.815364,-5.815364,-6.048017,-9.770466,-5.815364,-5.999038,-5.999038,-6.799038,-6.301078,-6.121487,...,-5.811283,-6.162303,-6.003119,-6.166385,-6.174548,-5.950058,-5.856181,-5.864344,-8.819446,-5.950058


In [19]:
# top 10 recommendations for user 00dog3
final_ratings.loc['00dog3'].sort_values(ascending=False)[0:10]

id
AVpfRxSkilAPnD_xYrzm    1.041779
AVpfozgyilAPnD_xfe0r    1.041779
AVpfDflP1cnluZ0-c5RJ    1.041779
AVpe625_ilAPnD_xQ4SP    1.037697
AVpe4-GPLJeJML43xmuY    1.037697
AVpf0pfrilAPnD_xi6s_    1.033615
AVpe-MCY1cnluZ0-bCv_    1.033615
AVpe8q4T1cnluZ0-afct    1.033615
AVpf-bS61cnluZ0-uZcq    1.033615
AVpfF7k8ilAPnD_xUyWf    1.033615
Name: 00dog3, dtype: float64

In [20]:
common = test[test.reviews_username.isin(train.reviews_username)]
common
# there are no users common in both train and test

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment


## Item-Item colloborative filtering

In [21]:
item_pivot = train.pivot(index='reviews_username',columns='id',values='reviews_rating').fillna(0).T
item_pivot.shape

(246, 17439)

In [22]:
mean = np.nanmean(item_pivot,axis=1)
mean.shape

(246,)

In [23]:
df_subtracted = item_pivot-mean[:,None]
df_subtracted.shape

(246, 17439)

In [24]:
df_subtracted

reviews_username,00dog3,01impala,02dakota,02deuce,0325home,06stidriver,08dallas,09mommy11,1.11E+24,10ten,...,zt313,zubb,zulaa118,zuttle,zwithanx,zxcsdfd,zxjki,zyiah4,zzdiane,zzz1127
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AV13O1A8GV-KLJ3akUyj,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,...,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287
AV14LG0R-jtxr-f38QfS,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,...,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287
AV16khLE-jtxr-f38VFn,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,...,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555,-0.003555
AV1YGDqsGV-KLJ3adc-O,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,...,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852,-0.055852
AV1YIch7GV-KLJ3addeG,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,...,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287,-0.000287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AVpfrfHF1cnluZ0-pRai,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,...,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179,-0.002179
AVpfrgjFLJeJML43BvCc,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,...,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860,-0.000860
AVpfthSailAPnD_xg3ON,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,...,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975,-0.000975
AVpftikC1cnluZ0-p31V,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,...,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491,-0.042491


In [25]:
# Finding the cosine similarity
item_correlation = 1- pairwise_distances(df_subtracted,metric='cosine')
item_correlation

array([[ 1.00000000e+00, -5.73460259e-05, -2.27710946e-04, ...,
        -1.12578939e-04, -7.33987309e-04, -1.81642774e-04],
       [-5.73460259e-05,  1.00000000e+00, -2.27710946e-04, ...,
        -1.12578939e-04, -7.33987309e-04, -1.81642774e-04],
       [-2.27710946e-04, -2.27710946e-04,  1.00000000e+00, ...,
        -4.47031092e-04, -2.91453403e-03, -7.21271389e-04],
       ...,
       [-1.12578939e-04, -1.12578939e-04, -4.47031092e-04, ...,
         1.00000000e+00, -1.44092831e-03, -3.56592290e-04],
       [-7.33987309e-04, -7.33987309e-04, -2.91453403e-03, ...,
        -1.44092831e-03,  1.00000000e+00, -2.32489503e-03],
       [-1.81642774e-04, -1.81642774e-04, -7.21271389e-04, ...,
        -3.56592290e-04, -2.32489503e-03,  1.00000000e+00]])

In [26]:
item_predicted_rating = np.dot(df_subtracted.fillna(0).T, item_correlation)
item_predicted_rating

array([[ 0.00997771,  0.00997771,  0.03719983, ...,  0.01917562,
         0.08848724,  0.03033221],
       [ 0.00281698,  0.00281698,  0.00876581, ...,  0.00511801,
        -0.00316496,  0.00765067],
       [-0.00253178, -0.00253178, -0.01247316, ..., -0.00538241,
        -0.07162514, -0.00929144],
       ...,
       [-0.01427711, -0.01427711, -0.05911181, ..., -0.02844028,
        -0.22195685, -0.04649463],
       [ 0.00522794,  0.00522794,  0.01833932, ...,  0.00985109,
         0.0276936 ,  0.01528737],
       [-0.00925367, -0.00925367, -0.03916459, ..., -0.0185785 ,
        -0.15766042, -0.03058294]])

In [27]:
final_rating = np.multiply(item_predicted_rating, dummy_train)
final_rating

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfpoUCLJeJML43BLXv,AVpfqW4WilAPnD_xf7a_,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,0.009978,0.009978,0.037200,0.092773,0.009978,0.030982,0.030835,0.061824,0.044197,0.036901,...,0.010035,0.039303,0.031088,0.038782,0.039338,0.026738,0.016919,0.019176,0.088487,0.030332
01impala,0.002817,0.002817,0.008766,-0.011384,0.002817,0.007367,0.007322,0.008822,0.008396,0.007955,...,0.002874,0.008280,0.007359,0.008082,0.008171,0.006564,0.004516,0.005118,-0.003165,0.007651
02dakota,-0.002532,-0.002532,-0.012473,-0.089185,-0.002532,-0.010273,-0.010241,-0.030768,-0.018345,-0.013667,...,-0.002474,-0.014893,-0.010366,-0.014849,-0.015110,-0.008505,-0.004749,-0.005382,-0.071625,-0.009291
02deuce,0.000143,0.000143,-0.001854,-0.050285,0.000143,-0.001453,-0.001459,-0.010973,-0.004974,-0.002856,...,0.000200,-0.003306,-0.001504,-0.003383,-0.003470,-0.000970,-0.000117,-0.000132,-0.037395,-0.000820
0325home,-0.014277,-0.014277,-0.059112,-0.260029,-0.014277,-0.049007,-0.048807,-0.117704,-0.077066,-0.061146,...,-0.014220,-0.065777,-0.049287,-0.065203,-0.066233,-0.041595,-0.025094,-0.028440,-0.221957,-0.046495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxcsdfd,-0.014277,-0.014277,-0.059112,-0.260029,-0.014277,-0.049007,-0.048807,-0.117704,-0.077066,-0.061146,...,-0.014220,-0.065777,-0.049287,-0.065203,-0.066233,-0.041595,-0.025094,-0.028440,-0.221957,-0.046495
zxjki,0.000793,0.000793,0.000730,-0.040821,0.000793,0.000693,0.000677,-0.006157,-0.001721,-0.000226,...,0.000851,-0.000488,0.000652,-0.000594,-0.000638,0.000863,0.001010,0.001145,-0.029068,0.001240
zyiah4,-0.014277,-0.014277,-0.059112,-0.260029,-0.014277,-0.049007,-0.048807,-0.117704,-0.077066,-0.061146,...,-0.014220,-0.065777,-0.049287,-0.065203,-0.066233,-0.041595,-0.025094,-0.028440,-0.221957,-0.046495
zzdiane,0.005228,0.005228,0.018339,0.023685,0.005228,0.015318,0.015239,0.026668,0.020450,0.017701,...,0.005285,0.018725,0.015348,0.018419,0.018664,0.013357,0.008692,0.009851,0.027694,0.015287


In [28]:
final_rating.loc['00dog3'].sort_values(ascending=False)[0:5]

id
AVpfm8yiLJeJML43AYyu    0.117951
AVpe41TqilAPnD_xQH3d    0.102767
AVpfW8y_LJeJML437ySW    0.102699
AVpe59io1cnluZ0-ZgDU    0.098938
AVpf2tw1ilAPnD_xjflC    0.096663
Name: 00dog3, dtype: float64

### Evaluating item-item collaborative filtering

In [29]:
common = test[test.id.isin(train.id)]
common.head()

Unnamed: 0,id,brand,categories,manufacturer,name,reviews_date,reviews_didPurchase,reviews_doRecommend,reviews_rating,reviews_text,reviews_title,reviews_userCity,reviews_userProvince,reviews_username,user_sentiment
6140,AVpf2tw1ilAPnD_xjflC,Summit Entertainment,"Movies & TV Shows,Instawatch Movies By VUDU,Sh...",Summit Entertainment,Red (special Edition) (dvdvideo),2016-04-08T00:00:00.000Z,,True,5.0,This movie is a great comedy with a brilliant ...,Great movie,,,hewhois,Positive
3285,AVpe5JOgilAPnD_xQPfE,Sony Music,"Movies, Music & Books,Music,Rock,Music on CD o...",Columbia,The Script - No Sound Without Silence (cd),2014-12-27T00:00:00Z,,,1.0,I bought it for the 1 song hoping there were o...,bad just bad,,,george67,Negative
22409,AVpfOmKwLJeJML435GM7,Clear Scalp & Hair Therapy,"Personal Care,Hair Care,Shampoo,Featured Brand...",Clear,Clear Scalp & Hair Therapy Total Care Nourishi...,2016-10-08T00:00:00.000Z,False,False,1.0,I used to have horrible dandruff. I purchased ...,New formula disappoints big time,,,lizzy,Negative
2064,AVpe41TqilAPnD_xQH3d,FOX,"Movies & TV Shows,Movies,Romance,Romantic Come...",Twentieth Century Fox,Mike Dave Need Wedding Dates (dvd + Digital),2017-01-06T00:00:00.000Z,,True,4.0,Any fan of outrages and over the top comedy wi...,An underrated comedy gem....,,,tf99,Positive
15003,AVpf3VOfilAPnD_xjpun,Clorox,"Household Essentials,Cleaning Supplies,Kitchen...",Clorox,Clorox Disinfecting Wipes Value Pack Scented 1...,2014-12-05T00:00:00.000Z,False,True,5.0,I love using clorox products!!!!! They smell g...,Clorox,,,stack,Positive


In [30]:
common_df_pivot = common.pivot(index='reviews_username',columns='id',values='reviews_rating').fillna(0).T

In [31]:
import pandas as pd
item_correlation_df = pd.DataFrame(item_correlation)

In [32]:
item_correlation_df['id']=df_subtracted.index
item_correlation_df.set_index('id',inplace=True)
item_correlation_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,236,237,238,239,240,241,242,243,244,245
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AV13O1A8GV-KLJ3akUyj,1.0,-5.7e-05,-0.000228,-0.000834,-5.7e-05,-0.000189,-0.000188,-0.000424,-0.000287,-0.000232,...,-5.7e-05,-0.000248,-0.00019,-0.000246,-0.00025,-0.000162,-9.9e-05,-0.000113,-0.000734,-0.000182
AV14LG0R-jtxr-f38QfS,-5.7e-05,1.0,-0.000228,-0.000834,-5.7e-05,-0.000189,-0.000188,-0.000424,-0.000287,-0.000232,...,-5.7e-05,-0.000248,-0.00019,-0.000246,-0.00025,-0.000162,-9.9e-05,-0.000113,-0.000734,-0.000182
AV16khLE-jtxr-f38VFn,-0.000228,-0.000228,1.0,-0.003312,-0.000228,-0.000751,-0.000748,-0.001685,-0.001138,-0.00092,...,-0.000228,-0.000987,-0.000755,-0.000976,-0.000991,-0.000642,-0.000394,-0.000447,-0.002915,-0.000721
AV1YGDqsGV-KLJ3adc-O,-0.000834,-0.000834,-0.003312,1.0,-0.000834,-0.002751,-0.002739,-0.006174,-0.00417,-0.003372,...,-0.000834,-0.003614,-0.002764,-0.003576,-0.003631,-0.00235,-0.001445,-0.001638,-0.010676,-0.002642
AV1YIch7GV-KLJ3addeG,-5.7e-05,-5.7e-05,-0.000228,-0.000834,1.0,-0.000189,-0.000188,-0.000424,-0.000287,-0.000232,...,-5.7e-05,-0.000248,-0.00019,-0.000246,-0.00025,-0.000162,-9.9e-05,-0.000113,-0.000734,-0.000182


In [33]:
item_correlation_df.columns = df_subtracted.index

In [34]:
item_correlation_df.head()

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfpoUCLJeJML43BLXv,AVpfqW4WilAPnD_xf7a_,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfvieo1cnluZ0-qdnu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AV13O1A8GV-KLJ3akUyj,1.0,-5.7e-05,-0.000228,-0.000834,-5.7e-05,-0.000189,-0.000188,-0.000424,-0.000287,-0.000232,...,-5.7e-05,-0.000248,-0.00019,-0.000246,-0.00025,-0.000162,-9.9e-05,-0.000113,-0.000734,-0.000182
AV14LG0R-jtxr-f38QfS,-5.7e-05,1.0,-0.000228,-0.000834,-5.7e-05,-0.000189,-0.000188,-0.000424,-0.000287,-0.000232,...,-5.7e-05,-0.000248,-0.00019,-0.000246,-0.00025,-0.000162,-9.9e-05,-0.000113,-0.000734,-0.000182
AV16khLE-jtxr-f38VFn,-0.000228,-0.000228,1.0,-0.003312,-0.000228,-0.000751,-0.000748,-0.001685,-0.001138,-0.00092,...,-0.000228,-0.000987,-0.000755,-0.000976,-0.000991,-0.000642,-0.000394,-0.000447,-0.002915,-0.000721
AV1YGDqsGV-KLJ3adc-O,-0.000834,-0.000834,-0.003312,1.0,-0.000834,-0.002751,-0.002739,-0.006174,-0.00417,-0.003372,...,-0.000834,-0.003614,-0.002764,-0.003576,-0.003631,-0.00235,-0.001445,-0.001638,-0.010676,-0.002642
AV1YIch7GV-KLJ3addeG,-5.7e-05,-5.7e-05,-0.000228,-0.000834,1.0,-0.000189,-0.000188,-0.000424,-0.000287,-0.000232,...,-5.7e-05,-0.000248,-0.00019,-0.000246,-0.00025,-0.000162,-9.9e-05,-0.000113,-0.000734,-0.000182


In [35]:
item_correlation_1=item_correlation_df[item_correlation_df.columns.isin(common.id)]

In [36]:
item_correlation_2=item_correlation_1.T[item_correlation_1.T.index.isin(common.id)]

In [37]:
item_correlation_3=item_correlation_2.T

In [38]:
common_item_predicted_ratings = np.dot(item_correlation_3,common_df_pivot)

In [39]:
dummy_test = common.copy()
dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='id', values='reviews_rating').T.fillna(0)

common_item_predicted_ratings = np.multiply(common_item_predicted_ratings,dummy_test)

In [40]:
X= common_item_predicted_ratings.copy()

In [41]:
X[X<0]=0

In [42]:
from sklearn.preprocessing import MinMaxScaler

In [43]:
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

MinMaxScaler(copy=True, feature_range=(1, 5))


In [44]:
total_non_nan = np.count_nonzero(~np.isnan(y))

In [45]:
rmse = (sum(sum((common_df_pivot.values - y )**2))/total_non_nan)**0.5
print(rmse)

1.0002722787636782


In [20]:
# creating item_correlation for the entire data
df_pivot = df.pivot(
    index='reviews_username',
    columns='id',
    values='reviews_rating'
).fillna(0).T

mean = np.nanmean(df_pivot,axis=1)
df_subtracted = df_pivot-mean[:,None]
final_item_correlation = 1- pairwise_distances(df_subtracted,metric='cosine')

In [23]:
# dumping thr item-item correlation
with open('item_correlation.pkl','wb') as f:
    pickle.dump(final_item_correlation,f)

Since the user-user correlation is more sparse thana item-item correlation, i.e most of the users have not rated and have each user has rated only once it is better to choose item-item correlation matrix.