# Importing libraries and data set

In [146]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [147]:
colnames = ['User_Id', 'Prod_Id', 'Rating', 'Time_Stamp']

df = pd.read_csv("ratings_Electronics.csv", names= colnames)

df.head()

Unnamed: 0,User_Id,Prod_Id,Rating,Time_Stamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [148]:
df.tail()

Unnamed: 0,User_Id,Prod_Id,Rating,Time_Stamp
7824477,A2YZI3C9MOHC0L,BT008UKTMW,5.0,1396569600
7824478,A322MDK0M89RHN,BT008UKTMW,5.0,1313366400
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800
7824480,A10M2KEFPEQDHN,BT008UKTMW,4.0,1297555200
7824481,A2G81TMIOIDEQQ,BT008V9J9U,5.0,1312675200


In [149]:
df.shape

(7824482, 4)

In [150]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
User_Id       object
Prod_Id       object
Rating        float64
Time_Stamp    int64
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


In [151]:
df.isnull().sum().sum()

0

# Creating a subset of the actual data set 

In [155]:
#Dropping the users who have given less than 50 ratings
df1 = df[df['User_Id'].isin(df['User_Id'].value_counts()[df['User_Id'].value_counts()>50].index)]

In [156]:
df1.shape 

(122171, 4)

In [158]:
#Converting User Id and Product Id columns into categorical for easier calculations

for col_name in df1.columns:
    if(df1[col_name].dtype == 'object'):
        df1[col_name]= df1[col_name].astype('category')
        df1[col_name] = df1[col_name].cat.codes

df1.head()

Unnamed: 0,User_Id,Prod_Id,Rating,Time_Stamp
118,1390,0,3.0,1377907200
177,806,1,1.0,1319673600
178,79,1,4.0,1281744000
492,902,2,5.0,1373587200
631,1089,2,5.0,1353456000


In [159]:
#Dropping Time Stamp column

df1 = df1.drop('Time_Stamp', axis = 1)

# Building a Popularity model

In [182]:
df1.groupby('Prod_Id')['Rating'].mean().head()  

Prod_Id
0    3.0
1    2.5
2    5.0
3    3.0
4    5.0
Name: Rating, dtype: float64

In [183]:
df1.groupby('Prod_Id')['Rating'].mean().sort_values(ascending=False).head()  

Prod_Id
47154    5.0
9294     5.0
9306     5.0
28351    5.0
28352    5.0
Name: Rating, dtype: float64

In [184]:
df1.groupby('Prod_Id')['Rating'].count().sort_values(ascending=False).head()  

Prod_Id
38146    204
24262    177
10820    163
37407    156
37765    146
Name: Rating, dtype: int64

In [188]:
ratings_mean_count = pd.DataFrame(df1.groupby('Prod_Id')['Rating'].mean()) 

In [189]:
ratings_mean_count['rating_counts'] = pd.DataFrame(df1.groupby('Prod_Id')['Rating'].count())  

In [214]:
#Showing 5 most rated products(they are not highly rated but they are rated most number of times and have not full but very high ratings)
ratings_mean_count.sort_values(['rating_counts'], ascending = False).head()

Unnamed: 0_level_0,Rating,rating_counts
Prod_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
38146,4.215686,204
24262,4.858757,177
10820,4.773006,163
37407,4.692308,156
37765,4.431507,146


# Building a Collaborative Filtering model using SVD

In [160]:
from collections import defaultdict
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

In [161]:
reader = Reader(rating_scale = (1, 5))

In [162]:
#Creating data set using above "train set which is 70% of original data
data = Dataset.load_from_df(df1, reader)

In [163]:
#Splitting the data
trainset, testset = train_test_split(data, test_size = 0.3, random_state = 123)

In [164]:
algo = SVD(n_factors = 50)

In [165]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x175c4abb9e8>

In [166]:
predictions = algo.test(testset)

In [167]:
p = pd.DataFrame(predictions)

In [168]:
p.shape

(36652, 5)

In [169]:
p

Unnamed: 0,uid,iid,r_ui,est,details
0,625,20804,5.0,4.739962,{'was_impossible': False}
1,605,26949,5.0,4.877509,{'was_impossible': False}
2,1342,5211,5.0,4.645265,{'was_impossible': False}
3,367,37888,4.0,4.282892,{'was_impossible': False}
4,378,40116,4.0,4.875615,{'was_impossible': False}
5,1302,44216,4.0,4.598920,{'was_impossible': False}
6,1357,34277,4.0,4.417519,{'was_impossible': False}
7,432,2410,1.0,4.472193,{'was_impossible': False}
8,374,10509,5.0,3.858597,{'was_impossible': False}
9,162,28799,3.0,4.331866,{'was_impossible': False}


In [170]:
accuracy.rmse(predictions)

RMSE: 0.9832


0.9831804028583833

In [171]:
p.sort_values(['est'], ascending = False)

Unnamed: 0,uid,iid,r_ui,est,details
21689,271,34383,5.0,5.000000,{'was_impossible': False}
6570,1296,42630,5.0,5.000000,{'was_impossible': False}
12734,869,27768,4.0,5.000000,{'was_impossible': False}
483,1019,37407,5.0,5.000000,{'was_impossible': False}
9143,150,18458,5.0,5.000000,{'was_impossible': False}
5964,906,30375,5.0,5.000000,{'was_impossible': False}
3308,795,43315,5.0,5.000000,{'was_impossible': False}
5156,570,34519,5.0,5.000000,{'was_impossible': False}
33373,1333,42631,5.0,5.000000,{'was_impossible': False}
17287,262,4170,5.0,5.000000,{'was_impossible': False}


In [177]:
#Function to find top 5 recommendation for anyone of the user
def get_top_n(predictions, n=5):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [178]:
top_n = get_top_n(predictions, n=5)

In [179]:
#Getting top ten recommendations for user '558'
top = pd.DataFrame(top_n.get(558))

In [180]:
top.columns = ['Prod_Id', 'Ratings'] 

In [181]:
#Showing the name of the top ten books for user '558'
top.head(10)

Unnamed: 0,Prod_Id,Ratings
0,24262,5.0
1,43790,4.999813
2,22989,4.877099
3,34503,4.625899
4,44998,4.599102
