In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pandas_profiling 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_color_codes()
sns.set(style="whitegrid")
%matplotlib inline
from scipy.stats import zscore
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

import requests
from io import StringIO

#setting up for customized printing
from IPython.display import Markdown, display
from IPython.display import HTML
def printmd(string, color=None):
    """Render ``string`` as Markdown in the cell output, optionally coloured.

    Args:
        string: Markdown text to show (converted with ``str``).
        color: CSS colour name/value for the text. When omitted the text is
            rendered as-is instead of being wrapped in a ``color:None`` span,
            which is not valid CSS.
    """
    if color is None:
        rendered = str(string)
    else:
        rendered = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(rendered))
    
#function to display dataframes side by side    
from IPython.display import display_html
def display_side_by_side(args):
    """Show several DataFrames next to each other in one cell output.

    Args:
        args: Iterable of DataFrames; each one is rendered with ``to_html()``.
    """
    # Only the opening tag gets the inline style. A blanket replace of the
    # word 'table' would also turn every closing '</table>' into invalid
    # markup and rewrite any other occurrence of 'table' in the markup.
    inline_open_tag = '<table style="display:inline;margin-left:50px !important;margin-right: 40px !important"'
    html_str = ''.join(df.to_html() for df in args)
    display_html(html_str.replace('<table', inline_open_tag), raw=True)

In [None]:
# Load the items table and the reviews table, then join them on the product id.
electronics_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/20191226-items.csv')
reviews = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/20191226-reviews.csv')
# Drop by keyword: the positional form `drop(label, 1)` is rejected by pandas >= 2.0.
reviews = reviews.drop(columns=['rating', 'title'])
electronics_data = pd.merge(electronics_data, reviews, on='asin')
# NOTE(review): the reviews' own `rating` column is dropped before the merge, so the
# `rating` kept here comes from the items file and is identical for every review of a
# product (visible in the output below). Confirm this is intended rather than the
# per-review rating.
electronics_data = electronics_data[['name', 'asin', 'rating']]
electronics_data

Unnamed: 0,name,asin,rating
0,Janet,B0000SX2UC,3.0
1,Luke Wyatt,B0000SX2UC,3.0
2,Brooke,B0000SX2UC,3.0
3,amy m. teague,B0000SX2UC,3.0
4,tristazbimmer,B0000SX2UC,3.0
...,...,...,...
67981,jande,B081H6STQQ,4.5
67982,2cool4u,B081H6STQQ,4.5
67983,simon,B081H6STQQ,4.5
67984,Tobiasz Jedrysiak,B081TJFVCJ,5.0


In [None]:
# Number of rating rows per reviewer name, as a one-column frame indexed by name.
electronics_groupby_users_Ratings = (
    electronics_data
    .groupby('name')['rating']
    .count()
    .to_frame()
)

In [None]:
# Keep only reviewer names that have at least 50 rating rows.
user_list_min50_ratings = electronics_groupby_users_Ratings.query('rating >= 50').index
electronics_data = electronics_data.loc[
    electronics_data['name'].isin(user_list_min50_ratings)
]
electronics_data

Unnamed: 0,name,asin,rating
9,Amazon Customer,B0000SX2UC,3.0
27,Amazon Customer,B000SKTZ0S,2.7
45,Amazon Customer,B001AO4OUC,3.3
64,Kindle Customer,B001DCJAJG,3.1
109,Amazon Customer,B002AS9WEA,2.9
...,...,...,...
67910,David,B07ZHPCJW3,3.2
67921,Amazon Customer,B07ZPKZSSC,1.0
67941,Amazon Customer,B081H6STQQ,4.5
67946,Amazon Customer,B081H6STQQ,4.5


In [None]:
# Report the size of the filtered frame.
row_count, column_count = electronics_data.shape
print('The total number of rows :', row_count)
print('The total number of columns :', column_count)

The total number of rows : 7503
The total number of columns : 3


In [None]:
# Summarise the index range, column dtypes and non-null counts of the filtered frame.
electronics_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7503 entries, 9 to 67951
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    7503 non-null   object 
 1   asin    7503 non-null   object 
 2   rating  7503 non-null   float64
dtypes: float64(1), object(2)
memory usage: 234.5+ KB


In [None]:
# Count missing values per column (ascending) before stating the conclusion.
missing_per_column = electronics_data.isna().sum().sort_values()
display(missing_per_column)
print('===================')
printmd(
    '**CONCLUSION**: As seen from the data above, we conclude there are **"No Missing"** values in the data',
    color="blue",
)

name      0
asin      0
rating    0
dtype: int64



<span style='color:blue'>**CONCLUSION**: As seen from the data above, we conclude there are **"No Missing"** values in the data</span>

In [None]:
# Descriptive statistics of the rating column, one row per column for readability.
rating_summary = electronics_data[['rating']].describe().T
display(rating_summary)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating,7503.0,3.731721,0.42714,1.0,3.5,3.8,4.0,5.0


In [None]:
# Distinct reviewer names and distinct product ids left after filtering.
unique_counts = electronics_data[['name', 'asin']].nunique()
print('Total unique users in the dataset', unique_counts['name'])
print('Total unique products in the dataset', unique_counts['asin'])

Total unique users in the dataset 11
Total unique products in the dataset 546


In [None]:
!pip install surprise



In [None]:
from surprise import accuracy
from surprise.model_selection.validation import cross_validate
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise import SVD
from surprise import KNNBasic
from surprise import KNNWithMeans
# Ratings here lie between 1 and 5 (see the describe() output), which is Reader's default scale.
reader = Reader(rating_scale=(1, 5))
# load_from_df expects exactly three columns ordered as user, item, rating.
surprise_data = Dataset.load_from_df(electronics_data[['name', 'asin', 'rating']], reader)





In [None]:
# NOTE: this import rebinds `train_test_split`, replacing the scikit-learn function of the
# same name imported in the first cell; from here on the name refers to Surprise's splitter.
from surprise.model_selection import train_test_split
# Hold out 30% of the ratings for testing; random_state is fixed so the split is repeatable.
trainset, testset = train_test_split(surprise_data, test_size=.3, random_state=10)

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the ``n`` highest-estimated distinct items for every user.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: Maximum number of items to keep per user.

    Returns:
        defaultdict(list) mapping each uid to a list of ``(iid, est)`` tuples
        sorted by ``est`` in descending order, at most ``n`` long.
    """
    # A (user, item) pair can occur several times in the predictions; keep one
    # entry per item (its highest estimate) so the same product cannot occupy
    # several slots of a user's top-n list.
    best_est = defaultdict(dict)
    for uid, iid, _true_r, est, _details in predictions:
        per_user = best_est[uid]
        if iid not in per_user or est > per_user[iid]:
            per_user[iid] = est

    top_n = defaultdict(list)
    for uid, item_scores in best_est.items():
        # sorted() is stable, so tied items keep their first-seen order.
        ranked = sorted(item_scores.items(), key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n

In [None]:
class collab_filtering_based_recommender_model():
    """Fit, evaluate and query one collaborative-filtering algorithm.

    The recommendations are built from the predictions made on ``testset``,
    so they only cover (user, item) pairs that were held out of training.

    Attributes set by ``fit_and_predict``:
        pred_test: predictions returned by ``model.test(testset)``.
        top_n: mapping user id -> list of ``(productId, estimate)`` pairs.
        recommenddf: DataFrame with columns ``userId``, ``productId``, ``Rating``.
    """

    def __init__(self, model, trainset, testset, data):
        """Store the algorithm, the train/test split and the full dataset."""
        self.model = model
        self.trainset = trainset
        self.testset = testset
        self.data = data
        self.pred_test = None
        self.recommendations = None
        self.top_n = None
        self.recommenddf = None

    def fit_and_predict(self):
        """Fit on the train set, predict the test set and cache the top items.

        Returns:
            RMSE of the test-set predictions, rounded to 3 decimals.
        """
        printmd('**Fitting the train data...**', color='brown')
        self.model.fit(self.trainset)

        printmd('**Predicting the test data...**', color='brown')
        self.pred_test = self.model.test(self.testset)
        rmse = round(accuracy.rmse(self.pred_test), 3)
        printmd('**RMSE for the predicted result is ' + str(rmse) + '**', color='brown')

        self.top_n = get_top_n(self.pred_test)
        columns = ['userId', 'productId', 'Rating']
        # Build one small frame per user and concatenate once, instead of
        # growing a DataFrame inside the loop. Each user's rows keep their own
        # 0-based index, as before.
        per_user_frames = [
            pd.DataFrame(ranked, columns=['productId', 'Rating']).assign(userId=uid)[columns]
            for uid, ranked in self.top_n.items()
        ]
        if per_user_frames:
            self.recommenddf = pd.concat(per_user_frames, axis=0)
        else:
            # pd.concat rejects an empty list; keep the expected columns instead.
            self.recommenddf = pd.DataFrame(columns=columns)
        return rmse

    def cross_validate(self):
        """Cross-validate the algorithm on the full dataset.

        Returns:
            Mean of the per-fold test RMSE values, rounded to 3 decimals.
        """
        printmd('**Cross Validating the data...**', color='brown')
        # Inside the method body this name resolves to the module-level
        # `cross_validate` function, not to this method.
        cv_result = cross_validate(self.model, self.data, n_jobs=-1)
        cv_result = round(cv_result['test_rmse'].mean(), 3)
        printmd('**Mean CV RMSE is ' + str(cv_result) + '**', color='brown')
        return cv_result

    def recommend(self, user_id, n=5):
        """Display and return up to ``n`` cached recommendations for a user.

        At most as many rows as ``fit_and_predict`` cached per user are
        available, regardless of ``n``.

        Args:
            user_id: Identifier to look up in the ``userId`` column.
            n: Number of rows to return.

        Returns:
            DataFrame slice with columns ``userId``, ``productId``, ``Rating``.

        Raises:
            RuntimeError: if ``fit_and_predict`` has not been called yet.
        """
        if self.recommenddf is None:
            raise RuntimeError('Call fit_and_predict() before recommend().')
        # str() lets non-string user ids be used without a TypeError.
        printmd('**Recommending top ' + str(n) + ' products for userid : ' + str(user_id) + ' ...**', color='brown')

        df = self.recommenddf[self.recommenddf['userId'] == user_id].head(n)
        display(df)
        return df

In [None]:
from surprise.model_selection import RandomizedSearchCV

def find_best_model(model, parameters, data, n_iter=10, random_state=None):
    """Run a randomized hyper-parameter search and print the best RMSE result.

    Args:
        model: Algorithm class to tune (e.g. ``SVD``).
        parameters: Dict of parameter name -> candidate values to sample from.
        data: Dataset the search cross-validates on.
        n_iter: Number of parameter settings to sample.
        random_state: Seed for the sampling; pass an int to make the search
            reproducible across runs. ``None`` keeps the unseeded behaviour.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    clf = RandomizedSearchCV(
        model,
        parameters,
        n_iter=n_iter,
        measures=['rmse'],
        n_jobs=-1,
        random_state=random_state,
    )
    clf.fit(data)
    print(clf.best_score)
    print(clf.best_params)
    print(clf.best_estimator)
    return clf

In [None]:
# Similarity settings to sample from for the user-based KNNWithMeans search.
sim_options = dict(
    name=["msd", "cosine", "pearson", "pearson_baseline"],
    min_support=[3, 4, 5],
    user_based=[True],
)
# NOTE(review): the dataset summary reports only 11 distinct users, so every k in
# 30..49 exceeds the number of possible neighbours - confirm this range is intended.
params = dict(k=range(30, 50), sim_options=sim_options)
clf = find_best_model(KNNWithMeans, params, surprise_data)

{'rmse': 0.11501978411099664}
{'rmse': {'k': 41, 'sim_options': {'name': 'pearson_baseline', 'min_support': 4, 'user_based': True}}}
{'rmse': <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f1307ba4810>}


In [None]:
# Take the KNNWithMeans instance the search reported as best for RMSE.
# `clf` is re-bound by the SVD search further down, so this cell must run before that one.
knnwithmeans = clf.best_estimator['rmse']
# Wrap it together with the train/test split and the full dataset.
col_fil_knnwithmeans = collab_filtering_based_recommender_model(knnwithmeans, trainset, testset, surprise_data)

In [None]:
# Fit the tuned KNNWithMeans on trainset, score it on testset and keep the rounded RMSE.
knnwithmeans_rmse = col_fil_knnwithmeans.fit_and_predict()

<span style='color:brown'>**Fitting the train data...**</span>

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<span style='color:brown'>**Predicting the test data...**</span>

RMSE: 0.0863


<span style='color:brown'>**RMSE for the predicted result is 0.086**</span>

In [None]:
# Cross-validate the same algorithm on the full dataset and keep the mean test RMSE.
knnwithmeans_cv_rmse = col_fil_knnwithmeans.cross_validate()

<span style='color:brown'>**Cross Validating the data...**</span>

<span style='color:brown'>**Mean CV RMSE is 0.112**</span>

In [None]:
# Top-5 cached recommendations from the KNN model for three of the retained users.
result_knn_user1, result_knn_user2, result_knn_user3 = (
    col_fil_knnwithmeans.recommend(user_id=reviewer, n=5)
    for reviewer in ('Amazon Customer', 'Kindle Customer', 'David')
)

<span style='color:brown'>**Recommending top 5 products for userid : Amazon Customer ...**</span>

Unnamed: 0,userId,productId,Rating
0,Amazon Customer,B07R5ZYR77,4.7
1,Amazon Customer,B07R5ZYR77,4.7
2,Amazon Customer,B07V5NSD8N,4.7
3,Amazon Customer,B07K1M36CM,4.6
4,Amazon Customer,B07YQ58NPF,4.6


<span style='color:brown'>**Recommending top 5 products for userid : Kindle Customer ...**</span>

Unnamed: 0,userId,productId,Rating
0,Kindle Customer,B07YQ58NPF,4.566319
1,Kindle Customer,B07Q26V49K,4.46701
2,Kindle Customer,B07Q26V49K,4.46701
3,Kindle Customer,B07K3X5JTP,4.466319
4,Kindle Customer,B07C6FCC8G,4.366266


<span style='color:brown'>**Recommending top 5 products for userid : David ...**</span>

Unnamed: 0,userId,productId,Rating
0,David,B07JNKYBZ4,4.530803
1,David,B07QCXPP71,4.527634
2,David,B07VZL3L5V,4.428592
3,David,B07NZX5BKH,4.326601
4,David,B07455VT9F,4.227634


In [None]:
# Candidate SVD hyper-parameters: epochs, learning rate and regularisation strength.
params = dict(
    n_epochs=[5, 10, 15, 20],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)
clf = find_best_model(SVD, params, surprise_data)

{'rmse': 0.21647588165489928}
{'rmse': {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}}
{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f130a3e1890>}


In [None]:
# Take the SVD instance the search reported as best for RMSE
# (`clf` now refers to the SVD search, not the earlier KNN one).
svd = clf.best_estimator['rmse']
# Reuse the same train/test split as the KNN model so the two RMSE values are comparable.
col_fil_svd = collab_filtering_based_recommender_model(svd, trainset, testset, surprise_data)

In [None]:
# Fit the tuned SVD on trainset, score it on testset and keep the rounded RMSE.
svd_rmse = col_fil_svd.fit_and_predict()

<span style='color:brown'>**Fitting the train data...**</span>

<span style='color:brown'>**Predicting the test data...**</span>

RMSE: 0.2135


<span style='color:brown'>**RMSE for the predicted result is 0.214**</span>

In [None]:
# Cross-validate the SVD algorithm on the full dataset and keep the mean test RMSE.
svd_cv_rmse = col_fil_svd.cross_validate()

<span style='color:brown'>**Cross Validating the data...**</span>

<span style='color:brown'>**Mean CV RMSE is 0.217**</span>

In [None]:
# Top-5 cached recommendations from the SVD model for the same three users.
result_svd_user1, result_svd_user2, result_svd_user3 = (
    col_fil_svd.recommend(user_id=reviewer, n=5)
    for reviewer in ('Amazon Customer', 'Kindle Customer', 'David')
)

<span style='color:brown'>**Recommending top 5 products for userid : Amazon Customer ...**</span>

Unnamed: 0,userId,productId,Rating
0,Amazon Customer,B07Q26V49K,4.252284
1,Amazon Customer,B07Q26V49K,4.252284
2,Amazon Customer,B07Q26V49K,4.252284
3,Amazon Customer,B07Q26V49K,4.252284
4,Amazon Customer,B07Q26V49K,4.252284


<span style='color:brown'>**Recommending top 5 products for userid : Kindle Customer ...**</span>

Unnamed: 0,userId,productId,Rating
0,Kindle Customer,B07Q26V49K,4.232537
1,Kindle Customer,B07Q26V49K,4.232537
2,Kindle Customer,B07C6FCC8G,4.182807
3,Kindle Customer,B07RWFC6NY,4.148102
4,Kindle Customer,B07V5KS95Y,4.070741


<span style='color:brown'>**Recommending top 5 products for userid : David ...**</span>

Unnamed: 0,userId,productId,Rating
0,David,B07VZL3L5V,4.199521
1,David,B07NZX5BKH,3.989627
2,David,B07JNKYBZ4,3.950359
3,David,B07QCXPP71,3.949593
4,David,B07QK32C8M,3.943245
