# Book Recommendations: Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import os
import re

import plotly
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

import matplotlib.pyplot as plt
import seaborn as sns

import langid

from surprise import Reader
from surprise import Dataset
from surprise.model_selection.validation import cross_validate
from surprise.model_selection.search import GridSearchCV
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import SVD
from surprise import CoClustering

from tabulate import tabulate

## Ratings Data

In [2]:
def read_folder(csv_folder):  
    """Function to read csv files in a folder and concact all to one dataframe."""
    files = os.listdir(csv_folder)
    df = []
    for f in files:
        print(f)
        csv_file = csv_folder + "/" + f
        df.append(pd.read_csv(csv_file))
    df_full = pd.concat(df, ignore_index=True, sort=False)
    return df_full

In [3]:
ratings = read_folder('data/user_ratings')

user_rating_0_to_1000.csv
user_rating_1000_to_2000.csv
user_rating_2000_to_3000.csv
user_rating_3000_to_4000.csv
user_rating_4000_to_5000.csv
user_rating_5000_to_6000.csv
user_rating_6000_to_11000.csv


In [4]:
ratings.head()

Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing
2,1,Siddhartha,it was amazing
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it
4,1,"Ready Player One (Ready Player One, #1)",really liked it


### Prep Data

#### Rename Columns

In [5]:
ratings.rename(columns={"ID": "user_id", "Name": "title", "Rating": "rating"}, inplace=True)

In [6]:
ratings.head(2)

Unnamed: 0,user_id,title,rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing


In [7]:
ratings.shape

(362596, 3)

#### Missing Values

In [8]:
ratings.isnull().sum()

user_id    0
title      0
rating     0
dtype: int64

#### Duplicated Values

In [9]:
ratings.duplicated().sum()

435

In [10]:
ratings.drop_duplicates(inplace=True)

In [11]:
ratings.duplicated().sum()

0

#### Unique Users

In [12]:
unique_users = ratings.user_id.unique()
print('There are {} unique users.'.format(len(unique_users)))

There are 8919 unique users.


#### English Titles

In [13]:
# english titles
ratings.loc[:,'title_lang'] = ratings.loc[:,'title'].apply(lambda x: str(langid.classify(x)))

In [14]:
ratings = ratings[ratings['title_lang'].str.contains('en')]

In [15]:
ratings.drop(columns=['title_lang'], inplace=True)

In [16]:
ratings.shape

(292748, 3)

### Rating Prep & EDA

In [17]:
ratings.rating.unique()

array(['it was amazing', 'really liked it', 'liked it', 'did not like it',
       'it was ok', "This user doesn't have any rating"], dtype=object)

In [18]:
no_rating = ratings.loc[(ratings['rating'] == "This user doesn't have any rating")]

len(no_rating)

4765

In [19]:
no_rating.head()

Unnamed: 0,user_id,title,rating
1566,10,Rating,This user doesn't have any rating
1604,13,Rating,This user doesn't have any rating
2269,22,Rating,This user doesn't have any rating
2270,23,Rating,This user doesn't have any rating
2434,27,Rating,This user doesn't have any rating


In [20]:
# drop users that don't have any rating
ratings = ratings[ratings.rating != "This user doesn't have any rating"]

In [21]:
unique_users = ratings.user_id.unique()
print('There are {} unique users that have ratings.'.format(len(unique_users)))

There are 3998 unique users that have ratings.


In [22]:
# replace ratings with number value
ratings_dict = {'it was amazing': 5, 'really liked it': 4, 'liked it': 3, 
                'it was ok':2, 'did not like it': 1}

ratings['rating'] = ratings['rating'].replace(ratings_dict)

In [23]:
ratings.head()

Unnamed: 0,user_id,title,rating
0,1,Agile Web Development with Rails: A Pragmatic ...,5
1,1,The Restaurant at the End of the Universe (Hit...,5
2,1,Siddhartha,5
3,1,The Clock of the Long Now: Time and Responsibi...,4
4,1,"Ready Player One (Ready Player One, #1)",4


#### Descriptive Statistics

In [24]:
ratings.describe()

Unnamed: 0,user_id,rating
count,287983.0,287983.0
mean,4871.090044,3.794557
std,3245.515102,0.975834
min,1.0,1.0
25%,1974.0,3.0
50%,4534.0,4.0
75%,7717.0,5.0
max,10997.0,5.0


In [25]:
ratings['rating'].mode()

0    4
dtype: int64

#### Graphs

In [73]:
data = ratings['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / ratings.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution of Book Ratings'.format(ratings.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [27]:
# Number of ratings per book
data = ratings.groupby('title')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Book (Clipped at 50)',
                   xaxis = dict(title = 'Number of Ratings Per Book'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [28]:
# Number of ratings per user
data = ratings.groupby('user_id')['rating'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per User (Clipped at 50)',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

#### Most ratings per given user

In [29]:
ratings.groupby('user_id')['rating'].count().reset_index().sort_values('rating', ascending=False)[:10]

Unnamed: 0,user_id,rating
1705,4196,3260
1960,4806,2879
155,284,1803
577,1134,1754
2694,7452,1508
1319,3259,1503
3789,10378,1425
1464,3625,1244
1147,2828,1225
1931,4746,1185


Select books with a minimum of 50 ratings, and select users with a minimum of 50 ratings.

In [30]:
one_book_rating = 1
one_filter_books = ratings['title'].value_counts() == one_book_rating
one_filter_books.sum()

53483

In [31]:
min_book_ratings = 50
filter_books = ratings['title'].value_counts() > min_book_ratings
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings = 50
filter_users = ratings['user_id'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

ratings2 = ratings[(ratings['title'].isin(filter_books)) & (ratings['user_id'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(ratings.shape))
print('The new data frame shape:\t{}'.format(ratings2.shape))

The original data frame shape:	(287983, 3)
The new data frame shape:	(71881, 3)


In [32]:
unique_users = ratings2.user_id.unique()
print('There are {} unique users.'.format(len(unique_users)))

There are 1211 unique users.


## Personal Books

In [33]:
my_books = pd.read_csv('data/goodreads_library_export.csv')

In [34]:
my_books.head()

Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,...,Private Notes,Read Count,Recommended For,Recommended By,Owned Copies,Original Purchase Date,Original Purchase Location,Condition,Condition Description,BCID
0,1845,Into the Wild,Jon Krakauer,"Krakauer, Jon",,0385486804,9780385000000.0,4,3.99,Anchor Books,...,,1,,,0,,,,,
1,77767,"Little House on the Prairie (Little House, #3)",Laura Ingalls Wilder,"Wilder, Laura Ingalls",Garth Williams,,,5,4.19,HarperTrophy,...,,1,,,0,,,,,
2,252577,"Angela's Ashes (Frank McCourt, #1)",Frank McCourt,"McCourt, Frank",,0007205236,9780007000000.0,5,4.12,Harper Perennial,...,,1,,,0,,,,,
3,140225,The Voyage of the Dawn Treader (Chronicles of ...,C.S. Lewis,"Lewis, C.S.",Pauline Baynes,006112527X,9780061000000.0,3,4.08,HarperCollins,...,,1,,,0,,,,,
4,100915,"The Lion, the Witch and the Wardrobe (Chronicl...",C.S. Lewis,"Lewis, C.S.",,,,5,4.22,HarperCollins Publishers,...,,1,,,0,,,,,


In [35]:
my_books.columns

Index(['Book Id', 'Title', 'Author', 'Author l-f', 'Additional Authors',
       'ISBN', 'ISBN13', 'My Rating', 'Average Rating', 'Publisher', 'Binding',
       'Number of Pages', 'Year Published', 'Original Publication Year',
       'Date Read', 'Date Added', 'Bookshelves', 'Bookshelves with positions',
       'Exclusive Shelf', 'My Review', 'Spoiler', 'Private Notes',
       'Read Count', 'Recommended For', 'Recommended By', 'Owned Copies',
       'Original Purchase Date', 'Original Purchase Location', 'Condition',
       'Condition Description', 'BCID'],
      dtype='object')

In [36]:
# remove books that I have not read yet, currently reading, or did not rate
my_books = my_books[my_books['My Rating'] != 0]

In [37]:
my_books = my_books[['Title', 'My Rating']]

In [38]:
my_books.rename(columns={'Title': 'title', 'My Rating': 'rating'}, inplace=True)

In [39]:
my_books.duplicated().sum()

1

In [40]:
my_books.drop_duplicates(inplace=True)

In [41]:
my_books.shape

(214, 2)

In [42]:
data = my_books['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / my_books.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of My Book Ratings'.format(my_books.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

### Merge with Ratings

In [43]:
ratings.user_id.max()

10997

In [44]:
# create user ID
my_books['user_id'] = 11000

In [45]:
my_books.head()

Unnamed: 0,title,rating,user_id
0,Into the Wild,4,11000
1,"Little House on the Prairie (Little House, #3)",5,11000
2,"Angela's Ashes (Frank McCourt, #1)",5,11000
3,The Voyage of the Dawn Treader (Chronicles of ...,3,11000
4,"The Lion, the Witch and the Wardrobe (Chronicl...",5,11000


In [46]:
ratings = ratings.append(my_books)

In [47]:
ratings.tail()

Unnamed: 0,user_id,title,rating
276,11000,"Something Borrowed (Darcy & Rachel, #1)",5
277,11000,"The Devil Wears Prada (The Devil Wears Prada, #1)",3
278,11000,"Confessions of a Shopaholic (Shopaholic, #1)",5
279,11000,The Undomestic Goddess,5
280,11000,Remember Me?,5


In [48]:
ratings2 = ratings2.append(my_books)

In [49]:
ratings2.tail()

Unnamed: 0,user_id,title,rating
276,11000,"Something Borrowed (Darcy & Rachel, #1)",5
277,11000,"The Devil Wears Prada (The Devil Wears Prada, #1)",3
278,11000,"Confessions of a Shopaholic (Shopaholic, #1)",5
279,11000,The Undomestic Goddess,5
280,11000,Remember Me?,5


## Recommeder System

In [50]:
reader = Reader(rating_scale=(1,5))

data = Dataset.load_from_df(ratings2[['user_id', 'title', 'rating']], reader)

### Cross Validate Each Alogrithim

#### BaselineOnly

In [51]:
algo_BaselienOnly = BaselineOnly()

results_BaselineOnly = cross_validate(algo_BaselienOnly, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

BaselineOnly_rmse = np.mean(results_BaselineOnly ['test_rmse'])
BaselineOnly_mae = np.mean(results_BaselineOnly ['test_mae'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8774  0.8866  0.8738  0.8712  0.8788  0.8776  0.0053  
MAE (testset)     0.6958  0.7034  0.6952  0.6920  0.6980  0.6969  0.0038  
Fit time          0.07    0.09    0.08    0.08    0.08    0.08    0.01    
Test time         0.04    0.08    0.04    0.04    0.08    0.06    0.02    


#### SVD

In [52]:
algo_svd = SVD(n_epochs=50)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

svd_rmse = np.mean(results_svd['test_rmse'])
svd_mae = np.mean(results_svd['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9214  0.9115  0.9156  0.9036  0.9057  0.9116  0.0065  
MAE (testset)     0.7260  0.7191  0.7221  0.7116  0.7132  0.7184  0.0054  
Fit time          6.19    6.16    6.15    6.15    6.18    6.17    0.02    
Test time         0.06    0.06    0.11    0.06    0.07    0.07    0.02    


In [53]:
algo_svd = SVD(n_epochs=10)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

svd_rmse = np.mean(results_svd['test_rmse'])
svd_mae = np.mean(results_svd['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8860  0.8826  0.8860  0.8798  0.8760  0.8821  0.0038  
MAE (testset)     0.7043  0.7021  0.7034  0.6979  0.6935  0.7002  0.0040  
Fit time          1.25    1.25    1.26    1.25    1.26    1.25    0.00    
Test time         0.06    0.11    0.06    0.06    0.11    0.08    0.02    


In [54]:
algo_svd = SVD(n_epochs=10, n_factors=50)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

svd_rmse = np.mean(results_svd['test_rmse'])
svd_mae = np.mean(results_svd['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8821  0.8847  0.8761  0.8766  0.8803  0.8799  0.0033  
MAE (testset)     0.7005  0.7030  0.6977  0.6951  0.6966  0.6986  0.0028  
Fit time          0.77    0.81    0.80    0.78    0.77    0.79    0.02    
Test time         0.07    0.06    0.11    0.07    0.07    0.08    0.02    


#### KNN Basic

In [55]:
sim_options = {'name': 'cosine', 
              'user_based': True}

algo_KNNBasic = KNNBasic(n_epochs=10, sim_options=sim_options)

results_KNNBasic = cross_validate(algo_KNNBasic, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

KNNBasic_rmse = np.mean(results_KNNBasic['test_rmse'])
KNNBasic_mae = np.mean(results_KNNBasic['test_mae'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9267  0.9368  0.9234  0.9280  0.9415  0.9313  0.0068  
MAE (testset)     0.7324  0.7409  0.7296  0.7348  0.7464  0.7368  0.0061  
Fit time          1.10    1.12    1.11    1.11    1.11    1.11    0.01    
Test time         1.42    1.52    1.47    1.46    1.46    1.47    0.03    


#### KNN With Means

In [56]:
sim_options = {'name': 'cosine', 
              'user_based': True}

algo_KNNMeans = KNNWithMeans(n_epochs=10, sim_options=sim_options)

results_KNNMeans = cross_validate(algo_KNNMeans, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

KNNMeans_rmse = np.mean(results_KNNMeans['test_rmse'])
KNNMeans_mae = np.mean(results_KNNMeans['test_mae'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8793  0.8852  0.8794  0.8760  0.8874  0.8815  0.0042  
MAE (testset)     0.6930  0.7004  0.6971  0.6942  0.7022  0.6974  0.0035  
Fit time          1.10    1.14    1.14    1.13    1.12    1.13    0.01    
Test time         1.58    1.52    1.53    1.54    1.57    1.55    0.02    


#### KNN Basic

In [57]:
sim_options = {'name': 'cosine', 
              'user_based': True}

algo_KNNBasic = KNNBasic(n_epochs=10, sim_options=sim_options)

results_KNNBasic = cross_validate(algo_KNNBasic, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

KNNBasic_rmse = np.mean(results_KNNBasic['test_rmse'])
KNNBasic_mae = np.mean(results_KNNBasic['test_mae'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9357  0.9323  0.9330  0.9262  0.9316  0.9318  0.0031  
MAE (testset)     0.7423  0.7388  0.7362  0.7332  0.7361  0.7373  0.0031  
Fit time          1.10    1.15    1.13    1.14    1.10    1.12    0.02    
Test time         1.53    1.46    1.54    1.56    1.47    1.51    0.04    


#### Co-clustering

In [58]:
algo_CoCluster = SVD()

results_CoCluster = cross_validate(algo_CoCluster, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

CoCluster_rmse = np.mean(results_CoCluster['test_rmse'])
CoCluster_mae = np.mean(results_CoCluster['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8728  0.8824  0.8862  0.8731  0.8909  0.8811  0.0072  
MAE (testset)     0.6911  0.7011  0.7028  0.6938  0.7032  0.6984  0.0050  
Fit time          2.49    2.46    2.47    2.45    2.46    2.47    0.01    
Test time         0.06    0.06    0.06    0.07    0.07    0.06    0.00    


### Grid Search CV on SVD Algorithm

In [59]:
param_grid = {
    'n_factors':[10, 20, 40, 50],
    'n_epochs': [10, 20, 30], 
    'lr_all': [0.002, 0.005],
    'reg_all': [0.2, 0.4, 0.6]}

gs_svd = GridSearchCV(
    algo_class = SVD,
    param_grid = param_grid,
    joblib_verbose = 5)

gs_svd.fit(data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  7.6min finished


In [60]:
gs_svd.best_params

{'rmse': {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2},
 'mae': {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2}}

In [None]:
gs_svd.best_params

In [61]:
algo_svd = SVD(n_factors=50, n_epochs=30, lr_all=0.005, reg_all=0.2)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8810  0.8836  0.8700  0.8729  0.8782  0.8771  0.0050  
MAE (testset)     0.6976  0.6997  0.6928  0.6916  0.6996  0.6963  0.0034  
Fit time          2.33    2.33    2.31    2.32    2.32    2.32    0.01    
Test time         0.06    0.06    0.06    0.12    0.06    0.07    0.02    


In [62]:
algo_svd = SVD(n_factors=10, n_epochs=40, lr_all=0.005, reg_all=0.2)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

svd_rmse = np.mean(results_svd['test_rmse'])
svd_mae = np.mean(results_svd['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8713  0.8770  0.8811  0.8888  0.8680  0.8772  0.0073  
MAE (testset)     0.6915  0.6951  0.7015  0.7023  0.6904  0.6962  0.0049  
Fit time          1.62    1.60    1.61    1.58    1.60    1.60    0.01    
Test time         0.06    0.11    0.06    0.06    0.06    0.07    0.02    


In [63]:
#winner
algo_svd = SVD(n_factors=10, n_epochs=50, lr_all=0.005, reg_all=0.2)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

svd_rmse = np.mean(results_svd['test_rmse'])
svd_mae = np.mean(results_svd['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8796  0.8856  0.8745  0.8751  0.8699  0.8769  0.0053  
MAE (testset)     0.6999  0.7043  0.6920  0.6935  0.6905  0.6961  0.0053  
Fit time          2.00    2.00    1.96    1.98    2.00    1.99    0.02    
Test time         0.12    0.06    0.06    0.12    0.06    0.08    0.03    


In [64]:
algo_svd = SVD(n_factors=10, n_epochs=20, lr_all=0.005, reg_all=0.4)

results_svd = cross_validate(algo_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

svd_rmse = np.mean(results_svd['test_rmse'])
svd_mae = np.mean(results_svd['test_mae'])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8869  0.8755  0.8788  0.8801  0.8898  0.8822  0.0053  
MAE (testset)     0.7014  0.6951  0.7005  0.6960  0.7087  0.7004  0.0048  
Fit time          0.81    0.85    0.81    0.82    0.80    0.82    0.02    
Test time         0.06    0.11    0.06    0.06    0.06    0.07    0.02    


### User-Based Collaborative Filtering Recommeder System

In [65]:
# build model
reader = Reader(rating_scale=(1,5))

data = Dataset.load_from_df(ratings2[['user_id', 'title', 'rating']], reader)

algo_svd = SVD(n_factors=10, n_epochs=20, lr_all=0.005, reg_all=0.4)

trainset = data.build_full_trainset()

# fit data
algo_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2783d925348>

In [69]:
def recommend_books(user_id):
    """Function to recommend books with collaborative filtering using SVD.
    Args: user_id from rataings dataframe.
    Returns top 10 books recommended to the user."""
    
    try:
        print('Gathering recommendations...')
        print("")

        # get list of titles
        titles = ratings2['title'].unique()
        
        # filter out titles that user has read
        user_titles = ratings2.loc[ratings2['user_id']==user_id, 'title']
        titles_to_predict = np.setdiff1d(titles,user_titles)
        
        recs = []
        
        for iid in titles_to_predict:
            recs.append((iid, algo_svd.predict(user_id, iid).est))
        
        df = pd.DataFrame(recs, columns=['Title', 'Score']).sort_values('Score', ascending=False)
        df.reset_index(drop=True)
        df.insert(0, 'Rank', range(1, 1+len(df)))


        print('The following books are recommended for user {}:'.format(user_id))
        print("")
        df.head(10).to_csv('results/collab_book_recs_{}.csv'.format(user_id), index=False)
        print(tabulate(df.head(10), headers='keys', tablefmt='psql', showindex=False))
        
    except:
        print('Unable to gather recommendations for the user id. \n Please try entering a valid user id.')

In [70]:
recommend_books(11000)

Gathering recommendations...

The following books are recommended for user 11000:

+--------+-------------------------------------------------------------------------+---------+
|   Rank | Title                                                                   |   Score |
|--------+-------------------------------------------------------------------------+---------|
|      1 | The Divan                                                               | 4.97967 |
|      2 | Labyrinths: Selected Stories and Other Writings                         | 4.84535 |
|      3 | Between the World and Me                                                | 4.80137 |
|      4 | Maus II: A Survivor's Tale: And Here My Troubles Began (Maus, #2)       | 4.76017 |
|      5 | Becoming                                                                | 4.71775 |
|      6 | Maus I: A Survivor's Tale: My Father Bleeds History (Maus, #1)          | 4.71741 |
|      7 | Where the Wild Things Are                          

In [71]:
recommend_books(9356)

Gathering recommendations...

The following books are recommended for user 9356:

+--------+-------------------------------------------------------------------------+---------+
|   Rank | Title                                                                   |   Score |
|--------+-------------------------------------------------------------------------+---------|
|      1 | The Divan                                                               | 4.89667 |
|      2 | Labyrinths: Selected Stories and Other Writings                         | 4.76356 |
|      3 | Between the World and Me                                                | 4.71933 |
|      4 | Maus II: A Survivor's Tale: And Here My Troubles Began (Maus, #2)       | 4.67531 |
|      5 | Maus I: A Survivor's Tale: My Father Bleeds History (Maus, #1)          | 4.6368  |
|      6 | Becoming                                                                | 4.63563 |
|      7 | Where the Wild Things Are                           

In [72]:
recommend_books(25)

Gathering recommendations...

The following books are recommended for user 25:

+--------+-------------------------------------------------------------------------+---------+
|   Rank | Title                                                                   |   Score |
|--------+-------------------------------------------------------------------------+---------|
|      1 | The Divan                                                               | 4.57644 |
|      2 | Labyrinths: Selected Stories and Other Writings                         | 4.44024 |
|      3 | Between the World and Me                                                | 4.3956  |
|      4 | Maus II: A Survivor's Tale: And Here My Troubles Began (Maus, #2)       | 4.35425 |
|      5 | Becoming                                                                | 4.31149 |
|      6 | Maus I: A Survivor's Tale: My Father Bleeds History (Maus, #1)          | 4.31116 |
|      7 | Where the Wild Things Are                             