In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pylab as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


In [2]:
data = pd.read_csv('clean_book_recommender_explicit.csv')
data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Rating-Count
0,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,11400,9,197
1,151008116,Life of Pi,Yann Martel,2002,Harcourt,11400,6,90
2,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,11400,7,335
3,316789089,The Pilot's Wife : A Novel Tag: Author of the ...,Anita Shreve,1999,"Little, Brown",11400,7,60
4,743418174,Good in Bed,Jennifer Weiner,2002,Washington Square Press,11400,8,259


---
# Models
### A - Popularity Base 

This is a very basic popularity base recommender.

In [3]:
pop_df = pd.DataFrame(data.groupby('ISBN')['Book-Rating'].sum())
top_five = pop_df.sort_values('Book-Rating', ascending=False).head()
top_five

Unnamed: 0_level_0,Book-Rating
ISBN,Unnamed: 1_level_1
0316666343,1934
0385504209,1619
059035342X,1153
0312195516,1125
043935806X,1064


---
### B - Collaborative Filtering

In [4]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [5]:
data = pd.read_csv('clean_book_recommender.csv')
data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Rating-Count
0,786868716,The Five People You Meet in Heaven,Mitch Albom,2003,Hyperion,11400,9,197
1,151008116,Life of Pi,Yann Martel,2002,Harcourt,11400,6,90
2,671021001,She's Come Undone (Oprah's Book Club),Wally Lamb,1998,Pocket,11400,0,226
3,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,11400,7,335
4,446364193,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,11400,0,187


In [6]:
data.shape

(96543, 8)

In [7]:
# Drop all the duplicate rows
data = data.drop_duplicates(['User-ID', 'Book-Title'])
data.shape

(96085, 8)

In [8]:
data_pivot = data.pivot(index='Book-Title', columns='User-ID', values='Book-Rating').fillna(0)
data_matrix = csr_matrix(data_pivot.values)
data_pivot.head()

User-ID,243,254,507,638,643,741,882,929,1211,1424,...,277928,277965,278026,278137,278144,278188,278418,278582,278633,278843
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204 Rosewood Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
model = NearestNeighbors(algorithm='brute', metric='cosine')
model.fit(data_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [10]:
def get_five_recommendation_for(book_index):
    distances, indices = model.kneighbors(data_pivot.iloc[book_index,:].values.reshape(1, -1), n_neighbors = 6)
    for i in range(6):
        if i == 0:
            print(f'Recommendations for "{data_pivot.index[book_index]}" is:\n')
        else:
            print(f'{i}- "{data_pivot.index[indices.flatten()[i]]}" with the {np.round((distances.flatten()[i]) * 100)}%\n')

In [11]:
get_five_recommendation_for(book_index=747)

Recommendations for "The Green Mile" is:

1- "Bleachers" with the 81.0%

2- "It" with the 83.0%

3- "Firestarter" with the 84.0%

4- "Needful Things" with the 84.0%

5- "Dreamcatcher" with the 85.0%



It is possible to make it more personal and get the User-ID and do the recommendation more specific to the user. 

---

#### Using Surprise package

In [12]:
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

In [13]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(data[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [14]:
algorithms = [SVD(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]
benchmark = []

for algorithm in algorithms:
    validation = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(validation).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNWithMeans,3.480828,0.67507,4.281777
KNNBaseline,3.482582,0.672497,4.612777
KNNWithZScore,3.49611,0.703548,4.74988
SVD,3.548044,3.789639,0.2606
KNNBasic,3.658461,0.537164,3.949946
NormalPredictor,4.614661,0.123197,0.253197


Since KNNBaseline looks better, I go with this algorithm.

---

In [16]:
algorithm_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
knn = KNNBaseline(bsl_options=algorithm_options)
cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=False)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([3.44636979, 3.49319299, 3.48661599]),
 'fit_time': (0.633439302444458, 0.6458969116210938, 0.7680399417877197),
 'test_time': (4.502333879470825, 4.957219839096069, 4.502232074737549)}

In [17]:
trainset, testset = train_test_split(data, test_size=0.25)
predictions = knn.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.4731


3.473146121096231

In [18]:
def get_rated_items(user_id):
    '''
    Returns nr of items the given user have rated.
    '''
    try:
        return len(trainset.ur[trainset.to_inner_uid(user_id)])
    except ValueError:
        return 0

In [19]:
def get_rated_users(item_id):
    '''
    Returns nr of user who rated the given item
    '''
    try:
        return len(trainset.ir[trainset.to_inner_iid(item_id)])
    except ValueError:
        return 0

In [20]:
predicted_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'Details'])
predicted_df['Iu'] = predicted_df.uid.apply(get_rated_users)
predicted_df['Ui'] = predicted_df.iid.apply(get_rated_items)
predicted_df['err'] = abs(predicted_df.est - predicted_df.rui)
best_predictions = predicted_df.sort_values(by='err')[:10]
worst_predictions = predicted_df.sort_values(by='err')[-10:]

In [21]:
best_predictions.head()

Unnamed: 0,uid,iid,rui,est,Details,Iu,Ui,err
14417,30972,971880107,1.0,1.0,"{'actual_k': 40, 'was_impossible': False}",0,0,0.0
19073,41667,971880107,1.0,1.0,"{'actual_k': 40, 'was_impossible': False}",0,0,0.0
19948,249862,971880107,1.0,1.0,"{'actual_k': 40, 'was_impossible': False}",0,0,0.0
16129,251422,971880107,1.0,1.0,"{'actual_k': 40, 'was_impossible': False}",0,0,0.0
20092,101851,439064864,10.0,10.0,"{'actual_k': 40, 'was_impossible': False}",0,0,0.0


In [22]:
worst_predictions.head()

Unnamed: 0,uid,iid,rui,est,Details,Iu,Ui,err
20723,44728,553279912,10.0,1.0,"{'actual_k': 40, 'was_impossible': False}",0,0,9.0
7440,69933,515128554,0.0,9.030458,"{'actual_k': 40, 'was_impossible': False}",0,0,9.030458
11579,162052,61015725,0.0,9.11231,"{'actual_k': 23, 'was_impossible': False}",0,0,9.11231
5689,37712,553211404,0.0,9.330869,"{'actual_k': 37, 'was_impossible': False}",0,0,9.330869
22296,1674,679429220,0.0,9.456168,"{'actual_k': 18, 'was_impossible': False}",0,0,9.456168
