In [4]:
import pandas as pd
from surprise import Reader, SVD, KNNBasic

# Load Data

In [4]:
df = pd.read_csv('goodreads_ratings.csv')
print(df.head())

                            user_id   book_id  \
0  d089c9b670c0b0b339353aebbace46a1   7686667   
1  6dcb2c16e12a41ae0c6c38e9d46f3292  18073066   
2  244e0ce681148a7586d7746676093ce9  13610986   
3  73fcc25ff29f8b73b3a7578aec846394  27274343   
4  f8880e158a163388a990b64fec7df300  11614718   

                          review_id  rating  \
0  3337e0e75701f7f682de11638ccdc60c       3   
1  7201aa3c1161f2bad81258b6d4686c16       5   
2  07a203f87bfe1b65ff58774667f6f80d       5   
3  8be2d87b07098c16f9742020ec459383       1   
4  a29c4ba03e33ad073a414ac775266c5f       4   

                                         review_text  \
0  Like Matched, this book felt like it was echoi...   
1  WOW again! 4,5 Stars \r\n So i wont forget to ...   
2  The second novel was hot & heavy. Not only in ...   
3  What a maddening waste of time. And I unfortun...   
4  4.5 stars! \r\n This was an awesome read! \r\n...   

                       date_added                    date_updated  \
0  Fri Apr 29 14

# Inspect Data


In [5]:
#1. Print dataset size and examine column data types
print(df.dtypes)

user_id         object
book_id          int64
review_id       object
rating           int64
review_text     object
date_added      object
date_updated    object
read_at         object
started_at      object
n_votes          int64
n_comments       int64
dtype: object


In [18]:
#2. Distribution of ratings
print(df['rating'].value_counts())
print(df['rating'].describe())

rating
4    1278
5    1001
3     707
2     269
1     125
0     120
Name: count, dtype: int64
count    3500.000000
mean        3.686000
std         1.251911
min         0.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         5.000000
Name: rating, dtype: float64


In [8]:
#3. Filter ratings that are out of range
filtered_ratings = df[df['rating']!=0]
# print(filtered_ratings.value_counts())

# Filter Data using Surprise

In [9]:
#4. Prepare data for surprise: build a Suprise reader object

reader = Reader(rating_scale= (1,5))

#5. Load `book_ratings` into a Surprise Dataset
from surprise import Dataset
data = Dataset.load_from_df(filtered_ratings[['user_id', 'book_id', 'rating']], reader)

# Split Data

In [10]:
#6. Create a 80:20 train-test split and set the random state to 7
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, random_state= 7, test_size= .2)

# Build Models

In [11]:
#7. Use KNNBasice from Surprise to train a collaborative filter
from surprise import KNNBasic
knn = KNNBasic()
knn.fit(trainset)
test = knn.test(testset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [25]:
#8. Evaluate the recommender system
from surprise import accuracy
print('The accuracy of KNNBasic prediction model using RSME is - ' + 
str(accuracy.rmse(test)))
print('=============================================================')

# test SVD and see if it does better than KNNBasic
svd = SVD()
svd.fit(trainset)
test_svd = svd.test(testset)
print('The accuracy of SVD prediction model using RSME is - ' + 
str(accuracy.rmse(test_svd)))
print('=============================================================')

#9. Prediction on a user who gave the "The Three-Body Problem" a rating of 5
uid = '8842281e1d1347389f2ab93d60773d4d'
bid = 18245960

user_pred = knn.predict(uid, bid, r_ui=5, verbose=True )
print('=============================================================')

user_pred_svd = svd.predict(uid, bid, r_ui=5, verbose=True )

RMSE: 1.1105
The accuracy of KNNBasic prediction model using RSME is - 1.110471008157185
RMSE: 0.7399
The accuracy of SVD prediction model using RSME is - 0.7398665977772447
user: 8842281e1d1347389f2ab93d60773d4d item: 18245960   r_ui = 5.00   est = 3.83   {'was_impossible': True, 'reason': 'Not enough neighbors.'}
user: 8842281e1d1347389f2ab93d60773d4d item: 18245960   r_ui = 5.00   est = 3.81   {'was_impossible': False}


# Cross Validation

In [27]:
from surprise.model_selection import cross_validate
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.0539  0.9517  1.0694  1.0368  1.0357  1.1245  1.0519  1.0492  1.0687  1.0341  1.0476  0.0407  
MAE (testset)     0.8495  0.7294  0.8524  0.8131  0.8083  0.9144  0.8458  0.8143  0.8359  0.8156  0.8279  0.0442  
Fit time          0.05    0.04    0.04    0.04    0.04    0.02    0.04    0.03    0.03    0.04    0.04    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    0.02    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.05393841, 0.95166882, 1.06943828, 1.03680359, 1.03568522,
        1.12451838, 1.05191411, 1.04916501, 1.06873417, 1.03408343]),
 'test_mae': array([0.84950489, 0.7293576 , 0.85244359, 0.81305024, 0.80834987,
        0.9143598 , 0.84583388, 0.81431643, 0.83591875, 0.81564621]),
 'fit_time': (0.04688858985900879,
  0.03946280479431152,
  0.03557944297790527,
  0.03923678398132324,
  0.04012894630432129,
  0.023686647415161133,
  0.041409969329833984,
  0.03173542022705078,
  0.031434059143066406,
  0.03598475456237793),
 'test_time': (0.0019960403442382812,
  0.0020291805267333984,
  0.0,
  0.0009975433349609375,
  0.001994609832763672,
  0.015728473663330078,
  0.0,
  0.0,
  0.0,
  0.0)}

# BaseLine Model

In [28]:
from surprise import BaselineOnly

baseline_model = BaselineOnly()

cross_validate(baseline_model, data, measures=['RMSE'], cv=5, verbose=True)
# trainset= data.build_full_trainset()
baseline_model.fit(trainset)
baseline_model.predict(uid, bid, r_ui=5)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0432  1.0359  1.0604  1.0811  1.0434  1.0528  0.0163  
Fit time          0.01    0.01    0.01    0.02    0.01    0.01    0.00    
Test time         0.01    0.00    0.00    0.00    0.00    0.00    0.00    
Estimating biases using als...


Prediction(uid='8842281e1d1347389f2ab93d60773d4d', iid=18245960, r_ui=5, est=3.8715976331360946, details={'was_impossible': False})

# Conclusion

Even when using a cross validation model to look at differnt splits.   
We end up with the lowest RMSE of .9953 which is pretty poor.   
The closer to 0 the more perfect the predictions. 