In [None]:
!pip install recommenders



In [None]:
pip install git+https://github.com/microsoft/recommenders.git

Collecting git+https://github.com/microsoft/recommenders.git
  Cloning https://github.com/microsoft/recommenders.git to /tmp/pip-req-build-chflw5ta
  Running command git clone -q https://github.com/microsoft/recommenders.git /tmp/pip-req-build-chflw5ta
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting scikit-surprise<=1.1.1,>=0.19.1
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.5 MB/s 
[?25hCollecting cornac<2,>=1.1.2
  Downloading cornac-1.14.0-cp37-cp37m-manylinux1_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 31 kB/s 
Collecting category-encoders<2,>=1.3.0
  Downloading category_encoders-1.3.0-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 8.6 MB/s 
[?25hCollecting memory-profiler<1,>=0.54.0
  Downloading memory_profiler-0.58.0.tar.gz (36 kB)
Co

### Surprise implementation of SVD

SVD is implemented in the [Surprise](https://surprise.readthedocs.io/en/stable/) library as a recommender module.
* Detailed documentation of the SVD module in Surprise can be found [here](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD).
* The source code of the SVD implementation is available in the Surprise GitHub repository, which can be found [here](https://github.com/NicolasHug/Surprise/blob/master/surprise/prediction_algorithms/matrix_factorization.pyx).

### Surprise SVD

Surprise supports dataframes as long as they have three columns representing the user ids, item ids, and the ratings (in this order).

### Global Setup

In [None]:
import sys
import os
import surprise
import pandas as pd
import numpy as np

import recommenders
from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

# Report the interpreter and Surprise versions used for this run.
version_report = (
    ("System version: {}", sys.version),
    ("Surprise version: {}", surprise.__version__),
)
for template, value in version_report:
    print(template.format(value))

System version: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
Surprise version: 1.1.1


Getting the data from Google Drive. This step is not critical to the overall project, but it is helpful when running on Colab. Otherwise, local copies of the files work just as well.

In [None]:
import gdown


def _drive_download_url(share_url):
    """Convert a Drive '.../file/d/<id>/view?...' share link into a direct-download URL."""
    # The file id is the second-to-last '/'-separated segment of the share link.
    file_id = share_url.split('/')[-2]
    return 'https://drive.google.com/uc?id=' + file_id


# Book metadata.
books_raw_url = 'https://drive.google.com/file/d/1TcFgTOCoBBqfEwrSbhYqunu5qYDnzxSH/view?usp=sharing'
books_url = _drive_download_url(books_raw_url)
gdown.download(books_url, 'books.csv', quiet=False)

# User ratings.
ratings_raw_url = 'https://drive.google.com/file/d/12noQhcRgAYBUNcbhG2Z3fuNWKX0EebMA/view?usp=sharing'
ratings_url = _drive_download_url(ratings_raw_url)
gdown.download(ratings_url, 'ratings.csv', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1TcFgTOCoBBqfEwrSbhYqunu5qYDnzxSH
To: /content/books.csv
3.29MB [00:00, 161MB/s]
Downloading...
From: https://drive.google.com/uc?id=12noQhcRgAYBUNcbhG2Z3fuNWKX0EebMA
To: /content/ratings.csv
72.1MB [00:00, 243MB/s]


'ratings.csv'


### Data Exploration

In [None]:
# Load the book metadata, then show its dimensions and first rows.
df_books = pd.read_csv(filepath_or_buffer="books.csv")
print(df_books.shape)
df_books.head(n=5)

(10000, 23)


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [None]:
# Only the first 100,000 rows of the ratings file are read.
df_ratings = pd.read_csv(filepath_or_buffer="ratings.csv", nrows=100_000)
print(df_ratings.shape)
df_ratings.head(n=5)

(100000, 3)


Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


### Train/Test Split

In [None]:
# Seeded random split of the ratings: 0.8 of the rows go to the first (training) frame.
df_ratings_train, df_ratings_test = python_random_split(
    df_ratings,
    ratio=0.8,
    seed=42,
)

### Train the SVD Model

Surprise needs to build an internal model of the data. Here we use the `load_from_df` method to build a `Dataset` object, and then indicate that we want to train on all the samples of this dataset by calling the `build_full_trainset` method.

In [None]:
# The Reader only supplies the rating scale here; see
# https://github.com/NicolasHug/Surprise/blob/master/surprise/dataset.py
# The ratings in this dataset are star ratings from 1 to 5 (df_books carries the
# matching ratings_1 ... ratings_5 count columns), so the declared scale is (1, 5)
# rather than (0, 5). Surprise uses this scale when clipping its estimates.
RATING_SCALE = (1, 5)

# Fail loudly if the data ever contradicts the declared scale.
assert df_ratings_train['rating'].between(*RATING_SCALE).all(), \
    "found ratings outside the declared rating scale"

reader = surprise.Reader(line_format="user item rating", rating_scale=RATING_SCALE)

# load_from_df expects the columns in (user, item, rating) order;
# build_full_trainset() then uses every row of the training frame for fitting.
train_set = surprise.Dataset.load_from_df(
    df_ratings_train[['user_id', 'book_id', 'rating']],
    reader=reader,
).build_full_trainset()


The [SVD](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) has a lot of parameters. The most important ones are:

* `n_factors`, which controls the dimension of the latent space (i.e. the size of the vectors $p_u$ and $q_i$). Usually, the quality of the training set predictions grows as `n_factors` gets higher.
* `n_epochs`, which defines the number of iterations of the SGD procedure.

Note that both parameters also affect the training time.

Here we set `n_factors` to 200 and `n_epochs` to 30. To train the model, we simply need to call the `fit()` method.

In [None]:
# Model hyper-parameters: latent dimension, number of SGD epochs, fixed seed,
# and per-epoch progress logging.
svd_params = {
    'n_factors': 200,
    'n_epochs': 30,
    'random_state': 0,
    'verbose': True,
}
svd = surprise.SVD(**svd_params)

# Fit on the full training set and time the fit.
with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 12.924863339000012 seconds for training.


### Prediction

Now that our model is fitted, we can call `predict` to get some predictions. `predict` returns an internal object `Prediction` which can be easily converted back to a dataframe:

In [None]:
# Score every (user, book) pair of the held-out test set with the fitted model.
predictions = predict(
    svd,
    df_ratings_test[['user_id', 'book_id', 'rating']],
    usercol='user_id',
    itemcol='book_id',
)
predictions.head(n=5)

Unnamed: 0,user_id,book_id,prediction
0,2407,1365,3.697966
1,2491,718,4.203765
2,809,1053,3.384986
3,2259,248,3.756153
4,2674,7327,4.34536


### Rank books for every user, removing books already rated in training from the top-k recommendations

In [None]:
# Build the per-user ranking predictions and time the call.
# remove_seen=True is passed -- presumably this drops the (user, book) pairs that
# already appear in the training frame; confirm against the helper's documentation.
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd,
        df_ratings_train[['user_id', 'book_id', 'rating']],
        usercol='user_id',
        itemcol='book_id',
        remove_seen=True,
    )

print("Took {} seconds for prediction.".format(test_time.interval))

Took 119.21350548700002 seconds for prediction.


In [None]:
# Peek at the first rows of the ranking predictions.
all_predictions.head(n=5)

Unnamed: 0,user_id,book_id,prediction
80000,2384,4919,4.274129
80001,2384,900,4.834051
80002,2384,1459,4.355019
80003,2384,495,3.579601
80004,2384,7089,4.413871


### Evaluate how well SVD performs

In [None]:
# Column names shared by every metric call.
kwargs = {
    'col_user': 'user_id',
    'col_item': 'book_id',
    'col_rating': 'rating',
    'col_prediction': 'prediction',
}
k = 10

# Ground-truth ratings that every metric is scored against.
test_truth = df_ratings_test[['user_id', 'book_id', 'rating']]

# Rating-accuracy metrics: compare the test-set predictions with the true ratings.
eval_rmse = rmse(test_truth, predictions, **kwargs)
eval_mae = mae(test_truth, predictions, **kwargs)
eval_rsquared = rsquared(test_truth, predictions, **kwargs)
eval_exp_var = exp_var(test_truth, predictions, **kwargs)

# Ranking metrics: evaluated at k on the per-user ranking predictions.
eval_map = map_at_k(test_truth, all_predictions, k=k, **kwargs)
eval_ndcg = ndcg_at_k(test_truth, all_predictions, k=k, **kwargs)
eval_precision = precision_at_k(test_truth, all_predictions, k=k, **kwargs)
eval_recall = recall_at_k(test_truth, all_predictions, k=k, **kwargs)

rating_report = [
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
]
ranking_report = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]

print("\n".join(rating_report))
print('----')
print("\n".join(ranking_report))

RMSE:		0.896419
MAE:		0.710375
rsquared:	0.194187
exp var:	0.194749
----
MAP:	0.004421
NDCG:	0.016319
Precision@K:	0.012888
Recall@K:	0.011362


In [None]:
# Keep only the ranking predictions that belong to user 1.
is_user_1 = all_predictions['user_id'].eq(1)
result = all_predictions.loc[is_user_1]

In [None]:
# First rows of user 1's predictions.
result.head(n=5)

Unnamed: 0,user_id,book_id,prediction
8146016,1,349,3.217039
8146017,1,4919,3.622708
8146018,1,900,3.898682
8146019,1,1459,3.373677
8146020,1,495,3.268057


In [None]:
# Up to 15 rows of user 1's predictions.
result.head(15)

Unnamed: 0,user_id,book_id,prediction
8146016,1,349,3.217039
8146017,1,4919,3.622708
8146018,1,900,3.898682
8146019,1,1459,3.373677
8146020,1,495,3.268057
8146021,1,7089,3.604961
8146022,1,8152,3.457308
8146023,1,2985,3.552979
8146024,1,150,3.84617
8146025,1,13,3.842918


In [None]:
# Attach the book metadata (title, isbn, ...) to user 1's predictions.
# The left side is rebuilt from `all_predictions` instead of re-merging `result`
# into itself, so re-running this cell cannot stack a second copy of the book
# columns (which would rename `title` to `title_x` / `title_y`).
result = pd.merge(
    all_predictions[all_predictions['user_id'] == 1],
    df_books,
    on='book_id',
    how='left',  # keep every prediction row even if a book has no metadata
)

In [None]:
# Number of recommendations to show; the printed title and the table below are
# both driven by this value so they cannot disagree.
top_n = 10

print("User 1 top {} predictions".format(top_n))
(
    result[['prediction', 'user_id', 'book_id', 'title', 'isbn']]
    .sort_values('prediction', ascending=False)  # highest predicted rating first
    .head(n=top_n)
)

User 1 top 15 predictions


Unnamed: 0,prediction,user_id,book_id,title,isbn
130,4.708417,1,25,Harry Potter and the Deathly Hallows (Harry Po...,545010225
638,4.655646,1,8946,The Divan,9646534783
1018,4.648713,1,9308,The Dark Is Rising Sequence (The Dark Is Risi...,20425651
552,4.605427,1,102,Where the Wild Things Are,99408392
3207,4.59942,1,3628,The Complete Calvin and Hobbes,740748475
2046,4.599172,1,422,"Harry Potter Boxset (Harry Potter, #1-7)",545044251
823,4.568593,1,6920,The Indispensable Calvin and Hobbes,751500283
2202,4.517904,1,4483,It's a Magical World: A Calvin and Hobbes Coll...,836221362
1496,4.508694,1,561,The Phantom Tollbooth,394820371
2580,4.506719,1,1877,The Elements of Style,205313426
