In [None]:
pip install git+https://github.com/microsoft/recommenders.git

Collecting git+https://github.com/microsoft/recommenders.git
  Cloning https://github.com/microsoft/recommenders.git to /tmp/pip-req-build-chflw5ta
  Running command git clone -q https://github.com/microsoft/recommenders.git /tmp/pip-req-build-chflw5ta
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting scikit-surprise<=1.1.1,>=0.19.1
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.5 MB/s 
[?25hCollecting cornac<2,>=1.1.2
  Downloading cornac-1.14.0-cp37-cp37m-manylinux1_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 31 kB/s 
Collecting category-encoders<2,>=1.3.0
  Downloading category_encoders-1.3.0-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 8.6 MB/s 
[?25hCollecting memory-profiler<1,>=0.54.0
  Downloading memory_profiler-0.58.0.tar.gz (36 kB)
Co

### Surprise implementation of SVD

SVD is implemented in the [Surprise](https://surprise.readthedocs.io/en/stable/) library as a recommender module.
* Detailed documentation of the SVD module in Surprise can be found [here](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)
* The source code of the SVD implementation is available in the Surprise GitHub repository, which can be found [here](https://github.com/NicolasHug/Surprise/blob/master/surprise/prediction_algorithms/matrix_factorization.pyx).

### Surprise SVD

Surprise supports dataframes as long as they have three columns representing the user ids, item ids, and the ratings (in this order).

### Global Setup

In [None]:
import sys
import os
import surprise
import pandas as pd
import numpy as np

import recommenders
from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

# Report the interpreter and Surprise versions used for this run.
system_banner = "System version: {}".format(sys.version)
surprise_banner = "Surprise version: {}".format(surprise.__version__)
print(system_banner)
print(surprise_banner)

System version: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
Surprise version: 1.1.1


Download the data from Google Drive. This step is not critical to the overall project, but it is helpful when running on Colab. Otherwise, local copies of the files are fine.

In [None]:
import gdown


def drive_direct_url(share_url):
    """Convert a Google Drive share link into a direct-download URL.

    Args:
        share_url: Link of the form
            ``https://drive.google.com/file/d/<file id>/view?usp=sharing``.

    Returns:
        ``https://drive.google.com/uc?id=<file id>``, where the file id is the
        second-to-last ``/``-separated segment of ``share_url``.
    """
    file_id = share_url.split('/')[-2]  # Do not change: depends on the share-link layout
    return 'https://drive.google.com/uc?id=' + file_id


# Local file name -> Google Drive share link, in download order.
DRIVE_SHARE_LINKS = {
    'activity.csv': 'https://drive.google.com/file/d/1GZ7Pg06hoMGdfDtcAPPNPnQ_ZwIFcEh7/view?usp=sharing',
    'student_info.csv': 'https://drive.google.com/file/d/1Y0EmJMalLuoSSe4_CHzf7aSWBV4BEcH4/view?usp=sharing',
    'student_activity_train.csv': 'https://drive.google.com/file/d/1xaXuFboFYe4fwxUBr9ygWlcfaIX-cTbc/view?usp=sharing',
    'student_activity_test.csv': 'https://drive.google.com/file/d/17Rd0DTaQ4APmD0L4sZTHJWMhfdflI5N7/view?usp=sharing',
}

# One download per dataset; replaces four copy-pasted stanzas.
for local_name, share_url in DRIVE_SHARE_LINKS.items():
    gdown.download(drive_direct_url(share_url), local_name, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1GZ7Pg06hoMGdfDtcAPPNPnQ_ZwIFcEh7
To: /content/activity.csv
100%|██████████| 18.9k/18.9k [00:00<00:00, 6.09MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Y0EmJMalLuoSSe4_CHzf7aSWBV4BEcH4
To: /content/student_info.csv
100%|██████████| 24.9k/24.9k [00:00<00:00, 15.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xaXuFboFYe4fwxUBr9ygWlcfaIX-cTbc
To: /content/student_activity_train.csv
100%|██████████| 1.06M/1.06M [00:00<00:00, 68.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=17Rd0DTaQ4APmD0L4sZTHJWMhfdflI5N7
To: /content/student_activity_test.csv
100%|██████████| 222k/222k [00:00<00:00, 63.0MB/s]


'student_activity_test.csv'

### Data Exploration

In [None]:
# Load the activity catalogue; the first CSV column is used as the row index.
activity_csv = "activity.csv"
df_activity = pd.read_csv(activity_csv, index_col=0)
print(df_activity.shape)
df_activity.head()

(155, 25)


Unnamed: 0,Activity_ID,Activity_Name,Activity_Content,Activity_Type,GameType,Ages_2yo,Ages_3yo,Ages_4yo,Ages_5yo,Ages_6yo,Ages_7yo,Cultures_None,Cultures_Brazil,Cultures_US,Cultures_UK,Cultures_Denmark,Cultures_Spain,Cultures_Mexico,Cultures_Canada,Cultures_Portugal,Cultures_Australia,Cultures_France,Cultures_Italy,Cultures_Germany,Activity_Level
0,farm-poiki-book-1,"Book: Poiki, the farmer",farm,standard,reading-book,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,farm-poiki-book-2,Book: The animals in the farm,farm-animals,standard,reading-book,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,farm-wendy-book-1,Book: Colors in the farm,colors,standard,reading-book,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,farm-kevin-book-1,Book: Shapes in the farm,shapes,standard,reading-book,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,country-none-book-1,How are the farms in my country?,my-country,cultural,reading-book,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0


In [None]:
# Load the student profiles; the first CSV column is used as the row index.
students_csv = "student_info.csv"
df_students = pd.read_csv(students_csv, index_col=0)
print(df_students.shape)
df_students.head()

(250, 13)


Unnamed: 0,User_ID,Gender,User_Name,Age,Translanguage_Level,Literate,Learning_Rate,Country_of_Birth,Country_of_Residence,Cultural_Heritage,Main_Language,Learning_Language,Learning_Profile
0,0,F,Norma Fisher,5,2,Complete,Slow,Denmark,Canada,"['Portugal', 'Australia']",Portuguese,English,HL
1,1,F,Kayla Sullivan,6,1,Not_Started,Fast,UK,Canada,['France'],French,English,BTB
2,2,F,Elizabeth Woods,6,2,Not_Started,Slow,Brazil,Canada,['Portugal'],Portuguese,French,IL
3,3,F,Susan Wagner,4,3,Complete,Fast,UK,Portugal,"['France', 'Canada']",French,English,HL
4,4,M,Peter Montgomery,3,2,Complete,Fast,France,Brazil,"['Germany', 'Spain', 'Italy']",German,Spanish,HL


In [None]:
# Load the training interactions; the first CSV column is used as the row index.
train_csv = "student_activity_train.csv"
df_scores_train = pd.read_csv(train_csv, index_col=0)
print(df_scores_train.shape)
df_scores_train.head()

(11880, 7)


Unnamed: 0,User_ID,Activity_ID,No_of_attempts,Time_taken,Translanguage_Level,Usefulness_Score,_Translanguage_Level_Before_Course_Plan
0,0,country-wendy-videoclip-1,3,278.05518,3.101002,0.1398,2
1,0,country-yasmin-videoclip-1,4,281.867673,2.826381,0.084624,2
2,0,lab-wendy-colors-1,3,277.465027,2.782403,0.065831,2
3,0,who-wendy-colors-1,2,283.909252,3.006378,0.01164,2
4,0,farm-poiki-video-1,3,285.101,2.736871,0.038446,2


In [None]:
# Load the held-out interactions; the first CSV column is used as the row index.
test_csv = "student_activity_test.csv"
df_scores_test = pd.read_csv(test_csv, index_col=0)
print(df_scores_test.shape)
df_scores_test.head()

(2500, 7)


Unnamed: 0,User_ID,Activity_ID,No_of_attempts,Time_taken,Translanguage_Level,Usefulness_Score,_Translanguage_Level_Before_Course_Plan
0,0,cooking-yasmin-simulation-1,4,276.452842,3.487373,0.390101,3
1,0,farm-poiki-video-2,1,292.072599,3.711785,0.168906,3
2,0,lab-kevin-game-1,1,280.767327,3.86006,0.092441,3
3,0,insects-roulette-1,2,286.974125,3.731912,0.079838,3
4,0,cooking-yasmin-simulation-1,3,286.916326,3.823462,0.055227,3


### Build a mapping Activity_ID -> encoded (numerical) activity ID

In [None]:
# Both lookup tables below are only lossless if every activity has a unique
# string id AND every row has a unique index label; otherwise `to_dict()` or
# the inversion would silently drop entries.
assert df_activity["Activity_ID"].is_unique, "duplicate Activity_ID values in df_activity"
assert df_activity.index.is_unique, "duplicate index labels in df_activity"

# NOTE: the two variable names read backwards relative to their contents.
# They are kept unchanged because later cells refer to them.
# `activity_id_to_encoded_id_dict` maps index label (numeric id) -> Activity_ID string.
activity_id_to_encoded_id_dict = df_activity["Activity_ID"].to_dict()
print(activity_id_to_encoded_id_dict)
# `encoded_id_to_activity_dict` is the inverse: Activity_ID string -> numeric id.
# This is the table passed to `Series.map` when adding `Activity_Num_ID` columns.
encoded_id_to_activity_dict = {
    activity_id: numeric_id
    for numeric_id, activity_id in activity_id_to_encoded_id_dict.items()
}
print(encoded_id_to_activity_dict)

{0: 'farm-poiki-book-1', 1: 'farm-poiki-book-2', 2: 'farm-wendy-book-1', 3: 'farm-kevin-book-1', 4: 'country-none-book-1', 5: 'country-none-book-2', 6: 'country-none-book-3', 7: 'country-none-book-4', 8: 'home-kevin-book-1', 9: 'daycare-all-book-1', 10: 'world-all-book-1', 11: 'home-wendy-book-1', 12: 'travel-yasmin-book-1', 13: 'who-wendy-book-1', 14: 'who-wendy-book-2', 15: 'lab-wendy-book-1', 16: 'farm-poiki-videoclip-1', 17: 'farm-poiki-videoclip-2', 18: 'theatre-all-videoclip-1', 19: 'country-yasmin-videoclip-1', 20: 'country-wendy-videoclip-1', 21: 'country-yasmin-videoclip-2', 22: 'who-all-videoclip-1', 23: 'farm-yasmin-demovideo-1', 24: 'farm-poiki-demovideo-1', 25: 'farm-kevin-demovideo-1', 26: 'farm-wendy-demovideo-1', 27: 'farm-kevin-demovideo-2', 28: 'farm-poiki-demovideo-2', 29: 'farm-poiki-demovideo-3', 30: 'farm-none-demovideo-1', 31: 'farm-poiki-demovideo-4', 32: 'lab-wendy-demovideo-1', 33: 'who-wendy-demovideo-1', 34: 'who-yasmin-demovideo-1', 35: 'who-kevin-demovideo

### Add the encoded (numerical) activity id to `df_activity`

In [None]:
# Look up the numeric id of every activity and store it as a new column.
activity_num_ids = df_activity['Activity_ID'].map(encoded_id_to_activity_dict)
df_activity['Activity_Num_ID'] = activity_num_ids
df_activity.head()

Unnamed: 0,Activity_ID,Activity_Name,Activity_Content,Activity_Type,GameType,Ages_2yo,Ages_3yo,Ages_4yo,Ages_5yo,Ages_6yo,Ages_7yo,Cultures_None,Cultures_Brazil,Cultures_US,Cultures_UK,Cultures_Denmark,Cultures_Spain,Cultures_Mexico,Cultures_Canada,Cultures_Portugal,Cultures_Australia,Cultures_France,Cultures_Italy,Cultures_Germany,Activity_Level,Activity_Num_ID
0,farm-poiki-book-1,"Book: Poiki, the farmer",farm,standard,reading-book,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,farm-poiki-book-2,Book: The animals in the farm,farm-animals,standard,reading-book,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,farm-wendy-book-1,Book: Colors in the farm,colors,standard,reading-book,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,farm-kevin-book-1,Book: Shapes in the farm,shapes,standard,reading-book,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3
4,country-none-book-1,How are the farms in my country?,my-country,cultural,reading-book,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,4


### Add the encoded (numerical) activity id to `df_scores_train`

In [None]:
# Translate the string activity ids of the training interactions to numeric ids.
df_scores_train['Activity_Num_ID'] = df_scores_train['Activity_ID'].map(encoded_id_to_activity_dict)
# `Series.map` yields NaN for ids that are missing from the lookup table; fail
# loudly here instead of passing NaN item ids on to the model.
assert df_scores_train['Activity_Num_ID'].notna().all(), \
    "df_scores_train contains Activity_ID values that are missing from df_activity"
df_scores_train.head()

Unnamed: 0,User_ID,Activity_ID,No_of_attempts,Time_taken,Translanguage_Level,Usefulness_Score,_Translanguage_Level_Before_Course_Plan,Activity_Num_ID
0,0,country-wendy-videoclip-1,3,278.05518,3.101002,0.1398,2,20
1,0,country-yasmin-videoclip-1,4,281.867673,2.826381,0.084624,2,19
2,0,lab-wendy-colors-1,3,277.465027,2.782403,0.065831,2,110
3,0,who-wendy-colors-1,2,283.909252,3.006378,0.01164,2,135
4,0,farm-poiki-video-1,3,285.101,2.736871,0.038446,2,66


### Add the encoded (numerical) activity id to `df_scores_test`

In [None]:
# Translate the string activity ids of the test interactions to numeric ids.
df_scores_test['Activity_Num_ID'] = df_scores_test['Activity_ID'].map(encoded_id_to_activity_dict)
# `Series.map` yields NaN for ids that are missing from the lookup table; fail
# loudly here instead of evaluating against NaN item ids.
assert df_scores_test['Activity_Num_ID'].notna().all(), \
    "df_scores_test contains Activity_ID values that are missing from df_activity"
df_scores_test.head()

Unnamed: 0,User_ID,Activity_ID,No_of_attempts,Time_taken,Translanguage_Level,Usefulness_Score,_Translanguage_Level_Before_Course_Plan,Activity_Num_ID
0,0,cooking-yasmin-simulation-1,4,276.452842,3.487373,0.390101,3,102
1,0,farm-poiki-video-2,1,292.072599,3.711785,0.168906,3,67
2,0,lab-kevin-game-1,1,280.767327,3.86006,0.092441,3,136
3,0,insects-roulette-1,2,286.974125,3.731912,0.079838,3,146
4,0,cooking-yasmin-simulation-1,3,286.916326,3.823462,0.055227,3,102


### Train the SVD Model

Surprise needs to build an internal model of the data. Here we use the `load_from_df` method to build a `Dataset` object, and then indicate that we want to train on all the samples of this dataset by using the `build_full_trainset` method.

In [None]:
# The Reader only carries the rating scale here (for Poikilingo, the scale is [0, 1]).
# https://github.com/NicolasHug/Surprise/blob/master/surprise/dataset.py
reader = surprise.Reader(line_format="user item rating", rating_scale=(0, 1))

# Column order matters: user id, item id, rating.
train_triples = df_scores_train[['User_ID', 'Activity_Num_ID', 'Usefulness_Score']]
train_dataset = surprise.Dataset.load_from_df(train_triples, reader=reader)
train_set = train_dataset.build_full_trainset()


The [SVD](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD) has a lot of parameters. The most important ones are:

* `n_factors`, which controls the dimension of the latent space (i.e. the size of the vectors $p_u$ and $q_i$). Usually, the quality of the training set predictions improves as `n_factors` gets higher.
* `n_epochs`, which defines the number of iterations of the SGD procedure.
Note that both parameters also affect the training time.

We will here set n_factors to 200 and n_epochs to 30. To train the model, we simply need to call the fit() method.

In [None]:
# Model hyper-parameters: 200 latent factors, 30 SGD epochs, fixed seed,
# per-epoch progress messages enabled.
svd_params = dict(n_factors=200, n_epochs=30, random_state=0, verbose=True)
svd = surprise.SVD(**svd_params)

# Time only the fitting step.
with Timer() as train_time:
    svd.fit(train_set)

training_seconds = train_time.interval
print("Took {} seconds for training.".format(training_seconds))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 1.590882960999977 seconds for training.


### Prediction

Now that our model is fitted, we can call `predict` to get some predictions. `predict` returns an internal object `Prediction` which can be easily converted back to a dataframe:

In [None]:
# Score every (user, activity) pair that appears in the test set.
test_triples = df_scores_test[['User_ID', 'Activity_Num_ID', 'Usefulness_Score']]
predictions = predict(
    svd,
    test_triples,
    usercol='User_ID',
    itemcol='Activity_Num_ID',
)
predictions.head()

Unnamed: 0,User_ID,Activity_Num_ID,prediction
0,0,102,0.218034
1,0,67,0.148088
2,0,136,0.179007
3,0,146,0.189965
4,0,102,0.218034


### Remove played activities from the top k recommendations

In [None]:
# Compute ranking predictions from the training interactions, with
# `remove_seen=True`, and time the whole call.
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd,
        df_scores_train[['User_ID', 'Activity_Num_ID', 'Usefulness_Score']],
        usercol='User_ID',
        itemcol='Activity_Num_ID',
        remove_seen=True,
    )

prediction_seconds = test_time.interval
print("Took {} seconds for prediction.".format(prediction_seconds))

Took 0.22375432199999068 seconds for prediction.


In [None]:
# Peek at the first five ranking predictions.
all_predictions.head(n=5)

Unnamed: 0,User_ID,Activity_Num_ID,prediction
11880,0,147,0.137275
11881,0,137,0.12177
11882,0,145,0.241462
11883,0,15,0.0
11884,0,107,0.168881


### Evaluate how well SVD performs

In [None]:
# Column names shared by every evaluation helper below.
kwargs = dict(
    col_user='User_ID',
    col_item='Activity_Num_ID',
    col_rating='Usefulness_Score',
    col_prediction='prediction',
)

# Ground truth used by all metrics: (user, item, rating) triples from the test set.
test_ratings = df_scores_test[['User_ID', 'Activity_Num_ID', 'Usefulness_Score']]

# Rating metrics compare the test ratings with the per-pair predictions.
eval_rmse = rmse(test_ratings, predictions, **kwargs)
eval_mae = mae(test_ratings, predictions, **kwargs)
eval_rsquared = rsquared(test_ratings, predictions, **kwargs)
eval_exp_var = exp_var(test_ratings, predictions, **kwargs)

# Ranking metrics compare the test ratings with the ranking predictions at cut-off k.
k = 10
eval_map = map_at_k(test_ratings, all_predictions, k=k, **kwargs)
eval_ndcg = ndcg_at_k(test_ratings, all_predictions, k=k, **kwargs)
eval_precision = precision_at_k(test_ratings, all_predictions, k=k, **kwargs)
eval_recall = recall_at_k(test_ratings, all_predictions, k=k, **kwargs)

rating_report = [
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
]
print("\n".join(rating_report))

print('----')

ranking_report = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print("\n".join(ranking_report))

RMSE:		0.264573
MAE:		0.199131
rsquared:	-0.837950
exp var:	-0.412626
----
MAP:	0.001570
NDCG:	0.003145
Precision@K:	0.003600
Recall@K:	0.003600


In [None]:
# Keep only the ranking predictions that belong to user 0.
is_user_zero = all_predictions['User_ID'] == 0
result = all_predictions[is_user_zero]

In [None]:
# First five predictions for user 0.
result.head(n=5)

Unnamed: 0,User_ID,Activity_Num_ID,prediction
11880,0,147,0.137275
11881,0,137,0.12177
11882,0,145,0.241462
11883,0,15,0.0
11884,0,107,0.168881


In [None]:
# First fifteen predictions for user 0.
result.head(15)

Unnamed: 0,User_ID,Activity_Num_ID,prediction
11880,0,147,0.137275
11881,0,137,0.12177
11882,0,145,0.241462
11883,0,15,0.0
11884,0,107,0.168881
11885,0,43,0.154593
11886,0,129,0.160305
11887,0,118,0.29922
11888,0,153,0.228412
11889,0,115,0.199704


In [None]:
# Attach the activity metadata to user 0's predictions (all of them, not only
# a top 10). Restricting `result` to the prediction columns first keeps this
# cell idempotent: re-running it no longer merges the already-merged frame
# again, which would produce `_x`/`_y` suffixed duplicate columns.
prediction_columns = ['User_ID', 'Activity_Num_ID', 'prediction']
result = pd.merge(result[prediction_columns], df_activity, on='Activity_Num_ID', how='left')

In [None]:
# Show user 0's highest-scoring activities. The rows must be sorted by
# predicted score first; otherwise `head` only returns the first rows in
# their existing order, not the top ones. The label is derived from `top_n`
# so that it always matches the number of rows requested.
top_n = 10
print("User 0 top {} predictions".format(top_n))
(
    result[['prediction', 'User_ID', 'Activity_Num_ID', 'Activity_Name']]
    .sort_values('prediction', ascending=False)
    .head(n=top_n)
)

User 0 top 15 predictions


Unnamed: 0,prediction,User_ID,Activity_Num_ID,Activity_Name
0,0.137275,0,147,Poiki is drawing in the stars
1,0.12177,0,137,Bingo - pets
2,0.241462,0,145,Roulette - shopping cart
3,0.0,0,15,Book: Wendy teaches about germs
4,0.168881,0,107,Cake decoration
5,0.154593,0,43,Flashcards - musical instruments
6,0.160305,0,129,Tracing my name
7,0.29922,0,118,Wendy's popping balloons
8,0.228412,0,153,4 pairs memory game
9,0.199704,0,115,Poiki is harvesting with his tractor
