In [6]:
from data import train_dev_test_split, read_data
from CollaborativeFiltering import CollaborativeFilteringModel
import pandas as pd
import numpy as np

Read the data and create training, development, and test sets for hyperparameter selection.

In [2]:
# Locations handed to read_data; both rating files live under data_dir.
data_dir = 'data/'
csv_zip = 'BX-CSV-Dump.zip'
ratings_pkl = f'{data_dir}ratings.pkl'
ratings_csv = f'{data_dir}new_ratings.csv'

# One seed shared by the split and by the training calls in later cells.
seed = 0xDEAD

# read_data already attaches the book titles to the ratings;
# when an ISBN was not found, the title is `Unknown Book`.
ratings_df, ratings = read_data(
    data_dir=data_dir, csv_zip=csv_zip,
    ratings_pickle=ratings_pkl, csv_fn=ratings_csv,
)

# Split the ratings frame into the three sets used by the rest of the notebook.
train, dev, test = train_dev_test_split(ratings_df=ratings_df, test_perc=0.2, seed=seed)

Creating test set:  43%|█████       | 489086/1149780 [00:26<00:35, 18489.86it/s]
Creating dev set: 100%|████████████| 1149780/1149780 [01:11<00:00, 16116.18it/s]


In [3]:
# Peek at the first rows of the frame returned by read_data to check its columns.
ratings_df.head()

Unnamed: 0,user_id,ISBN,rating,book_id,title
0,0,034545104X,0,0,Flesh Tones: A Novel
1,1,0155061224,5,1,Rites of Passage
2,2,0446520802,0,2,The Notebook
3,3,052165615X,3,3,Help!: Level 1
4,3,0521795028,6,4,The Amsterdam Connection : Level 4 (Cambridge ...


Select the best hyperparameters using the validation set. Training is done for a small number of epochs. The learning rate and learning rate factor were set by trial and error on previous runs; obviously they depend heavily on the other parameters, but for simplicity I left them fixed. Logging is done every second epoch to save time.

In [21]:
# Grid search over the regularisation strength (lambda) and the number of
# features. Every combination is trained on `train` for 10 epochs and then
# scored on `dev`; one record per combination is collected in `results`.
lambdas = [2, 1, 0.1]
feature_counts = [6, 9, 12]
grid = [(reg, size) for reg in lambdas for size in feature_counts]

# Largest id + 1 for users and books; independent of the hyperparameters,
# so it is looked up once instead of once per run.
n_users = ratings_df.user_id.max() + 1
n_items = ratings_df.book_id.max() + 1

results = []
for lam, n_ft in grid:
    print(f'Training for lambda :: {lam}, n_features :: {n_ft}')
    m = CollaborativeFilteringModel(
        ratings=train, n_users=n_users, n_items=n_items,
        n_features=n_ft, mean_norm=True,
    )
    rmse, min_loss, cur_loss = m.train(
        lam=lam, lr=0.015, lr_factor=0.85, n_epochs=10, seed=seed, log_each=2,
    )
    rmse_val = m.evaluate_rmse(dev)
    results.append({
        'rmse_val': rmse_val,
        'rmse_train': rmse,
        'lamb': lam,
        'n_features': n_ft,
    })
    print()

Training for lambda :: 2, n_features :: 6
Epoch   0, min MSE loss 9.024, curr MSE loss 9.013, last 2 epochs took 13.993 secs, elapsed 0.233 mins
Epoch   2, min MSE loss 9.013, curr MSE loss 8.919, last 2 epochs took 19.058 secs, elapsed 0.551 mins
Epoch   4, min MSE loss 8.919, curr MSE loss 8.339, last 2 epochs took 20.227 secs, elapsed 0.888 mins
Epoch   6, min MSE loss 8.339, curr MSE loss 7.693, last 2 epochs took 20.463 secs, elapsed 1.229 mins
Epoch   8, min MSE loss 7.693, curr MSE loss 7.349, last 2 epochs took 20.413 secs, elapsed 1.569 mins
Train RMSE :: 3.634029159042763

Training for lambda :: 2, n_features :: 9
Epoch   0, min MSE loss 9.037, curr MSE loss 9.019, last 2 epochs took 14.881 secs, elapsed 0.248 mins
Epoch   2, min MSE loss 9.019, curr MSE loss 8.892, last 2 epochs took 20.516 secs, elapsed 0.590 mins
Epoch   4, min MSE loss 8.892, curr MSE loss 8.209, last 2 epochs took 20.202 secs, elapsed 0.927 mins
Epoch   6, min MSE loss 8.209, curr MSE loss 7.465, last 2 

In [22]:
# One row per (lambda, n_features) run, lowest validation RMSE first.
pd.DataFrame(results).sort_values(by='rmse_val')

Unnamed: 0,rmse_val,rmse_train,lamb,n_features
1,11.652924,3.363778,2.0,9
4,11.667651,3.346141,1.0,9
7,11.681516,3.330373,0.1,9
2,11.734767,3.38047,2.0,12
0,11.746713,3.634029,2.0,6
3,11.768721,3.63539,1.0,6
5,11.784279,3.408188,1.0,12
6,11.789873,3.638178,0.1,6
8,11.844285,3.426672,0.1,12


In [26]:
# Configuration ranked first (lowest rmse_val) in the validation table.
lam = 2
n_features = 9

# Refit on train + dev; the test split is not part of this fit.
# NOTE(review): `{**a, **b}` keeps only b's value for a key present in both
# mappings - confirm that train and dev never share a key.
fit_ratings = {**train, **dev}
n_users = ratings_df.user_id.max() + 1
n_items = ratings_df.book_id.max() + 1

m = CollaborativeFilteringModel(
    ratings=fit_ratings, n_users=n_users, n_items=n_items,
    n_features=n_features, mean_norm=True,
)

# Longer run than in the search cell: 25 epochs, lr 0.01 instead of 0.015.
rmse, min_loss, cur_loss = m.train(
    lam=lam, lr=0.01, lr_factor=0.8475, n_epochs=25, seed=seed, log_each=2,
)

Epoch   0, min MSE loss 9.419, curr MSE loss 9.407, last 2 epochs took 15.370 secs, elapsed 0.256 mins
Epoch   2, min MSE loss 9.407, curr MSE loss 9.343, last 2 epochs took 20.577 secs, elapsed 0.599 mins
Epoch   4, min MSE loss 9.343, curr MSE loss 9.012, last 2 epochs took 25.948 secs, elapsed 1.032 mins
Epoch   6, min MSE loss 9.012, curr MSE loss 8.573, last 2 epochs took 25.691 secs, elapsed 1.460 mins
Epoch   8, min MSE loss 8.573, curr MSE loss 8.163, last 2 epochs took 26.778 secs, elapsed 1.906 mins
Epoch  10, min MSE loss 8.163, curr MSE loss 7.866, last 2 epochs took 25.255 secs, elapsed 2.327 mins
Epoch  12, min MSE loss 7.866, curr MSE loss 7.661, last 2 epochs took 27.121 secs, elapsed 2.779 mins
Epoch  14, min MSE loss 7.661, curr MSE loss 7.519, last 2 epochs took 25.374 secs, elapsed 3.202 mins
Epoch  16, min MSE loss 7.519, curr MSE loss 7.420, last 2 epochs took 24.048 secs, elapsed 3.603 mins
Epoch  18, min MSE loss 7.420, curr MSE loss 7.350, last 2 epochs took 25

Now we evaluate the best model on the test set

In [28]:
# Score the current model `m` on the test split and display the value.
# `m` is expected to be the model fitted on train + dev in the preceding cell.
# NOTE(review): in the saved outputs the validation/test RMSE is around 12 while
# the train RMSE is around 3.4 - confirm evaluate_rmse works on the same scale
# (mean_norm=True) as the value reported by train().
rmse_test = m.evaluate_rmse(test)
rmse_test

12.175313177224092

Now we train the best model on all of the available data, and then use it to predict recommendations.

In [39]:
# Same hyperparameters as the model that was evaluated on the test split.
lam = 2
n_features = 9

# Final fit on every split combined.
# NOTE(review): dict unpacking lets a later mapping overwrite an earlier one
# on duplicate keys - confirm train, dev and test have disjoint keys.
model_kwargs = dict(
    ratings={**train, **dev, **test},
    n_users=ratings_df.user_id.max() + 1,
    n_items=ratings_df.book_id.max() + 1,
    n_features=n_features,
    mean_norm=True,
)
m = CollaborativeFilteringModel(**model_kwargs)

# 40 epochs with lr 0.0075 for this final run.
train_kwargs = dict(
    lam=lam,
    lr=0.0075,
    lr_factor=0.8475,
    n_epochs=40,
    seed=seed,
    log_each=2,
)
rmse, min_loss, cur_loss = m.train(**train_kwargs)

Epoch   0, min MSE loss 9.840, curr MSE loss 9.831, last 2 epochs took 21.395 secs, elapsed 0.357 mins
Epoch   2, min MSE loss 9.831, curr MSE loss 9.768, last 2 epochs took 28.471 secs, elapsed 0.831 mins
Epoch   4, min MSE loss 9.768, curr MSE loss 9.469, last 2 epochs took 29.015 secs, elapsed 1.315 mins
Epoch   6, min MSE loss 9.469, curr MSE loss 9.148, last 2 epochs took 28.903 secs, elapsed 1.796 mins
Epoch   8, min MSE loss 9.148, curr MSE loss 8.808, last 2 epochs took 28.363 secs, elapsed 2.269 mins
Epoch  10, min MSE loss 8.808, curr MSE loss 8.554, last 2 epochs took 28.560 secs, elapsed 2.745 mins
Epoch  12, min MSE loss 8.554, curr MSE loss 8.372, last 2 epochs took 28.143 secs, elapsed 3.214 mins
Epoch  14, min MSE loss 8.372, curr MSE loss 8.243, last 2 epochs took 27.856 secs, elapsed 3.678 mins
Epoch  16, min MSE loss 8.243, curr MSE loss 8.152, last 2 epochs took 28.222 secs, elapsed 4.149 mins
Epoch  18, min MSE loss 8.152, curr MSE loss 8.088, last 2 epochs took 28

A key feature of my implementation is that it can create a feature vector for a new user with a rating history without retraining the whole system from scratch. How is this possible? I fix all weights for the other users and items and train only the weights for the new user. To understand it better, imagine that the feature vectors of all users/items are embeddings in a neural network. Obviously, in one batch we do not train all embeddings, only those that are present in the batch. The idea here is the same: each user/item feature vector is an embedding of its ID, so for one user we can train only that one embedding while the other weights are kept fixed. This may lead to a small performance degradation, but it greatly improves training speed.

In [41]:
# Recommend books to a new user whose only known rating is a 10 for
# 'Lord of the Rings Trilogy', then print the best-scoring titles.
seed_title = 'Lord of the Rings Trilogy'
min_rating = 9   # keep only books predicted at or above this value
top_n = 50       # number of recommendations to print

lotr = ratings_df[ratings_df.title == seed_title]
if lotr.empty:
    # Fail with a readable message instead of an IndexError from `unique()[0]`.
    raise ValueError(f'No ratings found for title {seed_title!r}')

prediction = m.predict_new_user_with_history(
    history={lotr.book_id.unique()[0]: 10},
    round=False,
    lam=1,
    lr=0.01,
    lr_factor=0.8,
    n_epochs=20,
    seed=seed,
    log_each=2,
)

# Build the book_id -> title lookup once instead of filtering the whole
# ratings frame for every candidate; the first title seen for an id wins,
# which matches the previous `.title.unique()[0]` lookup.
title_by_id = (
    ratings_df.drop_duplicates(subset='book_id')
    .set_index('book_id')['title']
    .to_dict()
)

# The position in `prediction` is used as the book_id. An id that has no row
# in ratings_df is given the placeholder title and is filtered out below.
prediction_title = []
for i, val in enumerate(prediction):
    if val < min_rating:
        continue
    prediction_title.append([title_by_id.get(i, 'Unknown Book'), val])

# Highest predicted rating first, so the slice below is really the top of the
# ranking (an ascending sort would print the weakest candidates instead).
prediction_title.sort(key=lambda x: x[1], reverse=True)

# Drop placeholder titles before slicing so they do not use up top_n slots.
known_titles = [pair for pair in prediction_title if pair[0] != 'Unknown Book']
for t, v in known_titles[:top_n]:
    print(f'{t:60} :: {v:.3f}')

Epoch   0, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 10.453 secs, elapsed 0.174 mins
Epoch   2, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.052 secs, elapsed 0.258 mins
Epoch   4, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.119 secs, elapsed 0.344 mins
Epoch   6, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.480 secs, elapsed 0.435 mins
Epoch   8, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.018 secs, elapsed 0.519 mins
Epoch  10, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.024 secs, elapsed 0.602 mins
Epoch  12, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.207 secs, elapsed 0.689 mins
Epoch  14, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.568 secs, elapsed 0.782 mins
Epoch  16, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.174 secs, elapsed 0.868 mins
Epoch  18, min MSE loss 7.902, curr MSE loss 7.902, last 2 epochs took 5.150 secs