# LightFM Model

In [2]:
# !pip install lightfm
import pandas as pd
import numpy as np
import scipy as sp

from lightfm.data import Dataset
from lightfm import LightFM
import time
from lightfm.evaluation import precision_at_k

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Setups
- Import processed dataset into this notebook
- Make a copy of the dataset to work on while keeping the original dataset separate.


In [4]:
from google.colab import drive

# Mount Google Drive so the processed parquet files are reachable from the Colab runtime.
drive.mount('/content/drive')

# Read the processed train / validation splits in one pass.
ratings_train, ratings_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'drive/MyDrive/Big Data - Final Project/train_data_small_processed.parquet',
        'drive/MyDrive/Big Data - Final Project/val_data_small_processed.parquet',
    )
)

# Work on copies so the frames loaded from disk stay untouched.
tiny_ratings_train, tiny_ratings_val = ratings_train.copy(), ratings_val.copy()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ValueError: ignored

## COO Matrix Conversion
1. Find the max in the column `user_id` and `recording_id`
2. Build one sparse matrix per split, sized by those maxima, with users as rows and recordings as columns
3. Use the `.tocoo()` method for a COOrdinate representation of the matrices

In [3]:
# convert data to coo matrix
#
# Ids are used directly as row / column indices, so each dimension must be
# (largest id seen in either split) + 1. Both matrices share the same shape so
# that the model trained on `train_data` can be evaluated on `val_data`.
train_user = tiny_ratings_train["user_id"].max()
val_user = tiny_ratings_val["user_id"].max()
train_music = tiny_ratings_train["recording_id"].max()
val_music = tiny_ratings_val["recording_id"].max()

interaction_shape = (
    int(max(train_user, val_user)) + 1,
    int(max(train_music, val_music)) + 1,
)


def _interactions_to_coo(ratings, shape):
    """Build a (users x recordings) COO matrix of `count` values in one shot.

    Replaces element-by-element assignment into a LIL matrix driven by
    `DataFrame.iterrows()`, which is very slow for millions of rows.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain the columns `user_id`, `recording_id` and `count`.
    shape : tuple of int
        (n_users, n_recordings) of the resulting matrix.

    Returns
    -------
    scipy.sparse.coo_matrix
        int32 matrix with `matrix[user_id, recording_id] == count`.
    """
    # coo_matrix sums duplicate (row, col) entries, whereas item assignment
    # lets the last occurrence win. Keep only the last row per pair so the
    # result matches the assignment semantics.
    pairs = ratings[["user_id", "recording_id", "count"]].drop_duplicates(
        subset=["user_id", "recording_id"], keep="last"
    )
    # Row-major ordering gives a canonical entry order that does not depend on
    # how the DataFrame happens to be sorted, keeping seeded training runs
    # reproducible.
    pairs = pairs.sort_values(["user_id", "recording_id"])

    rows = pairs["user_id"].to_numpy().astype(np.int64)
    cols = pairs["recording_id"].to_numpy().astype(np.int64)
    counts = pairs["count"].to_numpy().astype(np.int32)

    matrix = sp.sparse.coo_matrix((counts, (rows, cols)), shape=shape, dtype=np.int32)
    # Assigning 0 into a sparse matrix stores nothing; drop explicit zeros so
    # a zero count is not treated as an interaction.
    matrix.eliminate_zeros()
    return matrix


train_data = _interactions_to_coo(tiny_ratings_train, interaction_shape)
val_data = _interactions_to_coo(tiny_ratings_val, interaction_shape)

Unnamed: 0,user_id,recording_msid,count,new_msid
3440914,5864,00000334-40c9-4787-b80a-10fc73c82184,1,1
1931734,9943,000008cd-770c-47db-b254-2e85101724a8,1,2
7066589,10567,000008cd-770c-47db-b254-2e85101724a8,1,2
5471117,9307,00000a5e-9ba6-4639-8bf4-36e9cf95b0af,1,3
2180422,13041,00000dd4-4114-4b7c-90a3-512ae3916af4,1,4
...,...,...,...,...
12074629,11325,ffffe9bf-6044-489e-a9a1-9bf16b3a6fe2,10,3644058
1538562,20819,ffffebd9-ecf2-4858-a598-909cb4a82027,1,3644059
7850252,17813,fffff359-f170-46a2-8230-7365d1210d0b,2,3644060
14474663,9180,fffff596-c7b6-4138-8891-5da51bcbd558,1,3644061


## Hyperparameter Tuning
### 1. Learning Rate
- For default `no_components` of 10, tune learning rate for `warp` Loss

In [None]:
# Hyperparameter tuning: learning-rate sweep for the WARP loss.
# Each run records (learning rate, precision@100 on validation, wall-clock seconds
# for fit + evaluation).
sweep = []

for lr in [0.01, 0.02, 0.04, 0.06, 0.08, 0.1]:
    start = time.time()
    model = LightFM(loss='warp', learning_rate=lr, random_state=1111)
    model.fit(train_data, epochs=30, num_threads=7)
    test_precision = precision_at_k(model, val_data, k=100).mean()
    test_time = time.time() - start

    sweep.append((lr, test_precision, test_time))

# Unzip the per-run tuples into the three parallel lists that get reported.
lrs, precisions, time_spent = (list(column) for column in zip(*sweep))

for report in (lrs, precisions, time_spent):
    print(report)


- For default `no_components` of 10, tune learning rate for `bpr` Loss

In [None]:
# Learning-rate sweep for the BPR loss.
# Each run records (learning rate, precision@100 on validation, wall-clock seconds
# for fit + evaluation).
sweep = []

for lr in [0.01, 0.02, 0.04, 0.06, 0.08, 0.1]:
    start = time.time()
    model = LightFM(loss='bpr', learning_rate=lr, random_state=1111)
    model.fit(train_data, epochs=30, num_threads=7)
    test_precision = precision_at_k(model, val_data, k=100).mean()
    test_time = time.time() - start

    sweep.append((lr, test_precision, test_time))

# Unzip the per-run tuples into the three parallel lists that get reported.
lrs, precisions, time_spent = (list(column) for column in zip(*sweep))

for report in (lrs, precisions, time_spent):
    print(report)


### 2. `no_components`
- Since learning rate is optimized at 0.04 for `warp` Loss, tune `no_components` using learning rate = 0.04

In [None]:
# `no_components` sweep for the WARP loss at the tuned learning rate (0.04).
# Each run records (no_components, precision@100 on validation, wall-clock seconds
# for fit + evaluation).
sweep = []

for nc in [10, 20, 30, 40]:
    start = time.time()
    model = LightFM(loss='warp', learning_rate=0.04, no_components=nc, random_state=1111)
    model.fit(train_data, epochs=30, num_threads=7)
    test_precision = precision_at_k(model, val_data, k=100).mean()
    test_time = time.time() - start

    sweep.append((nc, test_precision, test_time))

# Unzip the per-run tuples into the three parallel lists that get reported.
ncs, precisions, time_spent = (list(column) for column in zip(*sweep))

for report in (ncs, precisions, time_spent):
    print(report)

- Since learning rate is optimized at 0.1 for `bpr` Loss, tune `no_components` using learning rate = 0.1

In [None]:
# `no_components` sweep for the BPR loss at learning rate 0.1.
# Each run records (no_components, precision@100 on validation, wall-clock seconds
# for fit + evaluation).
sweep = []

for nc in [10, 20, 30, 40]:
    start = time.time()
    model = LightFM(loss='bpr', learning_rate=0.1, no_components=nc, random_state=1111)
    model.fit(train_data, epochs=30, num_threads=7)
    test_precision = precision_at_k(model, val_data, k=100).mean()
    test_time = time.time() - start

    sweep.append((nc, test_precision, test_time))

# Unzip the per-run tuples into the three parallel lists that get reported.
ncs, precisions, time_spent = (list(column) for column in zip(*sweep))

for report in (ncs, precisions, time_spent):
    print(report)

## Final Model
- `warp` Loss
- `no_components` = 40
- `learning_rate` = 0.04

In [7]:
# Final Model: WARP loss, 40 latent components, learning rate 0.04.
# The timer covers both fitting and the precision@100 evaluation on validation.
start = time.time()

model = LightFM(
    loss='warp',
    no_components=40,
    learning_rate=0.04,
    random_state=1111,
)
model.fit(train_data, epochs=30, num_threads=7)
test_precision = precision_at_k(model, val_data, k=100).mean()

test_time = time.time() - start

# One value per line: validation precision first, then elapsed seconds.
print(test_precision, test_time, sep="\n")

Unnamed: 0,user_id,count,new_msid
0,1377,2,1052303
1,7010,2,10087536
2,2717,2,3832350
3,10890,3,3150685
4,5758,5,1645798
...,...,...,...
6899012,10636,1,9396561
6899013,1419,1,7462606
6899014,4650,1,10654730
6899015,1105,1,3544340
