In [1]:
import os
# Restrict OpenBLAS and OpenMP to one thread each. This runs before the
# numerical libraries are imported in the following cell -- presumably so
# their thread pools pick the limits up when they load (TODO confirm).
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'OMP_NUM_THREADS': '1',
})

In [2]:
import joblib

import numpy as np
import pandas as pd
import scipy.sparse as sp

from lightfm_wrapper import LightFMWrapper



In [3]:
# Create the output directory for model checkpoints. exist_ok=True keeps the
# cell idempotent: re-running the notebook no longer fails with
# "mkdir: models: File exists", and no shell is required.
os.makedirs('models', exist_ok=True)

mkdir: models: File exists


In [4]:
# Read the track and playlist tables (train and test variants) from their
# HDF5 files. Every table is stored under the key 'abc'.
df_tracks = pd.read_hdf(path_or_buf='df_data/df_tracks.hdf', key='abc')
df_playlists = pd.read_hdf(path_or_buf='df_data/df_playlists.hdf', key='abc')
df_playlists_info = pd.read_hdf(path_or_buf='df_data/df_playlists_info.hdf', key='abc')
df_playlists_test = pd.read_hdf(path_or_buf='df_data/df_playlists_test.hdf', key='abc')
df_playlists_test_info = pd.read_hdf(path_or_buf='df_data/df_playlists_test_info.hdf', key='abc')

In [5]:
# Train / validation interaction frames. No key is passed, so each file is
# expected to hold a single dataset.
train = pd.read_hdf(path_or_buf='df_data/train.hdf')
val = pd.read_hdf(path_or_buf='df_data/val1.hdf')

# Playlist ids to evaluate on.
# NOTE(review): joblib.load unpickles the file -- only load artifacts that
# were produced by this project.
val1_pids = joblib.load(filename='df_data/val1_pids.pkl')

In [6]:
def _tracks_by_playlist(interactions):
    """Return a dict mapping each `pid` to the set of `tid` values in its rows."""
    per_playlist = interactions.groupby('pid')['tid'].apply(set)
    return per_playlist.to_dict()


# Tracks each playlist has in the training split (excluded from
# recommendations during evaluation) and in the validation split (the targets).
user_seen = _tracks_by_playlist(train)
val_tracks = _tracks_by_playlist(val)

In [7]:
# Matrix dimensions are taken as (largest id + 1) so that every pid / tid is a
# valid row / column index. "num_playlists" used to be sized from the test
# playlists, which made building the sparse matrix fail; it is now derived
# from df_playlists_info instead.
config = dict(
    num_playlists=df_playlists_info.pid.max() + 1,
    num_tracks=df_tracks.tid.max() + 1,
    model_path='models/lightfm_model.pkl',
)


In [8]:
# Sparse playlist-by-track interaction matrix in COO format: one entry of
# value 1.0 per row of `train`, placed at (pid, tid).
n_interactions = train.shape[0]
X_train = sp.coo_matrix(
    (np.ones(n_interactions), (train['pid'], train['tid'])),
    shape=(config['num_playlists'], config['num_tracks']),
)

In [9]:
def _mean_hit_rate(recommendations, pids, held_out, seen):
    """Average, over `pids`, the share of held-out tracks that were recommended.

    For every playlist id the candidates in ``recommendations[pid][0]`` are
    filtered to drop tracks listed in ``seen[pid]``, truncated to as many
    items as the playlist has held-out tracks, and compared with
    ``held_out[pid]``.

    Parameters
    ----------
    recommendations : mapping
        Indexed as ``recommendations[pid][0]`` to obtain an iterable of track
        ids (presumably ordered best-first -- verify against LightFMWrapper).
    pids : iterable
        Playlist ids to score; each must be a key of `held_out`.
    held_out : dict
        pid -> set of target track ids.
    seen : dict
        pid -> set of track ids to exclude; missing pids exclude nothing.

    Returns
    -------
    The mean of the per-playlist hit fractions (via ``np.mean``).
    """
    rates = []
    for pid in pids:
        targets = held_out[pid]
        # Looked up once per playlist instead of once per candidate track.
        already_seen = seen.get(pid, set())
        unseen = [tid for tid in recommendations[pid][0] if tid not in already_seen]
        top = unseen[:len(targets)]
        hits = sum(tid in targets for tid in top)
        rates.append(hits / len(targets))
    return np.mean(rates)


model = LightFMWrapper(
    no_components=200,
    loss='warp',
    learning_rate=0.02,
    max_sampled=400,
    random_state=1,
    user_alpha=1e-05,
)

# Train in 10 rounds of 3 epochs. After each round, score the model on the
# validation playlists and checkpoint it whenever the score improves.
best_score = 0
for i in range(10):
    print(f'iteration: {i}')

    model.fit_partial(X_train, epochs=3, num_threads=8)

    # A single item chunk (id 0) covering every track index, scored in-process.
    model.batch_setup(
        item_chunks={0: np.arange(config['num_tracks'])},
        item_features=None,
        user_features=None,
        n_process=1,
    )
    res = model.batch_predict(chunk_id=0, user_ids=val1_pids, top_k=600)
    model.batch_cleanup()

    score = _mean_hit_rate(res, val1_pids, val_tracks, user_seen)
    print(score)
    if score > best_score:
        # Use a context manager so the checkpoint file is flushed and closed
        # immediately rather than whenever the handle is garbage-collected.
        with open(config['model_path'], 'wb') as model_file:
            joblib.dump(model, model_file)
        best_score = score

iteration: 0
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 28.995s. 0.0290 s by user
0.045738770019544066
iteration: 1
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 26.449s. 0.0264 s by user
0.05383870654579224
iteration: 2
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 26.237s. 0.0262 s by user
0.058136829908612145
iteration: 3
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 26.451s. 0.0265 s by user
0.05900736569409577
iteration: 4
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 26.223s. 0.0262 s by user
0.06008706660729112
iteration: 5
Batch predict: user_ids: 1,000, item_ids: 212,601
Start