In [1]:
import os
# Set both threading environment variables to '1' in this first cell, i.e.
# before the numerical libraries are imported in the next cell.
# NOTE(review): presumably this pins OpenBLAS / OpenMP to one thread each.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'OMP_NUM_THREADS': '1',
})

In [2]:
import os
import json
import joblib

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from lightfm_wrapper import LightFMWrapper



In [3]:
# Read the five pre-built tables from df_data/, in the same order as the
# names on the left-hand side.
(
    df_tracks,
    df_playlists,
    df_playlists_info,
    df_playlists_test,
    df_playlists_test_info,
) = map(pd.read_hdf, (
    'df_data/df_tracks.hdf',
    'df_data/df_playlists.hdf',
    'df_data/df_playlists_info.hdf',
    'df_data/df_playlists_test.hdf',
    'df_data/df_playlists_test_info.hdf',
))

In [4]:
# Interaction tables for the train / validation split, plus the list of
# validation playlist ids stored alongside them.
train, val = map(pd.read_hdf, ('df_data/train.hdf', 'df_data/val1.hdf'))
val1_pids = joblib.load('df_data/val1_pids.pkl')

In [5]:
# pid -> set of tids, built the same way for the training interactions
# (user_seen) and for the validation interactions (val_tracks).
user_seen, val_tracks = (
    interactions.groupby('pid')['tid'].apply(set).to_dict()
    for interactions in (train, val)
)

In [6]:
# Matrix dimensions are derived from the largest id present (+1, so an id can
# be used directly as a row/column index); model_path is where the best
# checkpoint is written.
config = dict(
    num_playlists=df_playlists_info.pid.max() + 1,
    num_tracks=df_tracks.tid.max() + 1,
    model_path='models/lightfm_model_text.pkl',
)

In [7]:
# Validation playlists that have no rows at all in `train`.
zeros_pids = np.array(
    list(set(val1_pids).difference(train.pid.unique()))
)

# At most 1000 of the remaining validation playlists.
# NOTE(review): which 1000 are kept depends on set iteration order; sort first
# if a reproducible subset is wanted.
no_zeros_pids = np.array(
    list(set(val1_pids).difference(zeros_pids))[:1000]
)

# Every playlist we will ask the model to score: first the ones without
# training rows, then the sampled rest.
target_pids = np.hstack([zeros_pids, no_zeros_pids])

# Playlist names from the test-set and training-set info tables, indexed by
# pid and merged into a single pid-sorted Series.
playlist_name1 = df_playlists_test_info.set_index('pid')['name']
playlist_name2 = df_playlists_info.set_index('pid')['name']
playlist_name = pd.concat([playlist_name1, playlist_name2]).sort_index()

In [8]:
# Align playlist names with matrix rows *by pid*.
# A positional reset_index(drop=True) would throw the pid labels away, so row
# k would only hold the name of pid k when the pids happened to be exactly
# 0..n-1 with no gaps and no repeats.  Keep one name per pid and let reindex
# do the lookup on the pid index instead; pids without a name get ''.
playlist_name = playlist_name.loc[~playlist_name.index.duplicated(keep='first')]
playlist_name = playlist_name.reindex(np.arange(config['num_playlists'])).fillna('')

# Token counts over the playlist names, vocabulary capped at 20000 terms.
vectorizer = CountVectorizer(max_features=20000)
name_features = vectorizer.fit_transform(playlist_name)

# Final user-feature matrix: an identity block (one indicator column per
# playlist row) followed by the name-token columns.
user_features = sp.hstack([sp.eye(config['num_playlists']), name_features])

In [9]:
# Playlist x track interaction matrix: a 1.0 at (pid, tid) for every row of
# `train`, sized from the configured playlist / track counts.
interaction_shape = (config['num_playlists'], config['num_tracks'])
interaction_values = np.ones(len(train))
X_train = sp.coo_matrix(
    (interaction_values, (train.pid, train.tid)),
    shape=interaction_shape,
)

In [10]:
# Hyperparameters for the wrapped LightFM model, collected in one place and
# passed through as keyword arguments.
lightfm_params = dict(
    no_components=200,
    loss='warp',
    learning_rate=0.03,
    max_sampled=400,
    random_state=1,
    user_alpha=1e-05,
)
model = LightFMWrapper(**lightfm_params)

In [11]:
def _mean_hit_rate(pids, recommendations):
    """Average, over `pids`, of the share of held-out tracks that were recommended.

    For each pid the ranked ids in ``recommendations[pid][0]`` are filtered to
    drop tracks already present in ``user_seen`` for that playlist, truncated
    to as many entries as the playlist has held-out tracks in ``val_tracks``,
    and scored as ``hits / number_of_held_out_tracks``.

    Parameters
    ----------
    pids : iterable of playlist ids; each must be a key of ``val_tracks``.
    recommendations : mapping pid -> sequence whose first element is the
        ranked track ids (the shape returned by ``model.batch_predict`` here).

    Returns
    -------
    The mean of the per-playlist scores (``np.mean`` of the list).
    """
    per_playlist = []
    for pid in pids:
        held_out = val_tracks[pid]
        # Look the training tracks up once per playlist, not once per candidate.
        already_seen = user_seen.get(pid, set())
        top_tracks = [
            tid for tid in recommendations[pid][0] if tid not in already_seen
        ][:len(held_out)]
        hits = sum(tid in held_out for tid in top_tracks)
        per_playlist.append(hits / len(held_out))
    return np.mean(per_playlist)


best_score = 0

# 10 rounds of 3 epochs each; after every round the model is evaluated on the
# target playlists and checkpointed when the first score improves.
for i in range(10):
    print("iteration:", i)
    model.fit_partial(X_train, epochs=3, num_threads=1, user_features=user_features)

    # Single chunk containing every track id, scored in one process.
    model.batch_setup(
        item_chunks={0: np.arange(config['num_tracks'])},
        n_process=1,
        item_features=None,
        user_features=user_features,
    )

    res = model.batch_predict(chunk_id=0, user_ids=target_pids, top_k=600)
    model.batch_cleanup()

    # score  -> playlists with no training rows (drives model selection)
    # score2 -> sampled playlists that do have training rows (reported only)
    score = _mean_hit_rate(zeros_pids, res)
    score2 = _mean_hit_rate(no_zeros_pids, res)

    print(score, score2)
    if score > best_score:
        # Context manager so the checkpoint file is flushed and closed even
        # if serialisation fails part-way.
        with open(config['model_path'], 'wb') as model_file:
            joblib.dump(model, model_file)
        best_score = score

iteration: 0
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 27.566s. 0.0276 s by user
0.015731011650883513 0.042915827128738204
iteration: 1
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 28.458s. 0.0285 s by user
0.012003157379362504 0.0527908607297672
iteration: 2
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 27.054s. 0.0271 s by user
0.009768487851248085 0.055516085132755244
iteration: 3
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 27.112s. 0.0271 s by user
0.008931517916406213 0.0557987105399528
iteration: 4
Batch predict: user_ids: 1,000, item_ids: 212,601
Start recommending: using single process
Recommendations for chunk 0 done in 27.461s. 0.0275 s by user
0.0081

In [12]:
# Persist the user-feature matrix next to the model; the context manager
# guarantees the file handle is closed (the bare open() call never was).
with open('models/user_features.pkl', 'wb') as features_file:
    joblib.dump(user_features, features_file)

In [13]:
# Reload the checkpoint written by the training loop (the best-scoring
# iteration, which is not necessarily the last one), closing the file
# handle once it has been read.
# NOTE(review): if no iteration ever beat the initial best_score of 0, this
# file is not written by this run — confirm it exists / is current.
with open(config['model_path'], 'rb') as model_file:
    model = joblib.load(model_file)