In [1]:
import os

# Cap OpenBLAS and OpenMP at a single thread each. This runs in the first
# cell, ahead of the numpy / lightfm imports, presumably so the native
# libraries pick the limits up when they are loaded.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'OMP_NUM_THREADS': '1',
})

In [16]:
import os
import json
import joblib

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder
from lightfm import LightFM

In [24]:
import lightfm
# Display the installed lightfm version so the run can be reproduced.
# NOTE(review): later cells call batch_setup / batch_predict / batch_cleanup
# on the loaded models -- confirm the installed build provides that API.
lightfm.__version__

'1.13.20'

In [5]:
# Load the two fitted LightFM models and the user-feature object that is
# handed to `model_text` at prediction time.
# The path is passed straight to joblib.load (as the val*_pids cells already
# do) so joblib opens *and closes* the file itself; wrapping the path in a
# bare open(...) left the file handles unclosed.
# NOTE: joblib.load unpickles arbitrary objects -- only load artefacts that
# come from a trusted source.
model = joblib.load('models/lightfm_model.pkl')
model_text = joblib.load('models/lightfm_model_text.pkl')
user_features = joblib.load('models/user_features.pkl')

In [6]:
# Load the track / playlist tables from HDF.
# In the cells visible here only two of them are read afterwards:
#   * df_tracks              -> its `tid` column bounds the item id range
#   * df_playlists_test_info -> `pid` and `num_holdouts` drive the test run
# df_playlists, df_playlists_info and df_playlists_test are not referenced
# again in the cells shown.
df_tracks = pd.read_hdf('df_data/df_tracks.hdf')
df_playlists = pd.read_hdf('df_data/df_playlists.hdf')
df_playlists_info = pd.read_hdf('df_data/df_playlists_info.hdf')
df_playlists_test = pd.read_hdf('df_data/df_playlists_test.hdf')
df_playlists_test_info = pd.read_hdf('df_data/df_playlists_test_info.hdf')

In [7]:
# Training interactions; later cells read its `pid` and `tid` columns.
train = pd.read_hdf('df_data/train.hdf')

In [8]:
# First validation split: the (pid, tid) rows used as ground truth, plus the
# list of playlist ids to generate candidates for.
val1 = pd.read_hdf('df_data/val1.hdf')
val1_pids = joblib.load('df_data/val1_pids.pkl')

In [9]:
# Second validation split: the (pid, tid) rows used as ground truth, plus the
# list of playlist ids to generate candidates for.
val2 = pd.read_hdf('df_data/val2.hdf')
val2_pids = joblib.load('df_data/val2_pids.pkl')

In [10]:
# Every (playlist id, track id) pair present in the training data; used to
# drop already-known tracks from the generated candidates.
user_seen = {
    (seen_pid, seen_tid)
    for seen_pid, seen_tid in zip(train.pid, train.tid)
}

In [11]:
def _batch_top_k(lfm_model, user_ids, item_ids, top_k, n_process, **setup_kwargs):
    """Run one batch-prediction pass over ``item_ids`` and return its result.

    ``batch_cleanup`` is invoked even if ``batch_predict`` raises, so whatever
    ``batch_setup`` allocated (presumably worker processes, given
    ``n_process``) is always released.
    """
    lfm_model.batch_setup(
        item_chunks={0: item_ids},
        n_process=n_process,
        **setup_kwargs
    )
    try:
        return lfm_model.batch_predict(chunk_id=0, user_ids=user_ids, top_k=top_k)
    finally:
        lfm_model.batch_cleanup()


def save_candidates(target_pids, df_size, file_name, df=None, top_k=10000, n_process=50):
    """Generate top-ranked candidate tracks per playlist and write them to HDF.

    Reads the notebook-level names ``model``, ``model_text``,
    ``user_features``, ``train``, ``df_tracks`` and ``user_seen``.

    Playlists that occur in ``train.pid`` are scored with ``model``; all
    remaining target playlists are scored with ``model_text`` together with
    ``user_features``.

    Parameters
    ----------
    target_pids : iterable of playlist ids to produce candidates for.
    df_size : mapping or Series indexed by pid giving the per-playlist size
        used to decide how many candidates to keep.
    file_name : path of the HDF file to write (stored under key ``'abc'``).
    df : optional DataFrame with ``pid`` and ``tid`` columns. When given, a
        0/1 ``target`` column marks candidates that appear in ``df`` for the
        same playlist.
    top_k : number of ranked items requested from each model per playlist.
    n_process : value forwarded to ``batch_setup``.

    The written frame has columns ``pid``, ``tid`` (and ``target`` when ``df``
    is given); pairs already contained in ``user_seen`` are removed.
    """
    # Split targets by whether the playlist occurs in the training data.
    cold_pids = list(set(target_pids).difference(train.pid))
    warm_pids = list(set(target_pids).difference(cold_pids))

    all_items = np.arange(df_tracks.tid.max() + 1)

    res = _batch_top_k(model, warm_pids, all_items, top_k, n_process)
    res.update(
        _batch_top_k(
            model_text, cold_pids, all_items, top_k, n_process,
            user_features=user_features,
        )
    )

    val_tracks = None
    if df is not None:
        val_tracks = df.groupby('pid').tid.apply(set).to_dict()

    pids = []
    tids = []
    targets = []

    for pid in target_pids:
        size = df_size[pid]
        # Candidate budget: 15x the playlist size, but never fewer than
        # 700 + size (heuristic kept from the original notebook).
        n_wanted = int(max(size * 15, 700 + size))
        top_tids = list(res[pid][0][:n_wanted])

        # The model may return fewer than n_wanted items (n_wanted > top_k or
        # a small catalogue). Repeat the pid once per item actually returned
        # so the pid and tid columns always stay aligned.
        pids.extend([pid] * len(top_tids))
        tids.extend(top_tids)

        if val_tracks is not None:
            relevant = val_tracks[pid]
            targets.extend(tid in relevant for tid in top_tids)

    candidates = pd.DataFrame()
    candidates['pid'] = np.array(pids)
    candidates['tid'] = np.array(tids)

    if df is not None:
        candidates['target'] = np.array(targets).astype(int)

    # Boolean row mask; an explicit bool dtype keeps this a row filter even
    # when there are no candidates at all.
    unseen = np.array(
        [(pid, tid) not in user_seen for pid, tid in zip(pids, tids)],
        dtype=bool,
    )
    candidates = candidates[unseen]

    candidates.to_hdf(file_name, key='abc')

In [13]:
# Candidates for the first validation split, labelled against val1.
save_candidates(
    target_pids=val1_pids,
    df_size=val1.pid.value_counts(),
    file_name='df_data/ii_candidate.hdf',
    df=val1,
)

In [14]:
# Candidates for the second validation split, labelled against val2.
save_candidates(
    target_pids=val2_pids,
    df_size=val2.pid.value_counts(),
    file_name='df_data/iii_candidate.hdf',
    df=val2,
)

In [15]:
# Candidates for the test playlists; no ground truth is passed, so the
# output has no `target` column. Sizes come from `num_holdouts`.
save_candidates(
    target_pids=df_playlists_test_info.pid.values,
    df_size=df_playlists_test_info.set_index('pid').num_holdouts,
    file_name='df_data/test_candidate.hdf',
)