In [17]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

In [None]:
import os
import json
import joblib

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder

from lightfm import LightFM



In [None]:
### FILL IN MISSING CODE FROM https://github.com/dmitryhd/lightfm

CYTHON_DTYPE = np.float32

class LightFMWrapper(LightFM):
    def __init__(self, model):
        self.model = model

    @staticmethod
    def _to_cython_dtype(mat):
        if mat.dtype != CYTHON_DTYPE:
            return mat.astype(CYTHON_DTYPE)
        else:
            return mat

    def _construct_item_features(self, n_items: int, item_features) -> sp.csr_matrix:
        # TODO: mb. merge with user features
        if item_features is None:
            item_features = sp.identity(n_items, dtype=CYTHON_DTYPE, format='csr')
        else:
            item_features = item_features.tocsr()

        if n_items > item_features.shape[0]:
            raise Exception('Number of item feature rows does not equal the number of items')

        if self.item_embeddings is not None:
            if not self.item_embeddings.shape[0] >= item_features.shape[1]:
                raise ValueError(
                    'The item feature matrix specifies more features than there are estimated '
                    'feature embeddings: {} vs {}.'.format(self.item_embeddings.shape[0], item_features.shape[1])
                )

        item_features = self._to_cython_dtype(item_features)
        return item_features
    
def _batch_setup(model: LightFMWrapper, item_chunks, item_features, user_features, n_process: int=1):

    # global _item_repr, _user_repr
    # global _item_repr_biases, _user_repr_biases
    # global _pool
    # global _item_chunks

    if item_features is None:
        n_items = len(model.item_biases)
        item_features = sp.identity(n_items, dtype=CYTHON_DTYPE, format='csr')

    if user_features is None:
        n_users = len(model.user_biases)
        user_features = sp.identity(n_users, dtype=CYTHON_DTYPE, format='csr')

    n_users = user_features.shape[0]
    user_features = model._construct_user_features(n_users, user_features)
    _user_repr, _user_repr_biases = _precompute_representation(
        features=user_features,
        feature_embeddings=model.user_embeddings,
        feature_biases=model.user_biases,
    )

    n_items = item_features.shape[0]
    item_features = model._construct_item_features(n_items, item_features)
    _item_repr, _item_repr_biases = _precompute_representation(
        features=item_features,
        feature_embeddings=model.item_embeddings,
        feature_biases=model.item_biases,
    )
    _item_repr = _item_repr.T
    _item_chunks = item_chunks
    # _clean_pool()
    # # Pool creation should go last
    # if n_process > 1:
    #     _pool = mp.Pool(processes=n_process)

def _precompute_representation(features, feature_embeddings, feature_biases):
    representation = features.dot(feature_embeddings)
    representation_bias = features.dot(feature_biases)
    return representation, representation_bias

In [19]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [9]:
# Load all datasets using the function
df_tracks = pd.read_hdf('df_data/df_tracks.hdf', 'abc')
df_playlists = pd.read_hdf('df_data/df_playlists.hdf', 'abc')
df_playlists_info = pd.read_hdf('df_data/df_playlists_info.hdf', 'abc')
df_playlists_test = pd.read_hdf('df_data/df_playlists_test.hdf', 'abc')
df_playlists_test_info = pd.read_hdf('df_data/df_playlists_test_info.hdf', 'abc')

# display(df_tracks)
display(df_playlists_info)

Unnamed: 0,collaborative,duration_ms,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks,pid
0,False,4007017,1470355200,Funk,15,11,4,1,16,10000
1,False,5823244,1412985600,Childhood,19,14,3,1,28,10001
2,False,4891462,1421971200,Old Country,19,12,3,1,23,10002
3,False,3498401,1492646400,april showers,14,14,6,1,14,10003
4,False,8765977,1496534400,DnB,21,14,6,1,31,10004
...,...,...,...,...,...,...,...,...,...,...
1995,False,9917901,1507852800,old,40,36,8,1,41,995
1996,False,3699248,1479254400,Daze,17,15,13,1,17,996
1997,False,27538723,1410307200,rap,98,82,63,1,119,997
1998,False,24950143,1507939200,Country,75,40,37,1,108,998


In [None]:
train = pd.read_hdf('df_data/train.hdf')
val = pd.read_hdf('df_data/val1.hdf')
val1_pids = joblib.load('df_data/val1_pids.pkl')

In [6]:
user_seen = train.groupby('pid').tid.apply(set).to_dict()
val_tracks = val.groupby('pid').tid.apply(set).to_dict()

In [7]:
config = {
    'num_playlists': df_playlists_test_info.pid.max() + 1,
    'num_tracks': df_tracks.tid.max() + 1,
}

In [8]:
X_train = sp.coo_matrix(
    (np.ones(len(train)), (train.pid, train.tid)),
    shape=(config['num_playlists'], config['num_tracks'])
)

In [16]:
config['model_path'] = 'models/lightfm_model.pkl'

print(val)

Empty DataFrame
Columns: [Unnamed: 0.1, Unnamed: 0, pid, tid, pos]
Index: []


In [12]:
model = LightFM(no_components=200, loss='warp', learning_rate=0.02, max_sampled=400, random_state=1, user_alpha=1e-05)

best_score = 0
for i in range(60):
    
    # model.fit_partial(X_train, epochs=5, num_threads=8)

    # model.batch_setup(
    #     item_chunks={0: np.arange(config['num_tracks'])},
    #     n_process=50, 
    # )
    # res = model.batch_predict(chunk_id=0, user_ids=val1_pids, top_k=600)
    # model.batch_cleanup()

    model.fit(X_train, epochs=5, num_threads=6)
    res = model.predict(user_ids=val1_pids, item_ids=np.arange(config['num_tracks']))
    
    score = []
    for pid in val1_pids:
        tracks_t = val_tracks[pid]
        tracks = [i for i in res[pid][0] if i not in user_seen.get(pid, set())][:len(tracks_t)]
        guess = np.sum([i in tracks_t for i in tracks])
        score.append(guess / len(tracks_t))
    
    score = np.mean(score)
    print(score)
    if score > best_score:
        joblib.dump(model, open(config['model_path'], 'wb'))
        best_score = score

ValueError: Expected the number of user IDs (0) to equal the number of item IDs (57884)