In [1]:
import pickle

import pandas as pd
import scipy.sparse
from lightfm import LightFM
from lightfm.data import Dataset

In [2]:
# Thread-count setting for this notebook; it matches the `num_threads` value
# passed to `model.fit_partial` in the training cell.
THREADS = 8

## Preparing dataset

In [3]:
# Train / test tables (parquet) and the per-item category metadata (JSON lines).
train_path = '/pio/scratch/1/recommender_systems/processed/amazon-clothes/5-core/train.parquet'
test_path = '/pio/scratch/1/recommender_systems/processed/amazon-clothes/5-core/test.parquet'
categories_path = '/pio/scratch/1/recommender_systems/interim/Amazon/meta_Clothing_Shoes_and_Jewelry_categories.json'

df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)
# `lines=True`: the file holds one JSON object per line.
categories = pd.read_json(categories_path, lines=True)

In [4]:
# Previously saved LightFM inputs: sparse train/test interaction matrices and
# the pickled `dataset` object whose mappings are used below.
interactions_path = '/pio/scratch/1/i313924/data/lightfm_data/5_core_interactions.npz'
test_interactions_path = '/pio/scratch/1/i313924/data/lightfm_data/5_core_test_interactions.npz'
dataset_path = '/pio/scratch/1/i313924/data/lightfm_data/5_core_dataset.pkl'

interactions = scipy.sparse.load_npz(interactions_path)
test_interactions = scipy.sparse.load_npz(test_interactions_path)
# NOTE: unpickling executes arbitrary code -- only load pickles from a trusted source.
dataset = pd.read_pickle(dataset_path)

In [5]:
# Keep only the metadata rows whose `asin` occurs in the training frame.
items_list = df['asin'].unique()
in_train = categories['asin'].isin(items_list)
categories = categories.loc[in_train]
# Distinct `category_1` values among the rows that were kept.
categories_names = categories['category_1'].unique()

In [6]:
# Register every distinct category name as an item feature of the dataset,
# then persist the extended dataset object.
dataset.fit_partial(item_features=iter(categories_names))

features_dataset_path = '/pio/scratch/1/i313924/data/lightfm_data/5_core_dataset_features.pkl'
with open(features_dataset_path, 'wb') as out_file:
    # HIGHEST_PROTOCOL is what a negative protocol number selects.
    pickle.dump(dataset, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Build the item-feature matrix: every item listed in `categories` is given two
# features, its own ASIN and its `category_1` value.
# The two columns are iterated with zip() instead of DataFrame.iterrows(), which
# constructs a Series object for every row and is very slow on large frames;
# the (item, [features]) tuples produced are the same.
# NOTE(review): if `dataset` was created with identity features enabled, the
# explicit ASIN feature duplicates the identity feature -- confirm this is intended.
item_features = dataset.build_item_features(
    (asin, [asin, category])
    for asin, category in zip(categories['asin'], categories['category_1'])
)

# Persist the matrix; HIGHEST_PROTOCOL is what the previous `-1` selected.
with open('/pio/scratch/1/i313924/data/lightfm_data/5_core_features.pkl', 'wb') as out_file:
    pickle.dump(item_features, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Last 20 keys of the mapping at index 2 of `dataset.mapping()`
# (iterating a dict yields its keys, in insertion order).
list(dataset.mapping()[2])[-20:]

['B01FUYP6TA',
 'B00FS2ZENI',
 'B01DVMAS6E',
 'B00GXNIIVQ',
 'B00K808WUC',
 'B00JRNEA48',
 'B00ML2YGNM',
 'B00JRN547A',
 'B00M55A8LE',
 'B01H7ENM5Q',
 'B01ADR9G68',
 'B01HEZ41NK',
 'B01C6M71NS',
 'B01CC5KHWG',
 'B01CR9ZDIU',
 'B01HFBZKQK',
 'B01HF4AOO0',
 'B01HF4AG3O',
 'B01HF4AG0W',
 'B01HF4AE7C']

In [21]:
# Last 20 keys of the mapping at index 3 of `dataset.mapping()`
# (iterating a dict yields its keys, in insertion order).
list(dataset.mapping()[3])[-20:]

['B01HEZ41NK',
 'B01C6M71NS',
 'B01CC5KHWG',
 'B01CR9ZDIU',
 'B01HFBZKQK',
 'B01HF4AOO0',
 'B01HF4AG3O',
 'B01HF4AG0W',
 'B01HF4AE7C',
 'Women',
 'Luggage & Travel Gear',
 'Men',
 'Boys',
 'Costumes & Accessories',
 'Novelty & More',
 'Girls',
 'Baby',
 'Shoe, Jewelry & Watch Accessories',
 'Uniforms, Work & Safety',
 'Traditional & Cultural Wear']

## Model training

In [9]:
# WARP-loss LightFM model with the hyperparameters used for this experiment.
# `random_state` is fixed so that parameter initialisation and sampling start
# from a known seed on every "Restart & Run All".
# NOTE(review): multi-threaded training may still not be bit-for-bit
# reproducible -- confirm against the LightFM documentation if exact
# repeatability is required.
model = LightFM(
    no_components=240,
    loss='warp',
    learning_schedule='adadelta',
    epsilon=2.45e-07,
    rho=0.958,
    item_alpha=5.97e-05,
    user_alpha=2.06e-6,
    max_sampled=9,
    random_state=42,
)

my new implementation


In [11]:
# Bare expression: shows the model object's repr as the cell output.
model

<lightfm.lightfm.LightFM at 0x7fbb154afc50>

In [12]:
# Bare expression: shows the item-feature matrix's repr as the cell output.
item_features

<242400x242411 sparse matrix of type '<class 'numpy.float32'>'
	with 484798 stored elements in Compressed Sparse Row format>

In [None]:
# Cumulative epoch counts at which a checkpoint of the model is written.
num_epochs = [5, 25, 50, 250, 500, 1000]
# Extra epochs needed to get from one checkpoint to the next: the first entry is
# the initial run, the rest are differences between consecutive checkpoints.
remaining = [num_epochs[0]] + [curr - prev for prev, curr in zip(num_epochs, num_epochs[1:])]

for epochs, extra_epochs in zip(num_epochs, remaining):
    # Train only the additional epochs; `epochs` is the running total that is
    # recorded in the checkpoint file name.
    model.fit_partial(
        interactions,
        item_features=item_features,
        verbose=True,
        epochs=extra_epochs,
        num_threads=THREADS,  # notebook-level constant (8) instead of a repeated literal
    )
    checkpoint_path = f'/pio/scratch/1/i313924/data/lightfm_data/model_{epochs}_epochs_optimal_hyperparams_categories.pkl'
    # Context manager guarantees the checkpoint file is flushed and closed
    # (the handle was previously opened inline and never closed).
    with open(checkpoint_path, 'wb') as checkpoint_file:
        pickle.dump(model, checkpoint_file, protocol=4)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]