In [1]:
import warnings
# Silence every warning for the rest of the notebook session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import os
# Set in the process environment before the modelling libraries are imported.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# crash (OMP Error #15) — confirm it is still required on the target machine.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [45]:
from lightfm import LightFM
from rectools.metrics import MAP
from rectools.models import LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender
from rectools import Columns
from rectools.dataset import Dataset
from pathlib import Path
import pandas as pd
import optuna
from tqdm.auto import tqdm
import joblib
import nmslib

## Loading data

In [4]:
# Directory holding the prepared CSV inputs, relative to the notebook's working
# directory. The recommendation CSVs produced at the end of the notebook are
# written here as well.
DATA_PATH = Path("../../data/03_primary")

In [5]:
def _read_prepared(filename):
    '''Read one prepared CSV file located in DATA_PATH into a DataFrame.'''
    return pd.read_csv(DATA_PATH / filename)


# User / item features for the train split
users_features_train = _read_prepared('prepared_featured_users_train.csv')
items_features_train = _read_prepared('prepared_featured_items_train.csv')

# User / item features for the full data
users_features_full = _read_prepared('prepared_featured_users_full.csv')
items_features_full = _read_prepared('prepared_featured_items_full.csv')

# Interactions: train / test split
interactions_train = _read_prepared('prepared_interactions_train.csv')
interactions_test = _read_prepared('prepared_interactions_test.csv')

# Interactions: full data
interactions_full = _read_prepared('prepared_interactions_full.csv')


In [6]:
# Columns treated as categorical when features are attached to the dataset
train_user_cat_features = ["sex", "age", "income"]
train_item_cat_features = ["genre", "content_type","age_rating","country","release_year_bin"]

# Train split, interactions only
train_dataset = Dataset.construct(interactions_df=interactions_train)

# Train split, interactions plus user and item features
train_featured_dataset = Dataset.construct(
    interactions_df=interactions_train,
    user_features_df=users_features_train,
    cat_user_features=train_user_cat_features,
    item_features_df=items_features_train,
    cat_item_features=train_item_cat_features,
)

In [7]:
# Full data, interactions only
full_dataset = Dataset.construct(interactions_df=interactions_full)

# Feature-related arguments for the featured variant of the full dataset
full_feature_kwargs = {
    "user_features_df": users_features_full,
    "cat_user_features": ["sex", "age", "income"],
    "item_features_df": items_features_full,
    "cat_item_features": ["genre", "content_type","age_rating","country","release_year_bin"],
}

# Full data, interactions plus user and item features
full_featured_dataset = Dataset.construct(
    interactions_df=interactions_full,
    **full_feature_kwargs,
)

## Hyperparameter selection via Optuna

In [8]:
K_RECOS = 10  # recommendations requested per user; also the k of the MAP metric
RANDOM_STATE = 42  # random_state passed to every LightFM model below
# NOTE(review): NUM_THREADS is not referenced anywhere else in the visible
# notebook — the LightFM wrappers below are created without a thread setting.
NUM_THREADS = 4
TOTAL_ITERATIONS = 50  # number of Optuna trials per study

In [9]:
class TqdmCallback:
    '''Optuna callback that advances a tqdm progress bar by one per finished trial.

    Pass an instance in the ``callbacks`` list of ``study.optimize`` and call
    ``close()`` once the study is done so the bar is finalised.
    '''
    def __init__(self, total):
        # total: number of trials the bar should expect
        self.pbar = tqdm(total = total)

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        # Invoked by Optuna after each trial; the arguments are not inspected.
        self.pbar.update(1)

    def close(self) -> None:
        '''Close the underlying progress bar.'''
        self.pbar.close()

In [15]:
# Metric: MAP at K_RECOS, returned by the Optuna objectives as the value to maximize
metric_map = MAP(k = K_RECOS)

### Without features

In [18]:
def objective(trial):
    '''Optuna objective for LightFM trained on interactions only.

    Samples hyperparameters from ``trial``, fits the model on ``train_dataset``,
    recommends K_RECOS items to every user found in ``interactions_test`` and
    returns the MAP of those recommendations against ``interactions_test``.
    '''
    # Search space (log-scaled); keys are the LightFM keyword-argument names.
    # NOTE(review): according to the LightFM docs ``learning_rate`` belongs to
    # the adagrad schedule — with 'adadelta' it may have no effect; confirm.
    sampled = {
        'no_components': trial.suggest_int('no_components', 2, 32, log = True),
        'rho': trial.suggest_float('rho', 0.9, 0.99, log = True),
        'epsilon': trial.suggest_float('epsilon', 1e-6, 1e-5, log = True),
        'learning_rate': trial.suggest_float('learning_rate', 3e-4, 0.3, log = True),
    }

    base_model = LightFM(
        learning_schedule = 'adadelta',
        loss = 'warp',
        random_state = RANDOM_STATE,
        **sampled,
    )
    wrapper = LightFMWrapperModel(model = base_model)
    wrapper.fit(train_dataset)

    # Recommend for the users that appear in the test interactions
    test_users = interactions_test[Columns.User].unique()
    recos = wrapper.recommend(
        users=test_users,
        dataset=train_dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # Score the recommendations against the held-out interactions
    return metric_map.calc(recos, interactions_test)


In [19]:
tqdm_callback = TqdmCallback(TOTAL_ITERATIONS)

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run of the notebook; an unseeded sampler makes the search unrepeatable.
study = optuna.create_study(
    directions = ['maximize'],
    sampler = optuna.samplers.TPESampler(seed = RANDOM_STATE),
)
try:
    study.optimize(objective, n_trials=TOTAL_ITERATIONS, callbacks=[tqdm_callback])
finally:
    # Finalise the progress bar even if a trial raises
    tqdm_callback.pbar.close()

  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2022-12-05 15:03:57,390][0m A new study created in memory with name: no-name-7329220b-f908-46af-8c1b-89cdbe00bb53[0m
[32m[I 2022-12-05 15:04:44,647][0m Trial 0 finished with value: 0.07839290035485222 and parameters: {'no_components': 7, 'rho': 0.9379805440259346, 'epsilon': 4.422302221887881e-06, 'learning_rate': 0.003975823573135649}. Best is trial 0 with value: 0.07839290035485222.[0m
[32m[I 2022-12-05 15:05:45,272][0m Trial 1 finished with value: 0.08010428629741832 and parameters: {'no_components': 17, 'rho': 0.9765555641055071, 'epsilon': 1.3710247867173495e-06, 'learning_rate': 0.20811124572085096}. Best is trial 1 with value: 0.08010428629741832.[0m
[32m[I 2022-12-05 15:06:30,643][0m Trial 2 finished with value: 0.07234345811073446 and parameters: {'no_components': 3, 'rho': 0.9225624638838649, 'epsilon': 2.3294257775979056e-06, 'learning_rate': 0.02941526052704058}. Best is trial 1 with value: 0.08010428629741832.[0m
[32m[I 2022-12-05 15:07:30,948][0m Tria

[32m[I 2022-12-05 15:31:43,861][0m Trial 29 finished with value: 0.07756124858492278 and parameters: {'no_components': 7, 'rho': 0.9398896165635315, 'epsilon': 3.623296985031526e-06, 'learning_rate': 0.0005471170561303716}. Best is trial 12 with value: 0.08173593202342642.[0m
[32m[I 2022-12-05 15:32:33,120][0m Trial 30 finished with value: 0.07666886668758034 and parameters: {'no_components': 6, 'rho': 0.9558501557237976, 'epsilon': 1.150709423135656e-06, 'learning_rate': 0.001256578119718818}. Best is trial 12 with value: 0.08173593202342642.[0m
[32m[I 2022-12-05 15:33:29,604][0m Trial 31 finished with value: 0.07708566804588296 and parameters: {'no_components': 15, 'rho': 0.9360805658053665, 'epsilon': 3.0279264946911103e-06, 'learning_rate': 0.001172720704203659}. Best is trial 12 with value: 0.08173593202342642.[0m
[32m[I 2022-12-05 15:34:25,209][0m Trial 32 finished with value: 0.07778283091664721 and parameters: {'no_components': 9, 'rho': 0.9157336437325329, 'epsilon'

In [20]:
# Keep the winning hyperparameters of the interactions-only study under their
# own name: ``study`` is reassigned by the featured study further down.
best_params_without_features = study.best_params
print(best_params_without_features)

{'no_components': 14, 'rho': 0.9293803074906476, 'epsilon': 1.019065155861173e-06, 'learning_rate': 0.0012055115615607931}


### With features

In [23]:
def objective(trial):
    '''Optuna objective for LightFM trained on interactions plus features.

    Samples hyperparameters from ``trial``, fits the model on
    ``train_featured_dataset``, recommends K_RECOS items to every user found in
    ``interactions_test`` and returns the MAP of those recommendations against
    ``interactions_test``.
    '''
    # Search space (log-scaled); keys are the LightFM keyword-argument names.
    # NOTE(review): according to the LightFM docs ``learning_rate`` belongs to
    # the adagrad schedule — with 'adadelta' it may have no effect; confirm.
    sampled = {
        'no_components': trial.suggest_int('no_components', 2, 32, log = True),
        'rho': trial.suggest_float('rho', 0.9, 0.99, log = True),
        'epsilon': trial.suggest_float('epsilon', 1e-6, 1e-5, log = True),
        'learning_rate': trial.suggest_float('learning_rate', 3e-4, 0.3, log = True),
    }

    base_model = LightFM(
        learning_schedule = 'adadelta',
        loss = 'warp',
        random_state = RANDOM_STATE,
        **sampled,
    )
    wrapper = LightFMWrapperModel(model = base_model)
    wrapper.fit(train_featured_dataset)

    # Recommend for the users that appear in the test interactions
    test_users = interactions_test[Columns.User].unique()
    recos = wrapper.recommend(
        users=test_users,
        dataset=train_featured_dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # Calculate MAP@10 against the held-out interactions
    return metric_map.calc(recos, interactions_test)


In [24]:
tqdm_callback = TqdmCallback(TOTAL_ITERATIONS)

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run of the notebook; an unseeded sampler makes the search unrepeatable.
study = optuna.create_study(
    directions = ['maximize'],
    sampler = optuna.samplers.TPESampler(seed = RANDOM_STATE),
)
try:
    study.optimize(objective, n_trials=TOTAL_ITERATIONS, callbacks=[tqdm_callback])
finally:
    # Finalise the progress bar even if a trial raises
    tqdm_callback.pbar.close()

  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2022-12-05 15:56:07,377][0m A new study created in memory with name: no-name-52c1fcdf-8cb8-4583-aca8-5db6ab268a06[0m
[32m[I 2022-12-05 15:58:40,583][0m Trial 0 finished with value: 0.051569127183431686 and parameters: {'no_components': 24, 'rho': 0.9395897571587353, 'epsilon': 1.854305887516801e-06, 'learning_rate': 0.02526148275821855}. Best is trial 0 with value: 0.051569127183431686.[0m
[32m[I 2022-12-05 15:59:35,600][0m Trial 1 finished with value: 0.03971930889730018 and parameters: {'no_components': 2, 'rho': 0.9881857147779372, 'epsilon': 6.952052472744175e-06, 'learning_rate': 0.23724982913765413}. Best is trial 0 with value: 0.051569127183431686.[0m
[32m[I 2022-12-05 16:01:20,790][0m Trial 2 finished with value: 0.07358321591450179 and parameters: {'no_components': 12, 'rho': 0.9619173207316346, 'epsilon': 2.4217252003747307e-06, 'learning_rate': 0.005564431073978549}. Best is trial 2 with value: 0.07358321591450179.[0m
[32m[I 2022-12-05 16:02:51,215][0m T

[32m[I 2022-12-05 16:36:17,519][0m Trial 30 finished with value: 0.07358134808286242 and parameters: {'no_components': 9, 'rho': 0.9508213992517383, 'epsilon': 2.804611716247611e-06, 'learning_rate': 0.000983126301070189}. Best is trial 8 with value: 0.08119064029860958.[0m
[32m[I 2022-12-05 16:37:22,483][0m Trial 31 finished with value: 0.0783419737168101 and parameters: {'no_components': 4, 'rho': 0.9442936137731517, 'epsilon': 4.421107399747897e-06, 'learning_rate': 0.015061717196197241}. Best is trial 8 with value: 0.08119064029860958.[0m
[32m[I 2022-12-05 16:38:35,460][0m Trial 32 finished with value: 0.08041744831900051 and parameters: {'no_components': 6, 'rho': 0.9404013598512259, 'epsilon': 3.793620741931967e-06, 'learning_rate': 0.0167235703659432}. Best is trial 8 with value: 0.08119064029860958.[0m
[32m[I 2022-12-05 16:39:46,628][0m Trial 33 finished with value: 0.0799688482614787 and parameters: {'no_components': 6, 'rho': 0.9395569243110714, 'epsilon': 3.691206

In [25]:
# Winning hyperparameters of the featured study (the most recent ``study``)
best_params_with_features = study.best_params
print(best_params_with_features)

{'no_components': 5, 'rho': 0.9479133091288102, 'epsilon': 3.434929518317852e-06, 'learning_rate': 0.0007143586518906775}


## Best models training

In [26]:
# Rebuild the interactions-only model from its tuned hyperparameters and fit
# it on the full interactions.
_tuned_keys_wo = ('no_components', 'rho', 'epsilon', 'learning_rate')
base_lfm_without_features = LightFM(
    learning_schedule = 'adadelta',
    loss = 'warp',
    random_state = RANDOM_STATE,
    **{key: best_params_without_features[key] for key in _tuned_keys_wo},
)
retrained_lfm_without_features = LightFMWrapperModel(model = base_lfm_without_features)

retrained_lfm_without_features.fit(full_dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x275e2937970>

In [27]:
# Rebuild the featured model from its tuned hyperparameters and fit it on the
# full featured dataset.
_tuned_keys_with = ('no_components', 'rho', 'epsilon', 'learning_rate')
base_lfm_with_features = LightFM(
    learning_schedule = 'adadelta',
    loss = 'warp',
    random_state = RANDOM_STATE,
    **{key: best_params_with_features[key] for key in _tuned_keys_with},
)
retrained_lfm_with_features = LightFMWrapperModel(model = base_lfm_with_features)

retrained_lfm_with_features.fit(full_featured_dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x275e2937220>

In [28]:
# Directory where the fitted LightFM wrappers are serialized, relative to the
# notebook's working directory.
MODELS_PATH = Path("../../data/06_models/lightfm")

In [29]:
# The dump calls need an existing target directory; create it (and any missing
# parents) so a fresh checkout does not fail here.
MODELS_PATH.mkdir(parents=True, exist_ok=True)

# Serialize both fitted wrappers
joblib.dump(retrained_lfm_without_features, MODELS_PATH / 'lfm_without_features.joblib')
joblib.dump(retrained_lfm_with_features, MODELS_PATH / 'lfm_with_features.joblib')

['..\\..\\data\\06_models\\lightfm\\lfm_with_features.joblib']

## ANN via nmslib based on LightFM with features

In [30]:
# Reload the featured model from the file written by the dump cell; this
# rebinds the in-memory ``retrained_lfm_with_features``.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# produced by this pipeline.
retrained_lfm_with_features = joblib.load(MODELS_PATH / 'lfm_with_features.joblib')

In [31]:
# User and item vectors of the featured model for full_featured_dataset; they
# are handed to the ANN recommender below.
user_embeddings, item_embeddings = retrained_lfm_with_features.get_vectors(full_featured_dataset)

In [46]:
# nmslib index settings: HNSW method, 'negdotprod' space, dense vectors
ann_index_params = {
    'method' : 'hnsw',
    'space' : 'negdotprod',
    'data_type' : nmslib.DataType.DENSE_VECTOR
}

# ANN recommender over the LightFM vectors, using the id maps of the dataset
# the vectors were taken for
ann = UserToItemAnnRecommender(
    user_vectors=user_embeddings,
    item_vectors=item_embeddings,
    user_id_map=full_featured_dataset.user_id_map,
    item_id_map=full_featured_dataset.item_id_map,
    index_init_params=ann_index_params,
)

In [47]:
# Fit the ANN recommender built above; it is queried in the offline
# recommendations section below.
ann.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x276c33195b0>

## Preparing offline recommendations dataframe

In [34]:
# Recommend K_RECOS items to every user present in the full interactions,
# with already-viewed items filtered out (filter_viewed=True).
users_for_wo_features = interactions_full[Columns.User].unique()
lfm_wo_features_reco = retrained_lfm_without_features.recommend(
    users=users_for_wo_features,
    dataset=full_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [55]:
# Keep only item_id as a column, indexed by user_id
recos_without_features_df = lfm_wo_features_reco.set_index('user_id')[['item_id']]

In [61]:
# Writes the user_id index and the item_id column to CSV.
# NOTE(review): the file lands in DATA_PATH, next to the prepared inputs —
# confirm that is the intended location for outputs.
recos_without_features_df.to_csv(DATA_PATH / 'lfm_wo_features_recos.csv')

In [37]:
# Recommend K_RECOS items to every user present in the full interactions,
# with already-viewed items filtered out (filter_viewed=True).
users_for_with_features = interactions_full[Columns.User].unique()
lfm_with_features_reco = retrained_lfm_with_features.recommend(
    users=users_for_with_features,
    dataset=full_featured_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [57]:
# Keep only item_id as a column, indexed by user_id
recos_with_features_df = lfm_with_features_reco.set_index('user_id')[['item_id']]

In [62]:
# Writes the user_id index and the item_id column to CSV.
# NOTE(review): the file lands in DATA_PATH, next to the prepared inputs —
# confirm that is the intended location for outputs.
recos_with_features_df.to_csv(DATA_PATH / 'lfm_with_features_recos.csv')

In [52]:
# Query the ANN recommender for K_RECOS items per user, for the same user list
# as the LightFM recommendations above.
# NOTE(review): unlike the recommend() calls above (filter_viewed=True),
# nothing here asks for already-viewed items to be excluded — confirm whether
# that difference is intended.
ann_reco = ann.get_item_list_for_user_batch(
    user_ids=interactions_full[Columns.User].unique(),
    top_n=K_RECOS
)

In [59]:
# Pair every user with its list of ANN items, then unfold the lists so each
# item gets its own row, indexed by user_id.
ann_user_ids = interactions_full[Columns.User].unique()
ann_recos_nested = pd.DataFrame({
    Columns.User: ann_user_ids,
    'item_id': ann_reco
})
ann_recos_df = ann_recos_nested.explode('item_id').set_index('user_id')

In [63]:
# Writes the user_id index and the item_id column to CSV.
# NOTE(review): the file lands in DATA_PATH, next to the prepared inputs —
# confirm that is the intended location for outputs.
ann_recos_df.to_csv(DATA_PATH / 'ann_lightfm_recos.csv')