In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"  
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [39]:
from implicit.als import AlternatingLeastSquares
from rectools.metrics import MAP
from rectools.models import ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.tools import UserToItemAnnRecommender
from pathlib import Path
import pandas as pd
import optuna
from tqdm.auto import tqdm
import joblib
import nmslib

## Loading data

In [4]:
DATA_PATH = Path("../../data/03_primary")

In [5]:
users_features_train = pd.read_csv(DATA_PATH / 'prepared_featured_users_train.csv')
items_features_train = pd.read_csv(DATA_PATH / 'prepared_featured_items_train.csv')

users_features_full = pd.read_csv(DATA_PATH / 'prepared_featured_users_full.csv')
items_features_full = pd.read_csv(DATA_PATH / 'prepared_featured_items_full.csv')

interactions_train = pd.read_csv(DATA_PATH / 'prepared_interactions_train.csv')
interactions_test = pd.read_csv(DATA_PATH / 'prepared_interactions_test.csv')

interactions_full = pd.read_csv(DATA_PATH / 'prepared_interactions_full.csv')


In [6]:
train_dataset = Dataset.construct(
    interactions_df=interactions_train,
)

train_featured_dataset = Dataset.construct(
    interactions_df=interactions_train,
    user_features_df=users_features_train,
    cat_user_features=["sex", "age", "income"],
    item_features_df=items_features_train,
    cat_item_features=["genre", "content_type","age_rating","country","release_year_bin"],
)

In [7]:
full_dataset = Dataset.construct(
    interactions_df=interactions_full,
)

full_featured_dataset = Dataset.construct(
    interactions_df=interactions_full,
    user_features_df=users_features_full,
    cat_user_features=["sex", "age", "income"],
    item_features_df=items_features_full,
    cat_item_features=["genre", "content_type","age_rating","country","release_year_bin"],
)

## Hyperparameters choice via Optuna

In [8]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 4
TOTAL_ITERATIONS = 30

In [9]:
class TqdmCallback:
    '''Callback to check progress of optuna hyperparameters selection'''
    def __init__(self, total):
        self.pbar = tqdm(total = total)

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        self.pbar.update(1)

In [18]:
# Metric
metric_map = MAP(k = K_RECOS)

### Without features

In [19]:
def objective(trial):
    '''Objective for optuna'''  
    # Set parameters borders
    num_factors = trial.suggest_int('factors', 2, 32, log = True)
    reg_coefs = trial.suggest_float('regularization', 3e-4, 0.2, log = True)
    iterations = trial.suggest_int('iterations', 10, 50, log = True)
    
    als = ImplicitALSWrapperModel(
        model = AlternatingLeastSquares(
            factors = num_factors,
            regularization = reg_coefs,
            iterations = iterations,
            use_cg = True,
            use_gpu = False,
            num_threads = NUM_THREADS,
            random_state = RANDOM_STATE
        )
    )
    
    als.fit(train_dataset)
    
    # Make recommendations for users in test
    recos = als.recommend(
        users=interactions_test[Columns.User].unique(),
        dataset=train_dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    # Calculate metric
    map_10 = metric_map.calc(recos, interactions_test)
    
    return map_10


In [21]:
tqdm_callback = TqdmCallback(TOTAL_ITERATIONS)

study = optuna.create_study(directions = ['maximize'])
study.optimize(objective, n_trials=TOTAL_ITERATIONS, callbacks=[tqdm_callback])

  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2022-12-05 15:05:59,273][0m A new study created in memory with name: no-name-1c531109-aa02-4734-9d3d-dbbbcb6d2c65[0m
[32m[I 2022-12-05 15:07:37,980][0m Trial 0 finished with value: 0.027883843775673777 and parameters: {'factors': 27, 'regularization': 0.09298906550223413, 'iterations': 17}. Best is trial 0 with value: 0.027883843775673777.[0m
[32m[I 2022-12-05 15:09:11,535][0m Trial 1 finished with value: 0.04887594816091626 and parameters: {'factors': 9, 'regularization': 0.002904285493264573, 'iterations': 24}. Best is trial 1 with value: 0.04887594816091626.[0m
[32m[I 2022-12-05 15:10:27,539][0m Trial 2 finished with value: 0.02812217753784152 and parameters: {'factors': 29, 'regularization': 0.001192299336955479, 'iterations': 10}. Best is trial 1 with value: 0.04887594816091626.[0m
[32m[I 2022-12-05 15:12:37,740][0m Trial 3 finished with value: 0.029871562947528893 and parameters: {'factors': 20, 'regularization': 0.1894957010759684, 'iterations': 31}. Best is

In [22]:
best_params_without_features = study.best_trial.params
print(best_params_without_features)

{'factors': 2, 'regularization': 0.03716111608869566, 'iterations': 42}


### With features

In [28]:
def objective(trial):
    '''Objective for optuna'''  
    # Set parameters borders
    num_factors = trial.suggest_int('factors', 2, 32, log = True)
    reg_coefs = trial.suggest_float('regularization', 3e-4, 0.2, log = True)
    iterations = trial.suggest_int('iterations', 5, 20, log = True)
    
    als = ImplicitALSWrapperModel(
        model = AlternatingLeastSquares(
            factors = num_factors,
            regularization = reg_coefs,
            iterations = iterations,
            use_cg = True,
            use_gpu = False,
            num_threads = NUM_THREADS,
            random_state = RANDOM_STATE
        ),
        fit_features_together = True,
    )
    
    als.fit(train_featured_dataset)
    
    # Make recommendations for users in test
    recos = als.recommend(
        users=interactions_test[Columns.User].unique(),
        dataset=train_featured_dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    # Calculate MAP@10
    map_10 = metric_map.calc(recos, interactions_test)
    
    return map_10


In [29]:
tqdm_callback = TqdmCallback(TOTAL_ITERATIONS)

study = optuna.create_study(directions = ['maximize'])
study.optimize(objective, n_trials=TOTAL_ITERATIONS, callbacks=[tqdm_callback])

  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2022-12-05 16:08:54,210][0m A new study created in memory with name: no-name-335b5eaf-9c93-4393-a8b8-9a7eebc649b7[0m
[32m[I 2022-12-05 16:12:54,449][0m Trial 0 finished with value: 0.07677138902818899 and parameters: {'factors': 14, 'regularization': 0.00030577682497789774, 'iterations': 12}. Best is trial 0 with value: 0.07677138902818899.[0m
[32m[I 2022-12-05 16:16:55,544][0m Trial 1 finished with value: 0.0749020130952791 and parameters: {'factors': 6, 'regularization': 0.03145948857158636, 'iterations': 18}. Best is trial 0 with value: 0.07677138902818899.[0m
[32m[I 2022-12-05 16:19:36,573][0m Trial 2 finished with value: 0.06978187905716207 and parameters: {'factors': 2, 'regularization': 0.003852034999515797, 'iterations': 7}. Best is trial 0 with value: 0.07677138902818899.[0m
[32m[I 2022-12-05 16:24:17,033][0m Trial 3 finished with value: 0.07899508906455642 and parameters: {'factors': 17, 'regularization': 0.007893433958018913, 'iterations': 16}. Best is t

In [30]:
best_params_with_features = study.best_trial.params
print(best_params_with_features)

{'factors': 3, 'regularization': 0.0018347283256986877, 'iterations': 13}


## Best models training

In [32]:
retrained_als_without_features = ImplicitALSWrapperModel(
    model = AlternatingLeastSquares(
        factors = best_params_without_features['factors'],
        regularization = best_params_without_features['regularization'],
        iterations = best_params_without_features['iterations'],
        use_cg = True,
        use_gpu = False,
        num_threads = NUM_THREADS,
        random_state = RANDOM_STATE
    )
)

retrained_als_without_features.fit(full_dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x1a7383d2b50>

In [33]:
retrained_als_with_features = ImplicitALSWrapperModel(
    model = AlternatingLeastSquares(
        factors = best_params_with_features['factors'],
        regularization = best_params_with_features['regularization'],
        iterations = best_params_with_features['iterations'],
        use_cg = True,
        use_gpu = False,
        num_threads = NUM_THREADS,
        random_state = RANDOM_STATE
    ),
    fit_features_together = True,
)

retrained_als_with_features.fit(full_featured_dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x1a7383d2220>

In [34]:
MODELS_PATH = Path("../../data/06_models/als")

In [35]:
joblib.dump(retrained_als_without_features, MODELS_PATH / 'als_without_features.joblib')
joblib.dump(retrained_als_with_features, MODELS_PATH / 'als_with_features.joblib')

['..\\..\\data\\06_models\\als\\als_with_features.joblib']

## ANN via nsmlib based on ALS with features

In [36]:
retrained_als_with_features = joblib.load(MODELS_PATH / 'als_with_features.joblib')

In [37]:
user_embeddings, item_embeddings = retrained_als_with_features.get_vectors()

In [40]:
ann = UserToItemAnnRecommender(
    user_vectors=user_embeddings,
    item_vectors=item_embeddings,
    user_id_map=full_featured_dataset.user_id_map,
    item_id_map=full_featured_dataset.item_id_map,
    index_init_params={
        'method' : 'hnsw',
        'space' : 'negdotprod',
        'data_type' : nmslib.DataType.DENSE_VECTOR
    }
)

In [41]:
ann.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x1a711506100>

## Preparing offline recommendations dataframes

In [43]:
als_wo_features_reco = retrained_als_without_features.recommend(
    users=interactions_full[Columns.User].unique(),
    dataset=full_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [60]:
recos_without_features_df = als_wo_features_reco[['user_id','item_id']].set_index('user_id')

In [70]:
recos_without_features_df.to_csv(DATA_PATH / 'als_wo_features_recos.csv')

In [46]:
als_with_features_reco = retrained_als_with_features.recommend(
    users=interactions_full[Columns.User].unique(),
    dataset=full_featured_dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [62]:
recos_with_features_df = als_wo_features_reco[['user_id','item_id']].set_index('user_id')

In [71]:
recos_with_features_df.to_csv(DATA_PATH / 'als_with_features_recos.csv')

In [49]:
ann_reco = ann.get_item_list_for_user_batch(
    user_ids=interactions_full[Columns.User].unique(),
    top_n=K_RECOS
)

In [68]:
ann_recos_df = pd.DataFrame({
    Columns.User: interactions_full[Columns.User].unique(),
    'item_id': ann_reco
}).explode('item_id').set_index('user_id')

In [72]:
ann_recos_df.to_csv(DATA_PATH / 'ann_als_recos.csv')