In [2]:
import typing as tp

import dill
import implicit
import numpy as np
import threadpoolctl

threadpoolctl.threadpool_limits(1, "blas")
import optuna
import pandas as pd
from implicit.als import AlternatingLeastSquares
from optuna.samplers import TPESampler
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP
from rectools.models import (
    ImplicitALSWrapperModel,
)

optuna.logging.set_verbosity(10)
implicit.gpu.HAS_CUDA



False

### Read data

In [3]:
users = pd.read_csv("../datasets/users.csv")
items = pd.read_csv("../datasets/items.csv")
interactions = pd.read_csv("../datasets/interactions.csv")

In [4]:
Columns.Datetime = 'last_watch_dt'
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

### Split data

In [5]:
# train\split
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

# drop low duration users
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop cold users
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

### Select features

In [6]:
def get_user_features(users: pd.DataFrame, interactions: pd.DataFrame, features: tp.List[str]):
    users.fillna("Unknown", inplace=True)
    users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    user_features_frames = []
    for feature in features:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    user_features = pd.concat(user_features_frames)
    return user_features

In [7]:
def get_item_features(items: pd.DataFrame, interactions: pd.DataFrame):
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features = pd.concat((genre_feature, content_feature))
    return item_features

In [8]:
user_features = get_user_features(users, train, ["sex", "age", "income"])
item_features = get_item_features(items, train)

### Create dataset

In [9]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

### Define metric

In [10]:
metric = MAP(k=10)

In [11]:
K_RECOS = 10
RANDOM_STATE = 228

# ALS MODEL

In [11]:
def als_optuna_objective(trial):
    test_users = test[Columns.User].unique()
    factors = trial.suggest_categorical("n_factors", [8, 16, 32])
    fit_features_together = trial.suggest_categorical("fit_features_together", [True, False])
    regularization = trial.suggest_float('regularization', 0.001, 0.1, log=True)
    iterations = trial.suggest_int('iterations', 5, 20)

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factors,
            random_state=RANDOM_STATE,
            regularization=regularization,
            iterations=iterations,
            num_threads=24,
        ),
        fit_features_together=fit_features_together,
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    mtrc = metric.calc_per_user(recos, test)
    return mtrc.mean()

In [12]:
# перебор параметров с помощью optuna
sampler = TPESampler(seed=1)
study = optuna.create_study(directions=['maximize'], sampler=sampler)
study.optimize(als_optuna_objective, n_trials=20, n_jobs=24, gc_after_trial=True)

[I 2023-12-06 02:19:31,116] A new study created in memory with name: no-name-75e09673-f9bf-44e1-b2ce-936e5da30725


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 02:20:59,631] Trial 7 finished with value: 0.06578866193207403 and parameters: {'n_factors': 8, 'fit_features_together': False, 'regularization': 0.008518897149630939, 'iterations': 8}. Best is trial 7 with value: 0.06578866193207403.
[I 2023-12-06 02:20:59,774] Trial 1 finished with value: 0.06230259383072486 and parameters: {'n_factors': 16, 'fit_features_together': False, 'regularization': 0.0012806258247871605, 'iterations': 15}. Best is trial 7 with value: 0.06578866193207403.
[I 2023-12-06 02:20:59,844] Trial 2 finished with value: 0.06234207587193346 and parameters: {'n_factors': 16, 'fit_features_together': False, 'regularization': 0.006333352748031255, 'iterations': 14}. Best is trial 7 with value: 0.06578866193207403.
[I 2023-12-06 02:20:59,852] Trial 0 finished with value: 0.077302997753657 and parameters: {'n_factors': 8, 'fit_features_together': True, 'regularization': 0.006051131188869904, 'iterations': 10}. Best is trial 0 with value: 0.077302997753657.
[I 

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-06 02:22:27,075] Trial 10 finished with value: 0.0629056005353398 and parameters: {'n_factors': 16, 'fit_features_together': False, 'regularization': 0.004823603535780675, 'iterations': 5}. Best is trial 0 with value: 0.077302997753657.
[I 2023-12-06 02:22:27,162] Trial 13 finished with value: 0.07535903191558452 and parameters: {'n_factors': 8, 'fit_features_together': True, 'regularization': 0.007285495620246419, 'iterations': 14}. Best is trial 0 with value: 0.077302997753657.
[I 2023-12-06 02:22:27,248] Trial 12 finished with value: 0.07428950354498708 and parameters: {'n_factors': 32, 'fit_features_together': True, 'regularization': 0.033692056968095026, 'iterations': 20}. Best is trial 0 with value: 0.077302997753657.
[I 2023-12-06 02:22:27,256] Trial 9 finished with value: 0.07440705228771592 and parameters: {'n_factors': 16, 'fit_features_together': True, 'regularization': 0.08429056739753245, 'iterations': 6}. Best is trial 0 with value: 0.077302997753657.
[I 2023-1

## BEST PARAMS

In [13]:
print(f'Лучшее значение MAP@10: {study.best_value}')
print(f'Лучшие параметры: {study.best_params}')

Лучшее значение MAP@10: 0.07741930704805924
Лучшие параметры: {'n_factors': 8, 'fit_features_together': True, 'regularization': 0.02140145192621051, 'iterations': 10}


## Learn Best Model

In [None]:
user_features = get_user_features(users, interactions, ["sex", "age", "income"])
item_features = get_item_features(items, interactions)

dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=16,
        random_state=RANDOM_STATE,
        regularization=0.00113,
        iterations=11,
        num_threads=24,
    ),
    fit_features_together=True
)

model.fit(dataset)

In [None]:
with open(f'asl.dill', 'wb') as f:
    dill.dump(model, f)

In [None]:
with open('dataset.dill', 'wb') as f:
    dill.dump(dataset, f)