# Эксперименты с series доменом

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

In [77]:
from pathlib import Path
from tqdm import tqdm
import pickle
from time import time
from tqdm import tqdm
import pandas as pd
import numpy as np

import mlflow
import optuna

from src.models.BaseModel import TopRecommender
from src.models.ItemBasedRecommenders import CosineDistanceRecommender
from src.models.MatrixFactorizationRecommenders import ALSBasedRecommender, LightFMBasedRecommender

from src.metrics import average_single_precision

In [3]:
# Configure MLflow: point tracking at the project's local `mlruns` directory and
# select the experiment that every run in this notebook is logged under
# (MLflow creates the experiment if it does not exist yet).
mlflow.set_tracking_uri((Path.cwd() / '../src/models/mlflow_tracking/mlruns'))
mlflow.set_experiment('Series Domain')

2023/04/15 19:10:23 INFO mlflow.tracking.fluent: Experiment with name 'Series Domain' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/trybi/PycharmProjects/MFDP-RecSys/src/models/mlflow_tracking/mlruns/2', creation_time=1681575023415, experiment_id='2', last_update_time=1681575023415, lifecycle_stage='active', name='Series Domain', tags={}>

In [4]:
# Directory holding the input parquet files (relative to the notebook's working directory).
data_path = Path('../data')

In [6]:
# Load the user-item interaction log and the series metadata table.
df_interactions = pd.read_parquet(data_path / 'series_interactions.parquet')
df_series = pd.read_parquet(data_path / 'series_bd.parquet')

In [8]:
# Replace the stored index with a fresh 0..n-1 RangeIndex so that row labels and
# row positions coincide. `drop=True` discards the old index directly instead of
# materialising it as an 'index' column and dropping it afterwards (which also
# fails when the old index carries a name).
df_interactions = df_interactions.reset_index(drop=True)

In [9]:
# Preview the interactions table (columns: user_id, item_id, rating).
df_interactions

Unnamed: 0,user_id,item_id,rating
0,864613,s7638,1
1,15607,s3043,1
2,81786,s2616,1
3,474544,s10440,1
4,810825,s4223,1
...,...,...,...
447401,13243,s2657,-1
447402,565181,s15297,-1
447403,616080,s15297,-1
447404,337043,s9689,1


In [10]:
# Hold out 1000 interactions as the test set; MAP@10 is computed on them.
# The test set is kept small because evaluation is expected to be slow.
# A fixed seed makes the split, and therefore every logged metric, reproducible.
split_rng = np.random.default_rng(42)
test_samples = split_rng.choice(len(df_interactions), size=1000, replace=False)

# `test_samples` holds row positions, so translate them to index labels before
# dropping; train and test stay disjoint even if the index is not 0..n-1.
df_train = df_interactions.drop(index=df_interactions.index[test_samples])
df_test = df_interactions.iloc[test_samples]

In [11]:
# Build a {user_id: [item_id, ...]} lookup of each user's training history;
# a dict lookup is faster than filtering the frame for every test user.
# Aggregating straight into lists avoids a join/split round-trip through a
# comma-separated string, which breaks on ids containing ', ' or non-string ids.
train_db = df_train.groupby('user_id')['item_id'].agg(list).to_dict()

In [12]:
# Largest number of series in any single user's training history (spoiler: not many).
max(map(len, train_db.values()))

70

In [13]:
# Array of (user_id, item_id) pairs from the test set, used to iterate over test cases.
test_data = df_test[['user_id', 'item_id']].to_numpy()

In [31]:
def experiment(run_name, model, model_params, recommend_params, my_favorites, k=10):
    """Fit a recommender, score it on the held-out pairs and log the run to MLflow.

    Relies on notebook-level globals: ``df_train``, ``test_data``, ``train_db``,
    ``df_series`` and ``model_path``.

    Parameters
    ----------
    run_name : str
        MLflow run name; also used as the prefix of the pickled model file.
    model : object
        Recommender exposing ``fit(interaction_data=..., **params)`` and
        ``recommend(favorites, k, **params)``.
    model_params : dict
        Keyword arguments forwarded to ``model.fit``; logged as run parameters.
    recommend_params : dict
        Keyword arguments forwarded to every ``model.recommend`` call.
    my_favorites : list
        Item ids used to produce a human-readable sample of recommendations.
    k : int, default 10
        Number of recommendations requested per ``recommend`` call.

    Returns
    -------
    None
        All results are logged to the active MLflow run: ``fit_time``,
        ``MAP_on_{k}``, ``time_to_rec``, ``my_recs.txt`` and the pickled model.
    """
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(model_params)

        fit_started = time()
        model.fit(interaction_data=df_train.rename({'user_id': 'user', 'item_id': 'item'}, axis=1),
                  **model_params)
        mlflow.log_metric('fit_time', time() - fit_started)

        maps = []
        times = []
        for new_user, new_item in tqdm(test_data):
            # Users absent from the training history get an empty favourites list.
            user_favorites = train_db.get(new_user, [])
            rec_started = time()
            recs = model.recommend(user_favorites, k, **recommend_params)
            times.append(time() - rec_started)
            maps.append(average_single_precision(new_item, recs))

        # Guard against an empty test set instead of failing with ZeroDivisionError.
        if maps:
            mlflow.log_metric(f'MAP_on_{k}', sum(maps) / len(maps))
            mlflow.log_metric('time_to_rec', sum(times) / len(times))

        my_recs = model.recommend(my_favorites, k, **recommend_params)

        # Describe every recommended item as "title - genres - countries".
        # f-string formatting tolerates missing (NaN) metadata, which plain
        # string concatenation would reject with a TypeError; a boolean mask
        # replaces `query('@k')`, whose loop variable shadowed the `k` argument.
        rec_lines = []
        for rec_item in my_recs:
            rows = df_series.loc[df_series['item_id'] == rec_item, ['title', 'genres', 'countries']]
            for title, genres, countries in rows.values:
                rec_lines.append(f'{title} - {genres} - {countries}')
        mlflow.log_text(' \n '.join(rec_lines), 'my_recs.txt')

        # Make sure the storage directory exists before pickling the fitted model.
        model_path.mkdir(parents=True, exist_ok=True)
        model_file = model_path / f'{run_name}_model_series.pickle'
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)

        mlflow.log_artifact(str(model_file))

In [35]:
# Directory where `experiment` pickles the fitted models.
model_path = Path('../src/models/models_storage')

In [36]:
# Hand-picked item ids; `experiment` logs the recommendations produced for them as a text artifact.
my_favorites = ['s9157', 's15314', 's1204']

In [37]:
# Show the metadata rows of the hand-picked favourites.
df_series.loc[df_series['item_id'].isin(my_favorites)]

Unnamed: 0,item_id,title,release_year,genres,countries,age_rating,directors
4890,s1204,Почему женщины убивают,2019.0,"драмы, мелодрамы, триллеры, комедии",США,18.0,"Дэвид Гроссман, Дэвид Уоррен"
9720,s9157,Твин Пикс,1990.0,"драмы, мистика, триллеры, детективы",США,18.0,"Дэвид Линч, Калеб Дешанель, Лесли Линка Глаттер"
11848,s15314,Острые козырьки,2013.0,"драмы, криминал",Великобритания,16.0,"Колм МакКарти, Тим Милантс, Дэвид Кэффри"


In [39]:
# TopRecommender with the 'count' metric in deterministic mode.
experiment(
    run_name='Top count recommends',
    model=TopRecommender(),
    model_params={'n_to_save': 100, 'metric': 'count'},
    recommend_params={'mode': 'deterministic'},
    my_favorites=my_favorites,
    k=10,
)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 5649.61it/s]


In [40]:
# TopRecommender with the 'count' metric in probabilistic mode over the top 1000 items.
experiment(
    run_name='Random recommends',
    model=TopRecommender(),
    model_params={'n_to_save': 1000, 'metric': 'count'},
    recommend_params={'mode': 'probabilistic'},
    my_favorites=my_favorites,
    k=10,
)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1282.05it/s]


In [41]:
# Item-based cosine-distance recommender in deterministic mode.
experiment(
    run_name='CosineDistanceRecommender',
    model=CosineDistanceRecommender(),
    model_params={},
    recommend_params={'mode': 'deterministic'},
    my_favorites=my_favorites,
    k=10,
)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 732.48it/s]


In [59]:
# Item-based cosine-distance recommender in probabilistic mode.
experiment(
    run_name='CosineDistanceRecommender + random',
    model=CosineDistanceRecommender(),
    model_params={},
    recommend_params={'mode': 'probabilistic'},
    my_favorites=my_favorites,
    k=10,
)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 580.60it/s]


In [72]:
# ALS recommender with its default fit parameters on the original ratings.
experiment(
    run_name='ALS Recommender',
    model=ALSBasedRecommender(),
    model_params={},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/15 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 745.16it/s]


In [73]:
# Shift every training rating up by 2 (e.g. -1 -> 1, 1 -> 3).
# The shifted values are derived from the untouched `df_interactions` ratings,
# so re-running this cell yields the same result instead of shifting again.
df_train['rating'] = df_interactions.loc[df_train.index, 'rating'] + 2

In [74]:
# ALS recommender with its default fit parameters on the shifted ratings.
experiment(
    run_name='ALS Recommender + changed DS',
    model=ALSBasedRecommender(),
    model_params={},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/15 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 734.75it/s]


In [75]:
# ALS on the shifted ratings with a small factor count.
# The run name encodes the hyperparameters: `experiment` derives the pickle file
# name from it, so reusing one name for several runs overwrites earlier models.
experiment(
    run_name='ALS Recommender + changed DS (factors=10, reg=0.5, iter=30)',
    model=ALSBasedRecommender(),
    model_params={'factors': 10, 'regularization': 0.5, 'iterations': 30},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1026.70it/s]


In [76]:
# ALS on the shifted ratings with a larger factor count.
# The run name encodes the hyperparameters: `experiment` derives the pickle file
# name from it, so reusing one name for several runs overwrites earlier models.
experiment(
    run_name='ALS Recommender + changed DS (factors=50, reg=1, iter=30)',
    model=ALSBasedRecommender(),
    model_params={'factors': 50, 'regularization': 1, 'iterations': 30},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 840.33it/s]


## Попробуем подобрать гиперпараметры для ALS

In [78]:
# Hold out 1000 training interactions as a validation set for hyperparameter search.
# A fixed seed keeps the validation split, and hence the search, reproducible.
val_rng = np.random.default_rng(43)
val_samples = val_rng.choice(len(df_train), size=1000, replace=False)

df_train_val = df_train.iloc[val_samples]
# Everything not selected for validation stays in the fitting set.
df_train_train = df_train[~df_train.index.isin(df_train_val.index)]

In [85]:
# {user_id: [item_id, ...]} history lookup built from the fitting part of the
# train/validation split. Aggregating straight into lists avoids a join/split
# round-trip through a comma-separated string, which breaks on ids containing
# ', ' or non-string ids.
train_db_new = df_train_train.groupby('user_id')['item_id'].agg(list).to_dict()

In [89]:
def objective(trial):
    """Optuna objective for tuning the ALS hyperparameters.

    Samples ``factors`` (int, 5..100) and ``regularization`` (float, 0..10) from
    ``trial``, refits the notebook-level ``model`` on ``df_train_train`` with 30
    iterations, and returns the mean ``average_single_precision`` of the top-10
    recommendations over the (user, item) pairs in ``df_train_val``.
    """
    hyperparams = {
        'factors': trial.suggest_int("factors", 5, 100),
        'regularization': trial.suggest_float("regularization", 0, 10),
    }

    fit_frame = df_train_train.rename({'user_id': 'user', 'item_id': 'item'}, axis=1)
    model.fit(interaction_data=fit_frame, **hyperparams, iterations=30)

    val_pairs = df_train_val[['user_id', 'item_id']].values
    scores = [
        average_single_precision(
            held_out_item,
            # Users without history in the fitting set get an empty favourites list.
            model.recommend(train_db_new.get(user, []), 10),
        )
        for user, held_out_item in val_pairs
    ]

    return sum(scores) / len(scores)

In [90]:
# Recommender instance that `objective` refits on every Optuna trial.
model = ALSBasedRecommender()

In [92]:
# Maximise the validation score over 50 trials. The sampler is seeded so that
# the sequence of suggested hyperparameters is reproducible between runs.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[32m[I 2023-04-18 01:15:55,827][0m A new study created in memory with name: no-name-79b976d2-d68c-4c37-864d-6406a640a1e2[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:16:13,914][0m Trial 0 finished with value: 0.016224960317460315 and parameters: {'factors': 38, 'regularization': 9.919748184947982}. Best is trial 0 with value: 0.016224960317460315.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:17:12,526][0m Trial 1 finished with value: 0.019889325396825423 and parameters: {'factors': 97, 'regularization': 7.748745346339712}. Best is trial 1 with value: 0.019889325396825423.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:17:39,801][0m Trial 2 finished with value: 0.02059658730158734 and parameters: {'factors': 84, 'regularization': 9.375236971368652}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:18:19,070][0m Trial 3 finished with value: 0.016541269841269832 and parameters: {'factors': 99, 'regularization': 5.093130616311459}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:18:33,406][0m Trial 4 finished with value: 0.008406468253968243 and parameters: {'factors': 22, 'regularization': 9.400437314550514}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:18:55,972][0m Trial 5 finished with value: 0.012120793650793634 and parameters: {'factors': 44, 'regularization': 6.244827333876559}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:19:05,200][0m Trial 6 finished with value: 0.014920793650793648 and parameters: {'factors': 5, 'regularization': 0.9888031068997993}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:19:33,317][0m Trial 7 finished with value: 0.0025986507936507936 and parameters: {'factors': 82, 'regularization': 0.4978743151945453}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:19:50,139][0m Trial 8 finished with value: 0.01440857142857141 and parameters: {'factors': 52, 'regularization': 6.249908880335084}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:20:26,758][0m Trial 9 finished with value: 0.013671944444444431 and parameters: {'factors': 84, 'regularization': 4.589232397201232}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:20:55,611][0m Trial 10 finished with value: 0.019101825396825423 and parameters: {'factors': 69, 'regularization': 8.375707337460392}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:21:35,834][0m Trial 11 finished with value: 0.019889682539682572 and parameters: {'factors': 100, 'regularization': 7.781487265939125}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:22:09,382][0m Trial 12 finished with value: 0.019805198412698444 and parameters: {'factors': 73, 'regularization': 8.265286757457549}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:22:36,049][0m Trial 13 finished with value: 0.020472936507936558 and parameters: {'factors': 69, 'regularization': 9.919675769508727}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:23:00,949][0m Trial 14 finished with value: 0.020242261904761933 and parameters: {'factors': 64, 'regularization': 9.463279650947177}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:23:27,206][0m Trial 15 finished with value: 0.020556547619047662 and parameters: {'factors': 62, 'regularization': 9.91345569968235}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:23:46,762][0m Trial 16 finished with value: 0.018722857142857163 and parameters: {'factors': 54, 'regularization': 8.408237373631573}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:24:23,919][0m Trial 17 finished with value: 0.01864750000000001 and parameters: {'factors': 83, 'regularization': 7.08135545428782}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:24:50,479][0m Trial 18 finished with value: 0.019200555555555585 and parameters: {'factors': 57, 'regularization': 8.99277728967387}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:25:10,384][0m Trial 19 finished with value: 0.014561865079365058 and parameters: {'factors': 33, 'regularization': 8.90568819516401}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:25:41,024][0m Trial 20 finished with value: 0.019110714285714297 and parameters: {'factors': 89, 'regularization': 7.150945238140539}. Best is trial 2 with value: 0.02059658730158734.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:26:10,937][0m Trial 21 finished with value: 0.020669365079365124 and parameters: {'factors': 73, 'regularization': 9.947220643151725}. Best is trial 21 with value: 0.020669365079365124.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:26:38,747][0m Trial 22 finished with value: 0.020789484126984177 and parameters: {'factors': 79, 'regularization': 9.860345016597467}. Best is trial 22 with value: 0.020789484126984177.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:27:07,309][0m Trial 23 finished with value: 0.020141944444444487 and parameters: {'factors': 78, 'regularization': 9.076876715902522}. Best is trial 22 with value: 0.020789484126984177.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:27:40,387][0m Trial 24 finished with value: 0.02053706349206354 and parameters: {'factors': 77, 'regularization': 9.872573914169088}. Best is trial 22 with value: 0.020789484126984177.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:28:19,134][0m Trial 25 finished with value: 0.020182023809523844 and parameters: {'factors': 90, 'regularization': 8.836267424384642}. Best is trial 22 with value: 0.020789484126984177.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:28:51,854][0m Trial 26 finished with value: 0.02042107142857147 and parameters: {'factors': 89, 'regularization': 9.192531369477408}. Best is trial 22 with value: 0.020789484126984177.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:29:16,175][0m Trial 27 finished with value: 0.020068412698412738 and parameters: {'factors': 70, 'regularization': 8.482361593388962}. Best is trial 22 with value: 0.020789484126984177.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:29:45,832][0m Trial 28 finished with value: 0.020949206349206394 and parameters: {'factors': 92, 'regularization': 9.970274324848962}. Best is trial 28 with value: 0.020949206349206394.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:30:17,864][0m Trial 29 finished with value: 0.020507658730158772 and parameters: {'factors': 90, 'regularization': 9.481318594009554}. Best is trial 28 with value: 0.020949206349206394.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:30:36,612][0m Trial 30 finished with value: 0.018709682539682554 and parameters: {'factors': 46, 'regularization': 9.886941894650134}. Best is trial 28 with value: 0.020949206349206394.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:31:06,566][0m Trial 31 finished with value: 0.02030345238095243 and parameters: {'factors': 77, 'regularization': 9.418236765793036}. Best is trial 28 with value: 0.020949206349206394.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:31:37,163][0m Trial 32 finished with value: 0.02102349206349212 and parameters: {'factors': 92, 'regularization': 9.905673538017696}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:32:16,851][0m Trial 33 finished with value: 0.020586587301587335 and parameters: {'factors': 94, 'regularization': 9.952995230888309}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:32:54,896][0m Trial 34 finished with value: 0.019921269841269875 and parameters: {'factors': 95, 'regularization': 7.861390364955381}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:33:26,718][0m Trial 35 finished with value: 0.020388730158730196 and parameters: {'factors': 84, 'regularization': 8.783828359186922}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:34:06,516][0m Trial 36 finished with value: 0.020748928571428606 and parameters: {'factors': 95, 'regularization': 9.280005964592144}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:34:46,638][0m Trial 37 finished with value: 0.020794365079365128 and parameters: {'factors': 95, 'regularization': 9.231536867633928}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:35:24,526][0m Trial 38 finished with value: 0.020250436507936544 and parameters: {'factors': 100, 'regularization': 8.206198530054463}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:36:02,399][0m Trial 39 finished with value: 0.020306626984127014 and parameters: {'factors': 93, 'regularization': 8.729393286984587}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:36:37,093][0m Trial 40 finished with value: 0.020265515873015914 and parameters: {'factors': 86, 'regularization': 9.41087213000666}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:37:10,192][0m Trial 41 finished with value: 0.020590555555555598 and parameters: {'factors': 94, 'regularization': 9.0082211499245}. Best is trial 32 with value: 0.02102349206349212.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:37:55,751][0m Trial 42 finished with value: 0.02103257936507941 and parameters: {'factors': 97, 'regularization': 9.418154957993593}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:38:40,912][0m Trial 43 finished with value: 0.021010833333333385 and parameters: {'factors': 98, 'regularization': 9.51356169903456}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:39:21,370][0m Trial 44 finished with value: 0.019998809523809558 and parameters: {'factors': 98, 'regularization': 7.661570164095995}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:39:37,949][0m Trial 45 finished with value: 0.008194166666666648 and parameters: {'factors': 20, 'regularization': 8.70189034248093}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:40:10,662][0m Trial 46 finished with value: 0.020785238095238124 and parameters: {'factors': 100, 'regularization': 9.424653461711204}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:40:48,340][0m Trial 47 finished with value: 0.020122777777777812 and parameters: {'factors': 89, 'regularization': 8.09797135992447}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:41:25,304][0m Trial 48 finished with value: 0.019988452380952416 and parameters: {'factors': 86, 'regularization': 8.560537711175737}. Best is trial 42 with value: 0.02103257936507941.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 01:41:54,654][0m Trial 49 finished with value: 0.02069789682539687 and parameters: {'factors': 81, 'regularization': 9.538592831061518}. Best is trial 42 with value: 0.02103257936507941.[0m


In [95]:
# Hyperparameters of the best trial found by the study.
study.best_params

{'factors': 97, 'regularization': 9.418154957993593}

In [96]:
# Final run with the tuned hyperparameters. They are read from the study rather
# than copy-pasted, so this cell cannot drift out of sync with the search;
# `iterations` matches the value used inside `objective`.
experiment(
    run_name='ALS Recommender + changed DS + optuna',
    model=ALSBasedRecommender(),
    model_params={**study.best_params, 'iterations': 30},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 569.15it/s]
