# Эксперименты с films доменом

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

In [2]:
from pathlib import Path
from tqdm import tqdm
import pickle
from time import time
from tqdm import tqdm
import pandas as pd
import numpy as np

import mlflow
import optuna

from src.models.BaseModel import TopRecommender
from src.models.ItemBasedRecommenders import CosineDistanceRecommender
from src.models.MatrixFactorizationRecommenders import ALSBasedRecommender, LightFMBasedRecommender

from src.metrics import average_single_precision



In [35]:
# Configure MLflow: point the tracking store at the project's mlruns folder and select the experiment
mlflow.set_tracking_uri(Path.cwd().joinpath('../src/models/mlflow_tracking/mlruns'))
mlflow.set_experiment('Films Domain')

<Experiment: artifact_location='file:///C:/Users/trybi/PycharmProjects/MFDP-RecSys/src/models/mlflow_tracking/mlruns/3', creation_time=1681633083831, experiment_id='3', last_update_time=1681633083831, lifecycle_stage='active', name='Films Domain', tags={}>

In [36]:
# Directory holding the parquet inputs (relative path, resolved against the working directory)
data_path = Path('../data')

In [37]:
# Load the user-film interactions and the film catalogue
df_interactions = pd.read_parquet(data_path.joinpath('films_interactions.parquet'))
df_films = pd.read_parquet(data_path.joinpath('films_bd.parquet'))

In [38]:
# Replace the index with a clean 0..n-1 range. `drop=True` discards the old index directly
# instead of materialising it as a column and dropping it again (which also only works
# when the old index is unnamed).
df_interactions = df_interactions.reset_index(drop=True)

In [39]:
# Preview the interactions table: one row per (user_id, item_id) pair with its rating
df_interactions

Unnamed: 0,user_id,item_id,rating
0,176549,f9506,-1
1,699317,f1659,1
2,1032142,f6686,1
3,1016458,f354,-1
4,884009,f693,-1
...,...,...,...
2397261,268216,f3071,1
2397262,438585,f7829,1
2397263,546862,f9673,-1
2397264,384202,f16197,1


In [40]:
# Hold out 1000 interactions as the test set; MAP@10 is measured on them.
# A fixed seed keeps the split identical across kernel restarts, so logged runs stay comparable.
test_samples = np.random.default_rng(42).choice(len(df_interactions), size=1000, replace=False)

# `test_samples` holds row positions: select by position first, then drop the matching
# index labels, so train and test stay disjoint even if the index is not exactly 0..n-1.
df_test = df_interactions.iloc[test_samples]
df_train = df_interactions.drop(df_test.index)

In [41]:
# Build a {user_id: [item_id, ...]} lookup: fetching a user's history from a dict is much
# faster than filtering the DataFrame for every test pair.
# Aggregating straight into lists avoids the join/split round trip through a ', '-separated
# string (slow via iterrows, and wrong for any id that itself contains ', ').
train_db = df_train.groupby('user_id')['item_id'].agg(list).to_dict()

In [42]:
# Largest number of films found in a single user's training history
max(map(len, train_db.values()))

350

In [43]:
# Array of (user_id, item_id) pairs from the test set, used to iterate over hold-out interactions
test_data = df_test.loc[:, ['user_id', 'item_id']].values

In [44]:
# Helper that runs one experiment end to end and records it in MLflow
def experiment(run_name, model, model_params, recommend_params, my_favorites, k=10):
    """Fit ``model``, score it on the hold-out pairs and log the run to MLflow.

    Reads the notebook globals ``df_train``, ``train_db``, ``test_data``, ``df_films``
    and ``model_path`` at call time (``model_path`` is defined in a later cell, so it
    must exist before the first call).

    Parameters
    ----------
    run_name : str
        MLflow run name; also used as the prefix of the pickled model file.
    model : object
        Recommender exposing ``fit(interaction_data=..., **model_params)`` and
        ``recommend(items, k, **recommend_params)``.
    model_params : dict
        Logged with ``mlflow.log_params`` and forwarded to ``model.fit``.
    recommend_params : dict
        Forwarded to every ``model.recommend`` call.
    my_favorites : list
        Item ids used for a qualitative check; the recommendations for them are
        logged as the text artifact ``my_recs.txt``.
    k : int, default 10
        Number of recommendations requested; also part of the metric name ``MAP_on_{k}``.

    Returns
    -------
    None
        Everything is reported through MLflow (params, metrics, artifacts).
    """
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(model_params)

        fit_started = time()
        model.fit(interaction_data=df_train.rename({'user_id': 'user', 'item_id': 'item'}, axis=1),
                  **model_params)
        mlflow.log_metric('fit_time', time() - fit_started)

        maps = []
        times = []
        for new_user, new_item in tqdm(test_data):
            # Users absent from the training data get an empty history.
            user_favorites = train_db.get(new_user, [])
            rec_started = time()
            recs = model.recommend(user_favorites, k, **recommend_params)
            times.append(time() - rec_started)
            maps.append(average_single_precision(new_item, recs))

        # With an empty test set there is nothing to average; skip the metrics rather than
        # fail with ZeroDivisionError after the model has already been fitted.
        if maps:
            mlflow.log_metric(f'MAP_on_{k}', sum(maps) / len(maps))
            mlflow.log_metric('time_to_rec', sum(times) / len(times))

        my_recs = model.recommend(my_favorites, k, **recommend_params)

        # Render one "title - directors - year - genres" line per recommended film.
        # A plain boolean mask replaces `query('item_id==@k')`, whose loop variable shadowed
        # the `k` parameter of this function.
        film_columns = ['title', 'directors', 'release_year', 'genres']
        rec_lines = []
        for rec_id in my_recs:
            matches = df_films.loc[df_films['item_id'] == rec_id, film_columns]
            for title, directors, release_year, genres in matches.values:
                # round() raises on NaN, so a missing release year is printed as-is.
                year = round(release_year) if pd.notna(release_year) else release_year
                rec_lines.append(' - '.join(str(part) for part in (title, directors, year, genres)))
        mlflow.log_text(' \n '.join(rec_lines), 'my_recs.txt')

        # NOTE(review): the "_model_series" suffix looks inherited from another domain's
        # notebook - confirm it is intended for the films domain before renaming.
        model_file = model_path / f'{run_name}_model_series.pickle'
        # Create the storage directory on first use so a fresh checkout does not fail here.
        model_file.parent.mkdir(parents=True, exist_ok=True)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)

        mlflow.log_artifact(str(model_file))

In [45]:
# Directory where `experiment` writes the pickled model before logging it as an MLflow artifact
model_path = Path('../src/models/models_storage')

In [46]:
# Hand-picked favourite films used for a qualitative look at each model's recommendations
my_favorites = [
    'f8148',   # Reservoir Dogs
    'f14804',  # Pulp Fiction
    'f11760',  # Requiem for a Dream
    'f3969',   # Pi
    'f1048',   # Black Swan
    'f2866',   # Lock, Stock and Two Smoking Barrels
    'f4482',   # Mr. Nobody
    'f11459',  # Drive
    'f3755',   # Forrest Gump
    'f10696',  # Back to the Future
]

In [47]:
# Catalogue rows for the favourite films, to check the ids map to the intended titles
df_films.loc[df_films['item_id'].isin(my_favorites)]

Unnamed: 0,item_id,title,release_year,genres,countries,age_rating,directors
107,f14804,Криминальное чтиво,1994.0,"зарубежные, триллеры, криминал, комедии",США,18.0,Квентин Тарантино
3123,f11459,Драйв,2011.0,"драмы, зарубежные, криминал",США,18.0,Николас Виндинг Рефн
8609,f4482,Господин никто,2009.0,"драмы, фантастика, фэнтези, мелодрамы","Германия, Бельгия",18.0,Жако Ван Дормель
9706,f3969,Пи,1997.0,"драмы, ужасы, фантастика, триллеры",США,16.0,Даррен Аронофски
9927,f11760,Реквием по мечте,2000.0,драмы,США,18.0,Даррен Аронофски
10785,f3755,Форрест Гамп,1994.0,"драмы, мелодрамы",США,12.0,Роберт Земекис
11420,f2866,"Карты, деньги, два ствола",1999.0,комедии,Великобритания,18.0,Гай Ричи
13279,f1048,Чёрный лебедь,2010.0,"драмы, триллеры",США,16.0,Даррен Аронофски
13803,f8148,Бешеные псы,1991.0,"драмы, триллеры",США,18.0,Квентин Тарантино
15296,f10696,Назад в будущее,1985.0,"фантастика, приключения, комедии",США,6.0,Роберт Земекис


In [55]:
# TopRecommender fitted with metric='count', recommending in deterministic mode
experiment(
    run_name='Top count recommends',
    model=TopRecommender(),
    model_params={'n_to_save': 500, 'metric': 'count'},
    recommend_params={'mode': 'deterministic'},
    my_favorites=my_favorites,
    k=10,
)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3254.88it/s]


In [56]:
# Same TopRecommender configuration, but recommending in probabilistic mode
experiment(
    run_name='Random recommends',
    model=TopRecommender(),
    model_params={'n_to_save': 500, 'metric': 'count'},
    recommend_params={'mode': 'probabilistic'},
    my_favorites=my_favorites,
    k=10,
)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1785.71it/s]


In [58]:
# Item-based CosineDistanceRecommender with default fit parameters, deterministic mode
experiment(
    run_name='CosineDistanceRecommender',
    model=CosineDistanceRecommender(),
    model_params={},
    recommend_params={'mode': 'deterministic'},
    my_favorites=my_favorites,
    k=10,
)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.95it/s]


In [64]:
# CosineDistanceRecommender again, this time recommending in probabilistic mode
experiment(
    run_name='CosineDistanceRecommender + random',
    model=CosineDistanceRecommender(),
    model_params={},
    recommend_params={'mode': 'probabilistic'},
    my_favorites=my_favorites,
    k=10,
)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:27<00:00, 36.11it/s]


In [65]:
# ALSBasedRecommender with its default fit and recommend parameters
experiment(
    run_name='ALS Recommender',
    model=ALSBasedRecommender(),
    model_params={},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  "Intel MKL BLAS detected. Its highly recommend to set the environment "


  0%|          | 0/15 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:09<00:00, 110.88it/s]


In [48]:
# Shift every training rating by +2 (the preview of the raw data shows -1/1, which become 1/3).
# NOTE(review): presumably done so the ALS model only sees positive values - confirm.
# The shifted column is derived from the untouched `df_interactions` rows rather than
# incremented in place, so re-running this cell cannot shift the ratings a second time.
df_train = df_train.assign(rating=df_interactions.loc[df_train.index, 'rating'] + 2)

In [67]:
# ALS with default parameters, for the run made after the +2 rating shift of df_train
experiment(
    run_name='ALS Recommender + changed DS',
    model=ALSBasedRecommender(),
    model_params={},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:10<00:00, 98.29it/s]


In [68]:
# ALS on the shifted ratings: factors=10, regularization=0.5, 30 iterations
experiment(
    run_name='ALS Recommender + changed DS',
    model=ALSBasedRecommender(),
    model_params={'factors': 10, 'regularization': 0.5, 'iterations': 30},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:09<00:00, 102.19it/s]


In [69]:
# ALS on the shifted ratings: factors=50, regularization=1, 30 iterations
experiment(
    run_name='ALS Recommender + changed DS',
    model=ALSBasedRecommender(),
    model_params={'factors': 50, 'regularization': 1, 'iterations': 30},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 114.63it/s]


In [70]:
# ALS on the shifted ratings: factors=20, regularization=3, 50 iterations
experiment(
    run_name='ALS Recommender + changed DS',
    model=ALSBasedRecommender(),
    model_params={'factors': 20, 'regularization': 3, 'iterations': 50},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/50 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 124.39it/s]


In [71]:
# ALS on the shifted ratings: factors=200, regularization=1, 50 iterations
experiment(
    run_name='ALS Recommender + changed DS',
    model=ALSBasedRecommender(),
    model_params={'factors': 200, 'regularization': 1, 'iterations': 50},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 63.87it/s]


### Подбор гиперпараметров

In [18]:
# Hold out 1000 training interactions as the validation set for hyperparameter search.
# A fixed seed keeps the split stable, so Optuna studies are comparable between sessions.
val_samples = np.random.default_rng(43).choice(len(df_train), size=1000, replace=False)

# Positions select the validation rows; their index labels are then removed from the rest.
df_train_val = df_train.iloc[val_samples]
df_train_train = df_train.drop(df_train_val.index)

In [19]:
# {user_id: [item_id, ...]} lookup built from the reduced training set (validation rows excluded).
# Aggregating straight into lists avoids the join/split round trip through a ', '-separated
# string (slow via iterrows, and wrong for any id that itself contains ', ').
train_db_new = df_train_train.groupby('user_id')['item_id'].agg(list).to_dict()

In [20]:
# Objective function for the hyperparameter search
def objective(trial):
    """Refit the global ``model`` with the trial's parameters and return its validation score.

    Samples ``factors`` (int, 5..200) and ``regularization`` (float, 0..20) from ``trial``,
    fits ``model`` on ``df_train_train`` for 30 iterations, then requests 10 recommendations
    for each (user, item) pair in ``df_train_val`` and averages
    ``average_single_precision`` over those pairs.

    Reads the notebook globals ``model``, ``df_train_train``, ``df_train_val`` and
    ``train_db_new``; ``model`` is created in a later cell and must exist before the
    study starts.
    """
    search_point = {
        'factors': trial.suggest_int("factors", 5, 200),
        'regularization': trial.suggest_float("regularization", 0, 20),
    }

    fit_frame = df_train_train.rename({'user_id': 'user', 'item_id': 'item'}, axis=1)
    model.fit(interaction_data=fit_frame, **search_point, iterations=30)

    # Users missing from the reduced training set are scored with an empty history.
    scores = [
        average_single_precision(held_out_item, model.recommend(train_db_new.get(user, []), 10))
        for user, held_out_item in df_train_val[['user_id', 'item_id']].values
    ]

    return sum(scores) / len(scores)

In [21]:
# Recommender instance read as a global by `objective`, which refits it on every trial
model = ALSBasedRecommender()

In [22]:
# Search for the parameters that maximise the value returned by `objective`, over 40 trials
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=40)

[32m[I 2023-04-18 13:21:02,147][0m A new study created in memory with name: no-name-13eec682-6dc4-49c1-ade2-557a382cd8af[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:23:58,403][0m Trial 0 finished with value: 0.00535535714285714 and parameters: {'factors': 98, 'regularization': 6.672605043737711}. Best is trial 0 with value: 0.00535535714285714.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:26:37,229][0m Trial 1 finished with value: 0.0051353174603174555 and parameters: {'factors': 102, 'regularization': 3.778660492292516}. Best is trial 0 with value: 0.00535535714285714.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:30:10,179][0m Trial 2 finished with value: 0.005113373015873011 and parameters: {'factors': 137, 'regularization': 9.134532017165212}. Best is trial 0 with value: 0.00535535714285714.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:31:11,386][0m Trial 3 finished with value: 0.007372103174603161 and parameters: {'factors': 5, 'regularization': 1.4987281575143663}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:33:41,385][0m Trial 4 finished with value: 0.005553690476190471 and parameters: {'factors': 91, 'regularization': 10.880985662030596}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:34:57,284][0m Trial 5 finished with value: 0.004868134920634914 and parameters: {'factors': 27, 'regularization': 5.240495333320672}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:35:56,101][0m Trial 6 finished with value: 0.005333015873015868 and parameters: {'factors': 10, 'regularization': 8.157105460246276}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:38:47,001][0m Trial 7 finished with value: 0.005227817460317458 and parameters: {'factors': 115, 'regularization': 15.083078367930657}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:41:30,707][0m Trial 8 finished with value: 0.005022499999999996 and parameters: {'factors': 134, 'regularization': 7.823498448654853}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:44:38,819][0m Trial 9 finished with value: 0.004770119047619045 and parameters: {'factors': 154, 'regularization': 4.571326758106212}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:48:48,761][0m Trial 10 finished with value: 0.004923134920634918 and parameters: {'factors': 189, 'regularization': 1.877306617967399}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:50:13,280][0m Trial 11 finished with value: 0.005130119047619042 and parameters: {'factors': 55, 'regularization': 0.08158122073573537}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:51:29,215][0m Trial 12 finished with value: 0.00484646825396825 and parameters: {'factors': 60, 'regularization': 13.101630956811313}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:53:14,149][0m Trial 13 finished with value: 0.005223293650793645 and parameters: {'factors': 64, 'regularization': 12.15988946538011}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:54:34,648][0m Trial 14 finished with value: 0.005036111111111102 and parameters: {'factors': 33, 'regularization': 19.585928180505377}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:57:13,732][0m Trial 15 finished with value: 0.0055520238095238044 and parameters: {'factors': 82, 'regularization': 0.03550220477643706}. Best is trial 3 with value: 0.007372103174603161.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:58:04,227][0m Trial 16 finished with value: 0.007771190476190463 and parameters: {'factors': 6, 'regularization': 10.704300633455523}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 13:58:48,810][0m Trial 17 finished with value: 0.007758333333333319 and parameters: {'factors': 5, 'regularization': 6.270995751097528}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:00:05,151][0m Trial 18 finished with value: 0.004799880952380946 and parameters: {'factors': 35, 'regularization': 9.819390138222932}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:01:21,653][0m Trial 19 finished with value: 0.004700674603174599 and parameters: {'factors': 27, 'regularization': 6.967374725586941}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:05:57,164][0m Trial 20 finished with value: 0.004627420634920634 and parameters: {'factors': 191, 'regularization': 5.995348502786856}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:06:49,947][0m Trial 21 finished with value: 0.007213650793650783 and parameters: {'factors': 6, 'regularization': 2.8998027089442138}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:07:43,624][0m Trial 22 finished with value: 0.0075706746031745905 and parameters: {'factors': 5, 'regularization': 1.9775115478008432}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:09:31,528][0m Trial 23 finished with value: 0.005299285714285708 and parameters: {'factors': 50, 'regularization': 4.307182119822377}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:10:39,706][0m Trial 24 finished with value: 0.004548849206349202 and parameters: {'factors': 20, 'regularization': 5.913854948283012}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:12:30,582][0m Trial 25 finished with value: 0.005341230158730153 and parameters: {'factors': 47, 'regularization': 3.02068678769128}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:14:42,121][0m Trial 26 finished with value: 0.005371507936507931 and parameters: {'factors': 75, 'regularization': 8.610281704701972}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:15:46,067][0m Trial 27 finished with value: 0.004572539682539681 and parameters: {'factors': 17, 'regularization': 5.499655164395529}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:17:21,412][0m Trial 28 finished with value: 0.005191666666666663 and parameters: {'factors': 39, 'regularization': 7.013371587694641}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:18:20,333][0m Trial 29 finished with value: 0.004674722222222219 and parameters: {'factors': 17, 'regularization': 6.763057550345998}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:20:00,167][0m Trial 30 finished with value: 0.005393730158730153 and parameters: {'factors': 70, 'regularization': 3.973100116938614}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:20:45,139][0m Trial 31 finished with value: 0.0075711111111111005 and parameters: {'factors': 5, 'regularization': 2.24475340609706}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:21:32,771][0m Trial 32 finished with value: 0.007456190476190467 and parameters: {'factors': 6, 'regularization': 2.032717927445736}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:22:38,550][0m Trial 33 finished with value: 0.004700317460317456 and parameters: {'factors': 22, 'regularization': 3.248395205429092}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:23:50,961][0m Trial 34 finished with value: 0.005105515873015866 and parameters: {'factors': 44, 'regularization': 0.9337992127549315}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:24:41,712][0m Trial 35 finished with value: 0.007657023809523796 and parameters: {'factors': 5, 'regularization': 2.3933689397027074}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:25:53,389][0m Trial 36 finished with value: 0.005081944444444437 and parameters: {'factors': 30, 'regularization': 4.73505202807431}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:26:51,989][0m Trial 37 finished with value: 0.004769087301587299 and parameters: {'factors': 13, 'regularization': 3.640101597809995}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:27:50,826][0m Trial 38 finished with value: 0.004839880952380949 and parameters: {'factors': 24, 'regularization': 9.04561607693111}. Best is trial 16 with value: 0.007771190476190463.[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-04-18 14:30:13,790][0m Trial 39 finished with value: 0.005313888888888883 and parameters: {'factors': 103, 'regularization': 5.209907133441384}. Best is trial 16 with value: 0.007771190476190463.[0m


In [23]:
# Parameters of the best trial found by the study
study.best_params

{'factors': 6, 'regularization': 10.704300633455523}

In [49]:
# ALS refitted with the best parameters reported by the Optuna study (values copied by hand)
experiment(
    run_name='ALS Recommender + changed DS + optuna',
    model=ALSBasedRecommender(),
    model_params={'factors': 6, 'regularization': 10.704300633455523, 'iterations': 30},
    recommend_params={},
    my_favorites=my_favorites,
    k=10,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:09<00:00, 100.94it/s]
