In [14]:
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from service.api.recsys.userknn import UserKnn
from service.api.metrcis_validation.GridSearchCV import GridSearchCV
from dotenv import load_dotenv
from rectools.metrics import calc_metrics
import os
import pickle
import json
from tqdm.auto import tqdm

from service.api.recsys.PopularSocialDem import PopularSocialDem
from service.api.metrcis_validation.metrics import CrossValScore
import pandas as pd
from rectools import Columns
from rectools.models.popular_in_category import PopularInCategoryModel
from rectools.dataset import Dataset
from rectools.dataset import Interactions
from rectools.metrics.classification import Recall
from rectools.metrics.ranking import MAP
from rectools.metrics.serendipity import Serendipity
from rectools.metrics.novelty import MeanInvUserFreq
from rectools.model_selection import TimeRangeSplitter

In [15]:
interactions_df = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/interactions.csv')
items = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/items.csv')
users = pd.read_csv('/Users/tanchik/Desktop/Настоящее/учеба/RecSys/RecoServiceTemplate/kion_train/users.csv')

In [None]:
load_dotenv()
interactions_df = pd.read_csv(os.getenv('INTERACTIONS'))
items = pd.read_csv(os.getenv('ITEMS'))
users = pd.read_csv(os.getenv('USERS'))

In [16]:
models = {'User Knn': UserKnn(CosineRecommender(K=50)),
          'Popular Social Dem': PopularSocialDem(users=users)}
metrics = {
    'Recall@10': Recall(k=10),
    'MAP@10': MAP(k=10),
    'Serendipity10': Serendipity(k=10),
    'MeanInvUserFreq10': MeanInvUserFreq(k=10)
}

In [17]:
interactions_df = interactions_df.drop(columns='total_dur')
interactions_df['watched_pct'] = interactions_df['watched_pct'] / 100
interactions_df.rename(columns={'user_id': Columns.User, 'item_id': Columns.Item,
                             'last_watch_dt': Columns.Datetime, 'watched_pct': Columns.Weight}, inplace=True)

In [18]:
interactions = Interactions(interactions_df)

In [19]:
n_splits = 1

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

cv.get_test_fold_borders(interactions)

[(Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

In [13]:
cross_val = CrossValScore(models, metrics, cv, interactions)
cross_val.evaluate(n_splits = 1)

  0%|          | 0/1 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 3919,
 'test_items': 1556,
 'test_users': 3470,
 'train': 184359,
 'train_items': 7567,
 'train_users': 138462}




  0%|          | 0/138462 [00:00<?, ?it/s]

Unnamed: 0,model,training_time,Recall@10,MAP@10,MeanInvUserFreq10,Serendipity10
0,Popular Social Dem,28.251833,0.07195,0.028616,5.559702,1e-06
1,User Knn,6.293869,0.002305,0.000323,10.119112,3e-06


### Обучить модель по отдельности

In [20]:
knn = UserKnn(CosineRecommender(K=50))
knn.fit(df)



  0%|          | 0/962179 [00:00<?, ?it/s]

In [21]:
|with open('knn.pkl', 'wb') as f:
    pickle.dump(knn, f)

In [6]:
pop = PopularSocialDem(users)
pop.fit(interactions.df)

In [7]:
users_list = list(interactions.df['user_id'].unique())
users_recommendations = {}
for user in tqdm(users_list):
    users_recommendations[int(user)] = list(pop.predict(user=user, df=False))

  0%|          | 0/962179 [00:00<?, ?it/s]

In [11]:
with open('./popular_recommendations.json', 'w') as f:
    json.dump(users_recommendations, f)

### Холодных и теплых пользователей дополнять популярными в разрезе пола и возраста

In [None]:
def get_recommendations(user: int, pop: PopularSocialDem, knn: UserKnn, n=10):
    recommendations = knn.recommend(user)
    if len(recommendations) < n:
        recommendations = recommendations + list(pop.predict(user=user, n_rec=(n - len(recommendations)), df=False))
    return recommendations[:n]

get_recommendations(1016458, pop, knn)

### Тюнинг параметров модели userknn

In [None]:
param_grid = {
    'model': [CosineRecommender, TFIDFRecommender, BM25Recommender],
    'K': [10, 20, 50, 100]
}

n_splits = 1

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

cross_val = CrossValScore(models, metrics, cv, interactions)

grid_search = GridSearchCV(cross_val, param_grid)
grid_search.search(n_splits)

In [19]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.000999382461918485
{'K': 50, 'model': <class 'implicit.nearest_neighbours.TFIDFRecommender'>}


### Обучение с разными пропорциями популярных

In [45]:
max_date = df['datetime'].max()

train = df[(df['datetime'] < max_date - pd.Timedelta(days=7))]
test = df[(df['datetime'] >= max_date - pd.Timedelta(days=7))]

test = test[test['user_id'].isin(train['user_id'].unique())]
catalog = train[Columns.Item].unique()

In [None]:
knn.fit(train)
pop.fit(train)

In [50]:
def get_knn_popular_recommendation(test, train, catalog, knn, pop, pop_procent, n=10):
    pop_n = int(n * (pop_procent/100))
    knn_recommendations = knn.predict(test, N_recs=int(n-pop_n))
    pop_recommendations = pop.predict(user=test, n_rec=pop_n)
    df = pd.concat([knn_recommendations, pop_recommendations])
    df["rank"] = df.groupby("user_id").cumcount() + 1
    metric_values = calc_metrics(
                    metrics, reco=df, interactions=test, prev_interactions=train, catalog=catalog)
    print(metric_values)
print('10% популярных')
get_knn_popular_recommendation(test, train, catalog, knn, pop, 10, n=10)
print('20% популярных')
get_knn_popular_recommendation(test, train, catalog, knn, pop, 20, n=10)
print('30% популярных')
get_knn_popular_recommendation(test, train, catalog, knn, pop, 30, n=10)
print('40% популярных')
get_knn_popular_recommendation(test, train, catalog, knn, pop, 40, n=10)
print('50% популярных')
get_knn_popular_recommendation(test, train, catalog, knn, pop, 50, n=10)
print('100% популярных')
get_knn_popular_recommendation(test, train, catalog, knn, pop, 100, n=10)

10% популярных
{'Recall@10': 0.01354346220142822, 'MAP@10': 0.004397032555121696, 'MeanInvUserFreq10': 8.662936701245114, 'Serendipity10': 3.338864716106809e-06}
20% популярных
{'Recall@10': 0.023639497660674712, 'MAP@10': 0.006678021286492106, 'MeanInvUserFreq10': 7.9064242690040105, 'Serendipity10': 3.1359515729495627e-06}
30% популярных
{'Recall@10': 0.034474267421817285, 'MAP@10': 0.008789570166078415, 'MeanInvUserFreq10': 7.493000968686884, 'Serendipity10': 1.842758854619325e-06}
40% популярных
{'Recall@10': 0.043585323811869, 'MAP@10': 0.01030934987472786, 'MeanInvUserFreq10': 7.134606107137818, 'Serendipity10': 1.0539369226672293e-06}
50% популярных
{'Recall@10': 0.04752524008864811, 'MAP@10': 0.011581077066794869, 'MeanInvUserFreq10': 6.895923197173432, 'Serendipity10': 2.4896270071261795e-07}
100% популярных
{'Recall@10': 0.06866124928178609, 'MAP@10': 0.02701334412119933, 'MeanInvUserFreq10': 5.56080911926691, 'Serendipity10': 1.1643626745117612e-06}


### Попытки использовать юзер фичи в rectools

In [None]:
age_df = pd.DataFrame({'id': list(users['user_id']), 'value': list(users['age']), 'feature': ['age'] * users.shape[0]})
sex_df = pd.DataFrame({'id': list(users['user_id']), 'value': list(users['sex']), 'feature': ['sex'] * users.shape[0]})
sex_age_df = pd.concat([age_df, sex_df], axis=0)
sex_age_df = sex_age_df.dropna(axis=0, how="any")

sex_age_df = sex_age_df[sex_age_df['id'].isin(list(interactions_df['user_id']))]

In [None]:
dataset_feature = Dataset.construct(
    interactions_df=interactions_df,
    user_features_df=sex_age_df,
    item_features_df=None,
    cat_item_features=['age', 'sex']
)


In [None]:
pop_cat = PopularInCategoryModel(category_feature='genre', n_categories=5)
pop_cat.fit(dataset_feature);
