## Подключим библиотеки, загрузим данные и определим метрики:

In [45]:
# ACHTUNG! DO NOT TOUCH 

def ndcg_metric(gt_items: np.ndarray, predicted: np.ndarray) -> float:
    at = len(predicted)
    relevance = np.array([1 if x in predicted else 0 for x in gt_items])
    # DCG uses the relevance of the recommended items
    rank_dcg = dcg(relevance)
    if rank_dcg == 0.0:
        return 0.0

    # IDCG has all relevances to 1 (or the values provided), up to the number of items in the test set that can fit in the list length
    ideal_dcg = dcg(np.sort(relevance)[::-1][:at])

    if ideal_dcg == 0.0:
        return 0.0

    ndcg_ = rank_dcg / ideal_dcg

    return ndcg_


def dcg(scores: np.ndarray) -> float:
    return np.sum(
        np.divide(np.power(2, scores) - 1, np.log2(np.arange(scores.shape[0], dtype=np.float64) + 2)), dtype=np.float64
    )


def recall_metric(gt_items: np.ndarray, predicted: np.ndarray) -> float:
    n_gt = len(gt_items)
    intersection = len(set(gt_items).intersection(set(predicted)))
    return intersection / n_gt


def evaluate_recommender(df: pd.DataFrame, model_preds_col: str, gt_col: str = "movie_id") -> Dict[str, float]:
    metric_values = []

    for _, row in df.iterrows():
        metric_values.append(
            (ndcg_metric(row[gt_col], row[model_preds_col]), recall_metric(row[gt_col], row[model_preds_col]))
        )

    return {"ndcg": np.mean([x[0] for x in metric_values]), "recall": np.mean([x[1] for x in metric_values])}

In [46]:
from abc import ABC, abstractmethod
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from catboost import CatBoostRegressor

In [47]:
data_folder = "data/"

users_df = pd.read_csv(data_folder + "users_df.csv")
items_df = pd.read_csv(data_folder + "items_df.csv")

countries = pd.read_csv(data_folder + "countries.csv")
genres = pd.read_csv(data_folder + "genres.csv")
staff = pd.read_csv(data_folder + "staff.csv")

train_part = pd.read_csv(data_folder + "train_data.csv")#, parse_dates=["datetime"])
test_part = pd.read_csv(data_folder + "test_data.csv")
test_part = test_part.groupby("user_id").agg({"movie_id": list}).reset_index()
movie_duration = pd.read_csv('data/movies_durations_5175.csv')

In [48]:
duration = lambda movie_id: train_part[train_part['movie_id'] == movie_id]['duration'].quantile(0.95)

In [49]:
items_df['movies_duration'] = items_df['id'].apply(duration)

In [50]:
items_df = items_df[items_df['movies_duration'] >= 0]

In [51]:
import ast
items_df['genres'] = items_df['genres'].apply(lambda x: ast.literal_eval(x))
items_df['countries'] = items_df['countries'].apply(lambda x: ast.literal_eval(x))

In [52]:
train_part['movie_id'] = train_part['movie_id'].astype(int)

In [53]:
items_df

Unnamed: 0,id,title,year,date_publication,description,genres,countries,staff,title_orig,age_rating,keywords,movies_duration
0,0,"Мама, я дома",2022-01-01,2022-11-23T00:00:00,Где-то в глубинке вместе с дочерью и внуком жи...,[97],[238],"[1883, 33655, 25890, 1001, 12051, 10110, 16895]",,,,6073.15
3,3,Капитан,2017-01-01,2022-10-20T00:00:00,Вторая мировая война подходит к концу. В это в...,"[97, 303, 143, 319]","[188, 212, 0]","[16006, 12217, 30668, 28806, 16172, 5045, 1663...",,,,6846.60
4,4,Лазурный берег,2015-01-01,2023-01-12T00:00:00,У бывшей танцовщицы Ванессы и писателя Роланда...,"[138, 97]","[1, 102, 0]","[23586, 8823, 8040, 34555, 32484, 18446]",By the Sea,18.0,"Франция, отель, 1970-е, горе, путешествие, нес...",7031.00
5,5,Просто Джексон,2012-01-01,2023-02-01T00:00:00,Майор Евгений Иванов по прозвищу Джексон напад...,[234],[238],"[17317, 19228, 35448, 17666, 11726, 32044, 266...",,,,5527.40
6,6,Всероссийская лига юмора. Выпуск №3. Саранск,2021-01-01,2021-05-31T00:00:00,Смотри новый выпуск в «Большом эфире» по суббо...,[320],[238],"[32576, 25218, 22952, 2749, 17268, 28056, 32322]",,,,480.05
...,...,...,...,...,...,...,...,...,...,...,...,...
7419,7419,"Фиби Робинсон: простите, Гарриет Табмен",2021-01-01,,"Новый спешл комедиантки Фиби Робинсон, где она...",[362],[102],[],,,,349.30
7422,7422,Ромео + Джульетта,1996-01-01,2023-03-03T00:00:00,"Переосмысление знаменитой истории Шекспира, пе...","[138, 97]","[49, 102]","[31732, 12514, 5845, 12650, 23433, 28088, 64, ...",Romeo + Juliet,12.0,"запретная любовь, основанная на пьесе или мюзи...",6670.40
7423,7423,"Поймай меня, если сможешь",2002-01-01,2020-03-27T07:13:01.449866,"Фрэнк Эбэгнейл успел поработать врачом, адвока...","[85, 303]","[121, 102]","[5166, 24667, 6277, 8266, 4854, 2186, 29049, 6...",Catch Me If You Can,12.0,"ФБР, аферист, биография, основано на реальных ...",8092.00
7425,7425,Юлия Колерт — «Окна»,2023-01-01,2023-06-01T00:00:00,Юлия Колерт \n«Окна»\nКоличество просмотров*: ...,[247],[238],[],,,,174.50


In [54]:
items_df['year'] = items_df['year'].apply(lambda x: x[:4]).astype(int)

In [55]:
movies = items_df[['id', 'genres', 'countries',  'year', 'movies_duration']]

In [56]:
movies.head(5)

Unnamed: 0,id,genres,countries,year,movies_duration
0,0,[97],[238],2022,6073.15
3,3,"[97, 303, 143, 319]","[188, 212, 0]",2017,6846.6
4,4,"[138, 97]","[1, 102, 0]",2015,7031.0
5,5,[234],[238],2012,5527.4
6,6,[320],[238],2021,480.05


In [57]:
countries = movies['countries'].explode()
dummies = pd.get_dummies(countries).groupby(level=0).sum()
countries_list = list(map(str, list(dummies.columns)))
countries_list = ['countries' + str(i) for i in countries_list]
dummies.columns = countries_list
movies = movies.drop(columns='countries').join(dummies)

In [58]:
genres = movies['genres'].explode()
dummies = pd.get_dummies(genres).groupby(level=0).sum()
genres_list = list(map(str, list(dummies.columns)))
genres_list = ['genres' + str(i) for i in genres_list]
dummies.columns = genres_list
movies = movies.drop(columns='genres').join(dummies)

In [59]:
# staff = movies['staff'].explode()
# dummies = pd.get_dummies(staff).groupby(level=0).sum()
# staff_list = list(map(str, list(dummies.columns)))
# staff_list = ['staff' + str(i) for i in staff_list]
# dummies.columns = staff_list
# movies = movies.drop(columns='staff').join(dummies)
movies.head()

Unnamed: 0,id,year,movies_duration,countries0,countries1,countries5,countries8,countries10,countries11,countries13,...,genres333,genres336,genres341,genres348,genres349,genres355,genres356,genres362,genres366,genres367
0,0,2022,6073.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,2017,6846.6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,2015,7031.0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,2012,5527.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,2021,480.05,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
top_15_countries = movies[countries_list].sum().nlargest(10).index.to_list()
top_15_genres = movies[genres_list].sum().nlargest(10).index.to_list()

In [61]:
train_part = train_part.merge(movies[['id','year', 'movies_duration']+top_15_countries+top_15_genres], left_on = 'movie_id', right_on = 'id', how = 'left')

In [62]:
train_part.head()

Unnamed: 0,id_x,datetime,user_id,movie_id,duration,is_train,id_y,year,movies_duration,countries102,...,genres97,genres46,genres85,genres303,genres294,genres138,genres158,genres326,genres266,genres302
0,0,2023-04-06 15:00:00.071114+03:00,10250,427,485.0,True,427,2021,2970.8,1,...,1,0,0,0,0,0,0,0,0,0
1,1,2023-04-06 15:00:01.123928+03:00,2062,1521,129.0,True,1521,2022,5035.0,1,...,0,1,0,0,0,0,0,0,1,0
2,2,2023-04-06 15:00:03.957246+03:00,12980,4598,2795.0,True,4598,2006,5220.0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,2023-04-06 15:00:04.990565+03:00,30646,5324,5094.0,True,5324,1998,10374.0,1,...,1,0,0,0,0,1,0,0,0,1
4,4,2023-04-06 15:00:10.495017+03:00,43069,4291,75.0,True,4291,2007,5758.0,0,...,1,0,0,0,0,0,0,1,1,0


In [63]:
train_part_films = train_part.copy()
train_part_films['part_duration'] = train_part_films['duration'] / train_part_films['movies_duration']
train_part_films['part_duration'] = train_part_films['part_duration'].apply(lambda x: 1.0 if x > 1 else x)
# train_part_films.groupby('movies_duration').rank(method = 'mean')
train_part_films['mean_duration'] = train_part_films.groupby('movie_id')['part_duration'].transform('mean')
train_part_films['count_watched'] = train_part_films.groupby('movie_id')['part_duration'].transform('count')
train_part_films['count_watched'] = train_part_films['count_watched'] * train_part_films['mean_duration']
train_part_films['quantile25'] = train_part_films.groupby('movie_id')['part_duration'].transform(lambda x: x.quantile(0.25))
train_part_films['quantile50'] = train_part_films.groupby('movie_id')['part_duration'].transform(lambda x: x.quantile(0.5))
train_part_films['quantile75'] = train_part_films.groupby('movie_id')['part_duration'].transform(lambda x: x.quantile(0.75))
# train_part_films[''] = train_part_films.groupby('movie_id')['part_duration'].transform('mean')

In [67]:
print(train_part_films.shape)
train_part_films.head()

(1251871, 35)


Unnamed: 0,id_x,datetime,user_id,movie_id,duration,is_train,id_y,year,movies_duration,countries102,...,genres158,genres326,genres266,genres302,part_duration,mean_duration,count_watched,quantile25,quantile50,quantile75
0,0,2023-04-06 15:00:00.071114+03:00,10250,427,485.0,True,427,2021,2970.8,1,...,0,0,0,0,0.163256,0.150694,28.481217,0.008752,0.053858,0.149118
1,1,2023-04-06 15:00:01.123928+03:00,2062,1521,129.0,True,1521,2022,5035.0,1,...,0,0,1,0,0.025621,0.575203,1999.979146,0.149752,0.700894,0.943396
2,2,2023-04-06 15:00:03.957246+03:00,12980,4598,2795.0,True,4598,2006,5220.0,1,...,0,0,0,0,0.535441,0.601789,501.290421,0.203065,0.754406,0.949234
3,3,2023-04-06 15:00:04.990565+03:00,30646,5324,5094.0,True,5324,1998,10374.0,1,...,0,0,0,1,0.491035,0.518711,161.318971,0.11095,0.477829,0.967997
4,4,2023-04-06 15:00:10.495017+03:00,43069,4291,75.0,True,4291,2007,5758.0,0,...,0,1,1,0,0.013025,0.37913,270.319382,0.027266,0.194338,0.90066


In [68]:
# user_genres = train_part_films[genres_list + ['user_id']].groupby('user_id').sum().reset_index()
user_genres = train_part_films[train_part_films['part_duration'] >= 0.8][top_15_genres + ['user_id']].groupby('user_id').sum().reset_index()

sum_genres = user_genres.groupby('user_id').sum().reset_index()[top_15_genres].sum(axis=1)

for g in top_15_genres:
    user_genres[g] /= sum_genres
genres_list_user = ['U' + i for i in top_15_genres]
user_genres.columns = ['user_id'] + genres_list_user
user_genres
# здесь можно добавить фичи из users_df

Unnamed: 0,user_id,Ugenres97,Ugenres46,Ugenres85,Ugenres303,Ugenres294,Ugenres138,Ugenres158,Ugenres326,Ugenres266,Ugenres302
0,0,,,,,,,,,,
1,3,0.000,0.000000,0.0,0.0,0.000000,0.000,1.000000,0.0,0.000000,0.000000
2,4,0.000,0.210526,0.0,0.0,0.105263,0.000,0.368421,0.0,0.157895,0.157895
3,5,1.000,0.000000,0.0,0.0,0.000000,0.000,0.000000,0.0,0.000000,0.000000
4,6,0.375,0.500000,0.0,0.0,0.000000,0.125,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
99100,263826,0.000,0.500000,0.0,0.0,0.000000,0.500,0.000000,0.0,0.000000,0.000000
99101,263845,,,,,,,,,,
99102,263848,,,,,,,,,,
99103,263850,0.000,0.000000,0.0,0.0,0.000000,0.000,0.000000,1.0,0.000000,0.000000


In [69]:
for column in users_df.columns:
    most_frequent = users_df[column].mode()[0]  # Получаем самое частое значение
    users_df[column].fillna(most_frequent, inplace=True) 

In [70]:
# user_genres = train_part_films[genres_list + ['user_id']].groupby('user_id').sum().reset_index()
user_countries = train_part_films[train_part_films['part_duration'] >= 0.8][top_15_countries + ['user_id']].groupby('user_id').sum().reset_index()

sum_countries = user_countries.groupby('user_id').sum().reset_index()[top_15_countries].sum(axis=1)

for g in top_15_countries:
    user_countries[g] /= sum_countries
countries_list_user = ['U' + i for i in top_15_countries]
user_countries.columns = ['user_id'] + countries_list_user
user_countries
# здесь можно добавить фичи из users_df

Unnamed: 0,user_id,Ucountries102,Ucountries238,Ucountries205,Ucountries122,Ucountries0,Ucountries188,Ucountries121,Ucountries117,Ucountries104,Ucountries175
0,0,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,3,0.000000,0.250000,0.750000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,4,0.567568,0.027027,0.000000,0.054054,0.108108,0.0,0.081081,0.108108,0.027027,0.027027
3,5,,,,,,,,,,
4,6,0.000000,0.111111,0.222222,0.111111,0.444444,0.0,0.000000,0.000000,0.000000,0.111111
...,...,...,...,...,...,...,...,...,...,...,...
99100,263826,0.000000,0.000000,0.000000,0.500000,0.000000,0.5,0.000000,0.000000,0.000000,0.000000
99101,263845,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000
99102,263848,0.500000,0.000000,0.000000,0.000000,0.000000,0.0,0.500000,0.000000,0.000000,0.000000
99103,263850,0.333333,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.333333,0.333333,0.000000


In [71]:
train_part_films = train_part_films[train_part_films['movies_duration'] > 0]

In [72]:
train_part_films['part_duration'].mean()

0.4531739121611301

In [73]:
# train_part_films_users_filtred['bin_target'] = train_part_films_users_filtred['part_duration'].apply(lambda x: 1 if x >= 0.98 else 0)
users_df['kids_flg'] = users_df['kids_flg'].astype(int)

In [74]:
train_part_films_users = train_part_films.merge(user_genres, on = 'user_id', how = 'left').dropna()
train_part_films_users = train_part_films_users.merge(user_countries, on = 'user_id', how = 'left').dropna()
train_part_films_users = train_part_films_users.merge(users_df, on = 'user_id', how = 'left').dropna()

In [77]:
print(train_part_films_users.shape)
train_part_films_users.head()

(992620, 60)


Unnamed: 0,id_x,datetime,user_id,movie_id,duration,is_train,id_y,year,movies_duration,countries102,...,Ucountries188,Ucountries121,Ucountries117,Ucountries104,Ucountries175,age_category,income,sex,kids_flg,education
0,0,2023-04-06 15:00:00.071114+03:00,10250,427,485.0,True,427,2021,2970.8,1,...,0.0,0.0,0.125,0.0,0.0,35-44,средний,Женский,0,Высшее
1,1,2023-04-06 15:00:01.123928+03:00,2062,1521,129.0,True,1521,2022,5035.0,1,...,0.0,0.0,0.066667,0.0,0.066667,18-24,средний,Женский,0,Среднее
2,2,2023-04-06 15:00:03.957246+03:00,12980,4598,2795.0,True,4598,2006,5220.0,1,...,0.0,0.0,0.0,0.0,0.0,35-44,средний,Женский,1,Неполное высшее
3,3,2023-04-06 15:00:04.990565+03:00,30646,5324,5094.0,True,5324,1998,10374.0,1,...,0.0,0.038462,0.038462,0.038462,0.038462,35-44,низкий,Женский,1,Высшее
4,4,2023-04-06 15:00:10.495017+03:00,43069,4291,75.0,True,4291,2007,5758.0,0,...,0.0,0.0,0.0,0.0,0.083333,25-34,средний,Женский,0,Высшее


In [76]:
train_part_films_users_filtred = train_part_films_users.drop(['is_train', 'id_x', 'id_y', 'datetime','user_id','movie_id', 'user_id', 'duration'], axis=1)

In [32]:
train_part_films_users_filtred.columns[:110]

Index(['year', 'movies_duration', 'countries102', 'countries238',
       'countries205', 'countries122', 'countries0', 'countries188',
       'countries121', 'countries117', 'countries104', 'countries175',
       'genres97', 'genres46', 'genres85', 'genres303', 'genres294',
       'genres138', 'genres158', 'genres326', 'genres266', 'genres302',
       'part_duration', 'mean_duration', 'count_watched', 'quantile25',
       'quantile50', 'quantile75', 'Ugenres97', 'Ugenres46', 'Ugenres85',
       'Ugenres303', 'Ugenres294', 'Ugenres138', 'Ugenres158', 'Ugenres326',
       'Ugenres266', 'Ugenres302', 'Ucountries102', 'Ucountries238',
       'Ucountries205', 'Ucountries122', 'Ucountries0', 'Ucountries188',
       'Ucountries121', 'Ucountries117', 'Ucountries104', 'Ucountries175',
       'age_category', 'income', 'sex', 'kids_flg', 'education'],
      dtype='object')

In [33]:
train_part_films_users_filtred['year'] = train_part_films_users_filtred['year'].astype(int)

In [34]:
%%time
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

cat_features = ['age_category', 'income', 'sex', 'kids_flg', 'education']

X_train, X_test, y_train, y_test = train_test_split(train_part_films_users_filtred.drop(['part_duration'], axis=1), 
                                                    train_part_films_users_filtred['part_duration'],
                                                    test_size = 0.2, random_state = 42)

# Инициализация CatBoostRegressor
model = CatBoostRegressor(iterations=600, learning_rate=0.1, depth=8, cat_features = cat_features, 
                          early_stopping_rounds = 50, 
                          thread_count=-1,)

# Обучение модели
model.fit(X_train, y_train, verbose = 100)

# Прогнозирование на тестовой выборке
y_pred = model.predict(X_test)

# Оценка модели
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')

0:	learn: 0.4041951	total: 347ms	remaining: 3m 27s
100:	learn: 0.3414000	total: 25.5s	remaining: 2m 6s
200:	learn: 0.3341074	total: 50s	remaining: 1m 39s


KeyboardInterrupt: 

In [35]:
importances = model.get_feature_importance()
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df.head(60)

CatBoostError: Model has no meta information needed to calculate feature importances.                             Pass training dataset to this function.

In [243]:
train_part_films_users_filtred.drop(['part_duration'], axis=1).columns

Index(['year', 'movies_duration', 'countries102', 'countries238',
       'countries205', 'countries122', 'countries0', 'countries188',
       'countries121', 'countries117', 'countries104', 'countries175',
       'countries109', 'countries242', 'countries166', 'countries49',
       'countries250', 'countries46', 'countries170', 'countries5',
       'countries20', 'countries237', 'genres97', 'genres46', 'genres85',
       'genres303', 'genres294', 'genres138', 'genres158', 'genres326',
       'genres266', 'genres302', 'genres198', 'genres130', 'genres261',
       'genres72', 'genres100', 'genres223', 'genres341', 'genres304',
       'genres124', 'genres320', 'mean_duration', 'count_watched', 'Ugenres97',
       'Ugenres46', 'Ugenres85', 'Ugenres303', 'Ugenres294', 'Ugenres138',
       'Ugenres158', 'Ugenres326', 'Ugenres266', 'Ugenres302', 'Ugenres198',
       'Ugenres130', 'Ugenres261', 'Ugenres72', 'Ugenres100', 'Ugenres223',
       'Ugenres341', 'Ugenres304', 'Ugenres124', 'Ugenres32

In [28]:
%%time
# for user in 
#     user_movies = train_part[train_part['user_id'] == user['user_id'].values[0]]['movie_id'].values
#     df_repeated = pd.concat([user] * 4981, ignore_index=True).drop('user_id', axis = 1)
#     result = pd.concat([movies.reset_index(drop=True), df_repeated.reset_index(drop=True)], axis=1)
#     y_pred = model.predict(result)
#     indices = np.argpartition(y_pred, -10)[-10:]
#     list(result.iloc[indices]['id'].values.astype(float))

def my_predict(user, train_part, user_genres, best_movies):

    A = pd.DataFrame()

    A = best_movies.copy()
    
    user_df = user_genres[user_genres['user_id'] == user]
    
    user_movies = train_part[train_part['user_id'] == user_df['user_id'].values[0]]['movie_id'].values

    A['watched'] = A['id'].apply(lambda x: 1 if x in user_movies else 0)
    A = A[A['watched'] == 0]

    print(user, len(A))
    
    
    df_repeated = pd.concat([user_df] * len(A), ignore_index=True).drop('user_id', axis = 1)
    
    
    result = pd.concat([A.reset_index(drop=True), df_repeated.reset_index(drop=True)], axis=1)
    y_pred = model.predict(result)
    indices = np.argpartition(y_pred, -10)[-10:]
    
    return list(result.iloc[indices]['id'].values.astype(float))

my_func = lambda user: my_predict(user = user, train_part = train_part, user_genres = user_genres, best_movies = best_movies)

CPU times: total: 0 ns
Wall time: 0 ns


In [29]:
recommendations = train_part['movie_id'].value_counts().index.values[:1000]

In [30]:
movies['best'] = movies['id'].apply(lambda x: 1 if x in recommendations else 0)
best_movies = movies[movies['best'] == 1]
best_movies

Unnamed: 0,id,year,movies_duration,genre2,genre8,genre24,genre38,genre41,genre46,genre62,...,genre341,genre348,genre349,genre355,genre356,genre360,genre362,genre366,genre367,best
5,8,2018,104,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
29,41,2002,133,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
39,59,1971,153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
42,62,1995,81,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
51,74,1964,84,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5145,7381,2011,96,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5149,7390,2015,148,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5159,7401,2019,104,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
5162,7405,1997,116,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [39]:
test_part_30 = test_part.iloc[100:200]

In [40]:
%%time
test_part_30['prediction'] = test_part_30['user_id'].apply(my_func)

101 746
102 699
103 667
104 708
105 719
106 742
107 728
108 697
109 726
110 729
111 685


IndexError: index 0 is out of bounds for axis 0 with size 0

In [49]:
evaluate_recommender(df=test_part_30, model_preds_col="prediction")

KeyError: 'prediction'