# Recommender Systems

# Библиотеки

In [3]:
from abc import ABC, abstractmethod
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm


## Определим метрики:

In [4]:
# ACHTUNG! DO NOT TOUCH 

def ndcg_metric(gt_items: np.ndarray, predicted: np.ndarray) -> float:
    """Compute the NDCG-style score used by this notebook for one user.

    A binary relevance vector is built with one entry per element of
    ``gt_items`` (in ``gt_items`` order): 1 if that element occurs in
    ``predicted``, otherwise 0. The score is the DCG of that vector divided
    by the DCG of the same vector sorted in descending order and truncated
    to ``len(predicted)`` entries.

    NOTE(review): positions are discounted by their index in ``gt_items``,
    not by their rank inside ``predicted`` as in the textbook NDCG. The cell
    is marked "DO NOT TOUCH", so the code is intentionally left unchanged.

    Args:
        gt_items: ground-truth item ids for the user.
        predicted: recommended item ids for the user.

    Returns:
        The ratio described above, or 0.0 when either DCG value is zero.
    """
    at = len(predicted)
    relevance = np.array([1 if x in predicted else 0 for x in gt_items])
    # DCG uses the relevance of the recommended items
    rank_dcg = dcg(relevance)
    if rank_dcg == 0.0:
        return 0.0

    # IDCG has all relevances to 1 (or the values provided), up to the number of items in the test set that can fit in the list length
    ideal_dcg = dcg(np.sort(relevance)[::-1][:at])

    if ideal_dcg == 0.0:
        return 0.0

    ndcg_ = rank_dcg / ideal_dcg

    return ndcg_


def dcg(scores: np.ndarray) -> float:
    """Discounted cumulative gain of a relevance vector.

    Sums ``(2 ** scores[i] - 1) / log2(i + 2)`` over the 0-based positions
    ``i`` of ``scores``, accumulating in float64.

    Args:
        scores: 1-D array of relevance values; ``scores.shape[0]`` is read,
            so a plain Python list is not accepted.

    Returns:
        The summed discounted gain.
    """
    return np.sum(
        np.divide(np.power(2, scores) - 1, np.log2(np.arange(scores.shape[0], dtype=np.float64) + 2)), dtype=np.float64
    )


def recall_metric(gt_items: np.ndarray, predicted: np.ndarray) -> float:
    """Share of distinct ground-truth items that appear in the prediction.

    The numerator counts distinct items common to both inputs; the
    denominator is ``len(gt_items)`` (duplicates in ``gt_items`` are counted).

    Args:
        gt_items: ground-truth item ids for the user.
        predicted: recommended item ids for the user.

    Returns:
        ``|set(gt_items) & set(predicted)| / len(gt_items)``.

    Raises:
        ZeroDivisionError: if ``gt_items`` is empty.
    """
    n_gt = len(gt_items)
    intersection = len(set(gt_items).intersection(set(predicted)))
    return intersection / n_gt


def evaluate_recommender(df: pd.DataFrame, model_preds_col: str, gt_col: str = "movie_id") -> Dict[str, float]:
    """Average ``ndcg_metric`` and ``recall_metric`` over all rows of ``df``.

    Args:
        df: frame with one row per user; each row holds a collection of
            ground-truth items and a collection of predicted items.
        model_preds_col: name of the column holding the predictions.
        gt_col: name of the column holding the ground-truth items.

    Returns:
        Dict with keys ``"ndcg"`` and ``"recall"``, each the mean of the
        per-row metric values.
    """
    metric_values = []

    # One (ndcg, recall) pair per row.
    for _, row in df.iterrows():
        metric_values.append(
            (ndcg_metric(row[gt_col], row[model_preds_col]), recall_metric(row[gt_col], row[model_preds_col]))
        )

    return {"ndcg": np.mean([x[0] for x in metric_values]), "recall": np.mean([x[1] for x in metric_values])}

# Данные

In [5]:
# Folder holding the CSV inputs, relative to the notebook's working directory.
data_folder = "data/"

# User and item attribute tables.
users_df = pd.read_csv(f"{data_folder}users_df.csv")
items_df = pd.read_csv(f"{data_folder}items_df.csv")

# Additional lookup tables (not referenced by the cells visible in this section).
countries = pd.read_csv(f"{data_folder}countries.csv")
genres = pd.read_csv(f"{data_folder}genres.csv")
staff = pd.read_csv(f"{data_folder}staff.csv")

# Training interactions; the `datetime` column is loaded as-is (no parse_dates).
train_part = pd.read_csv(f"{data_folder}train_data.csv")

# Ground truth: one row per user holding the list of that user's test movie ids.
test_part = (
    pd.read_csv(f"{data_folder}test_data.csv")
    .groupby("user_id")
    .agg({"movie_id": list})
    .reset_index()
)


In [6]:
# Order interactions by movie id, then attach each user's profile columns.
# The left join keeps every interaction row even when the user has no profile row.
train_part = train_part.sort_values(by='movie_id')
train_part_with_users = pd.merge(train_part, users_df, how='left', on='user_id')

In [7]:
# Recommendations shared by all users: movie ids ordered by how many rows they
# have in train_part (most frequent first), truncated to the top 300.
recommendations = train_part['movie_id'].value_counts().index.values[:300] # most-viewed videos

# Определим самые просматриваемые видео для каждой категории пользователей:

In [8]:
# By income: for every income level, the 300 movie ids with the most
# interaction rows among users of that level (most frequent first).
mapping_income = {
    income_level: train_part_with_users.loc[
        train_part_with_users['income'] == income_level, 'movie_id'
    ].value_counts().index.values[:300]
    for income_level in ('низкий', 'средний', 'высокий', 'очень высокий')
}

# Individual list names kept from the original cell.
recom_income_1, recom_income_2, recom_income_3, recom_income_4 = mapping_income.values()

In [9]:
# By sex: for each value, the 300 movie ids with the most interaction rows
# among users of that sex (most frequent first).
mapping_sex = {
    sex_value: train_part_with_users.loc[
        train_part_with_users['sex'] == sex_value, 'movie_id'
    ].value_counts().index.values[:300]
    for sex_value in ('Женский', 'Мужской')
}

# Individual list names kept from the original cell.
recom_female, recom_male = mapping_sex.values()

In [10]:
# By education: for each level, the 300 movie ids with the most interaction
# rows among users with that education level (most frequent first).
mapping_education = {
    education_level: train_part_with_users.loc[
        train_part_with_users['education'] == education_level, 'movie_id'
    ].value_counts().index.values[:300]
    for education_level in ('Среднее', 'Высшее', 'Неполное высшее', 'Без образования')
}

# Individual list names kept from the original cell.
recom_education_1, recom_education_2, recom_education_3, recom_education_4 = mapping_education.values()

In [11]:
# By kids flag: for each flag value, the 300 movie ids with the most
# interaction rows among users carrying that flag (most frequent first).
mapping_kids = {
    kids_value: train_part_with_users.loc[
        train_part_with_users['kids_flg'] == kids_value, 'movie_id'
    ].value_counts().index.values[:300]
    for kids_value in (1.0, 0.0)
}

# Individual list names kept from the original cell.
recom_kids, recom_nokids = mapping_kids.values()

In [12]:
# By age category: for each bucket, the 300 movie ids with the most
# interaction rows among users in that bucket (most frequent first).
mapping_ages = {
    age_bucket: train_part_with_users.loc[
        train_part_with_users['age_category'] == age_bucket, 'movie_id'
    ].value_counts().index.values[:300]
    for age_bucket in ('18-24', '25-34', '35-44', '45-54', '55-70')
}

# Individual list names kept from the original cell.
recom18, recom25, recom35, recom45, recom55 = mapping_ages.values()

In [13]:
# Global top-300 list by interaction count; same expression as in the earlier
# "recommendations for all users" cell, recomputed here before the predictor.
recommendations = train_part['movie_id'].value_counts().index.values[:300]

In [14]:
def my_predict(user, recommendations, train_part, mapping_ages,
               segment_col='income', segment_mapping=None):
    """Recommend up to 10 unseen movies from the user's segment top list.

    The user's value in ``users_df[segment_col]`` selects a ranked list from
    ``segment_mapping``. When the value has no entry there (for example it is
    missing), or the user is absent from ``users_df``, the general
    ``recommendations`` list is used instead. Movies the user already has in
    ``train_part`` are dropped and the first 10 remaining ids are returned.

    The other segmentations tried in this notebook can be selected without
    editing the function, e.g. ``segment_col='sex', segment_mapping=mapping_sex``
    or ``segment_col='age_category', segment_mapping=mapping_ages``.

    Args:
        user: value matched against the ``user_id`` columns.
        recommendations: ordered fallback candidates, best first.
        train_part: interaction frame with ``user_id`` and ``movie_id`` columns.
        mapping_ages: not read by the default income-based variant; kept so
            existing call sites keep working.
        segment_col: column of the module-level ``users_df`` that defines the
            user's segment. Defaults to ``'income'``.
        segment_mapping: dict from segment value to ranked movie ids. Defaults
            to the module-level ``mapping_income``.

    Returns:
        list: at most 10 movie ids, in the order of the chosen candidate list.
    """
    if segment_mapping is None:
        segment_mapping = mapping_income

    # An empty lookup means the user has no profile row: use the general list
    # instead of failing on `.values[0]`.
    segment_values = users_df.loc[users_df['user_id'] == user, segment_col].values
    if len(segment_values):
        rec = segment_mapping.get(segment_values[0], recommendations)
    else:
        rec = recommendations

    # Build the watched-set once; it used to be rebuilt for every candidate.
    watched = set(train_part.loc[train_part['user_id'] == user, 'movie_id'].values)

    return [movie for movie in rec if movie not in watched][:10]


def my_func(user):
    """Predict for one user with the income segmentation and notebook globals."""
    return my_predict(
        user=user,
        recommendations=recommendations,
        train_part=train_part,
        mapping_ages=mapping_ages,
    )

In [16]:
%%time
# Run the predictor defined in the previous cell for every test user and store
# the resulting lists (up to 10 movie ids each) in a new column.
test_part["predict_income"] = test_part['user_id'].apply(my_func)
test_part.head()

CPU times: total: 1min 28s
Wall time: 1min 40s


Unnamed: 0,user_id,movie_id,predict_income
0,0,"[12.0, 6201.0, 5542.0, 2025.0, 190.0, 5358.0, ...","[484.0, 6194.0, 641.0, 2679.0, 2820.0, 5876.0,..."
1,1,"[2515.0, 1540.0, 5210.0, 1608.0, 3590.0, 7215....","[6194.0, 641.0, 2679.0, 2820.0, 5336.0, 2074.0..."
2,2,"[5998.0, 190.0, 7327.0, 947.0, 3814.0, 876.0, ...","[484.0, 6194.0, 5336.0, 2074.0, 2679.0, 2480.0..."
3,3,"[4812.0, 3935.0, 802.0, 4459.0, 4340.0, 5975.0...","[484.0, 6194.0, 2679.0, 641.0, 5336.0, 5876.0,..."
4,4,"[152.0, 195.0, 800.0, 2266.0, 6634.0, 7412.0, ...","[6194.0, 641.0, 2679.0, 5876.0, 2074.0, 1978.0..."


In [17]:
# Mean NDCG and recall of the income-segmented predictions.
evaluate_recommender(df=test_part, model_preds_col="predict_income")

{'ndcg': 0.13482940546964242, 'recall': 0.06347190351054333}

Лучший скор получаем, если делим пользователей по доходу. Ещё одно отличие от бейзлайна — исключение из рекомендаций видео, которые пользователь уже смотрел. Если делить по любому другому критерию, скор будет немного хуже:
{'ndcg': 0.13482940546964242, 'recall': 0.06347190351054333} - predict_income + просмотренные исключены
{'ndcg': 0.13395530914747164, 'recall': 0.06316759284277068} - predict_sex + просмотренные исключены
{'ndcg': 0.13394791301917217, 'recall': 0.06317175906920132} - predict_education + просмотренные исключены
{'ndcg': 0.1313561018280454, 'recall': 0.061327869644033056} - predict_kids + просмотренные исключены
{'ndcg': 0.12958166348560562, 'recall': 0.06052906438068992} - predict_ages + просмотренные исключены
{'ndcg': 0.1289555139863353, 'recall': 0.06034992834762613} - только просмотренные исключены
{'ndcg': 0.1213225408775555, 'recall': 0.05789212284812527} - скор бейзлайна

# Сделаем немного иначе. Учтем время, которое люди смотрели видео

### Оценим длительность каждого видео. Для этого возьмём квантиль 0.95 значений duration по каждому видео (т.к. duration может быть и больше длительности самого видео)

In [23]:
# Preview the interaction log columns, including `duration`, which is used below.
train_part.head()

Unnamed: 0,id,datetime,user_id,movie_id,duration,is_train
397688,531685,2023-04-20 09:45:11.395639+03:00,23147,0.0,548.0,True
472395,640300,2023-04-22 22:51:27.269769+03:00,238515,0.0,6052.0,True
840846,1188907,2023-05-07 20:05:12.829575+03:00,5825,0.0,42.0,True
663341,922276,2023-04-30 16:54:57.456289+03:00,73889,0.0,5836.0,True
1228001,2316369,2023-06-10 16:10:37.519494+03:00,199807,0.0,2145.0,True


In [28]:
# Per-movie length estimate: the 0.95 quantile of the observed `duration`
# values, exposed as the `movie_duration` column.
movies_durations = (
    train_part.groupby('movie_id')[['duration']]
    .quantile(0.95)
    .reset_index()
    .rename(columns={'duration': 'movie_duration'})
)

In [31]:
# Attach the estimated movie length to every interaction row (left join on movie_id).
train_part_dur = pd.merge(train_part, movies_durations, how='left', on='movie_id')

In [33]:
# Fraction of the movie that was watched: observed duration divided by the
# estimated movie length. Values above 1 are possible at this point.
train_part_dur['satisfaction'] = train_part_dur['duration'] / train_part_dur['movie_duration']

In [37]:
# Cap the watched fraction at 1, since `duration` can exceed the estimated
# movie length. The vectorised clip replaces a per-row Python lambda; NaN
# values pass through unchanged, exactly as before.
train_part_dur['satisfaction'] = train_part_dur['satisfaction'].clip(upper=1)

In [48]:
# Rank movies by total satisfaction (sum of the capped watched fractions),
# highest first, and keep the top 300 movie ids.
recomendations_by_satisfaction = (
    train_part_dur.groupby('movie_id')['satisfaction']
    .sum()
    .sort_values(ascending=False)
    .index.values[:300]
)

In [51]:
def my_predict(user, recommendations, train_part, mapping_ages):
    """Return up to 10 movies from ``recommendations`` the user has not watched.

    Args:
        user: value matched against ``train_part['user_id']``.
        recommendations: ordered candidate movie ids, best first.
        train_part: interaction frame with ``user_id`` and ``movie_id`` columns.
        mapping_ages: not read by this variant; kept so the signature matches
            the other ``my_predict`` definitions in the notebook.

    Returns:
        list: the first 10 candidates, in their original order, that do not
        occur among the user's ``movie_id`` values in ``train_part``.
    """
    # Build the watched-set once; it used to be rebuilt for every candidate
    # inside the comprehension.
    watched = set(train_part.loc[train_part['user_id'] == user, 'movie_id'].values)

    return [movie for movie in recommendations if movie not in watched][:10]


def my_func(user):
    """Predict for one user from the satisfaction-ranked list and notebook globals."""
    return my_predict(
        user=user,
        recommendations=recomendations_by_satisfaction,
        train_part=train_part,
        mapping_ages=mapping_ages,
    )

In [52]:
%%time
# Run the predictor defined in the previous cell for every test user and store
# the resulting lists (up to 10 movie ids each) in a new column.
test_part["predict_satisfaction"] = test_part['user_id'].apply(my_func)
test_part.head()

CPU times: total: 1min 10s
Wall time: 1min 22s


Unnamed: 0,user_id,movie_id,predict_income,predict_satisfaction
0,0,"[12.0, 6201.0, 5542.0, 2025.0, 190.0, 5358.0, ...","[484.0, 6194.0, 641.0, 2679.0, 2820.0, 5876.0,...","[484.0, 6194.0, 2074.0, 2820.0, 1978.0, 5876.0..."
1,1,"[2515.0, 1540.0, 5210.0, 1608.0, 3590.0, 7215....","[6194.0, 641.0, 2679.0, 2820.0, 5336.0, 2074.0...","[6194.0, 2074.0, 2820.0, 1978.0, 5707.0, 1521...."
2,2,"[5998.0, 190.0, 7327.0, 947.0, 3814.0, 876.0, ...","[484.0, 6194.0, 5336.0, 2074.0, 2679.0, 2480.0...","[484.0, 6194.0, 2074.0, 1978.0, 5876.0, 5707.0..."
3,3,"[4812.0, 3935.0, 802.0, 4459.0, 4340.0, 5975.0...","[484.0, 6194.0, 2679.0, 641.0, 5336.0, 5876.0,...","[484.0, 6194.0, 2074.0, 1978.0, 5876.0, 5707.0..."
4,4,"[152.0, 195.0, 800.0, 2266.0, 6634.0, 7412.0, ...","[6194.0, 641.0, 2679.0, 5876.0, 2074.0, 1978.0...","[6194.0, 2074.0, 1978.0, 5876.0, 5707.0, 2679...."


### Получим следующую оценку:

In [53]:
# Mean NDCG and recall of the satisfaction-ranked predictions.
evaluate_recommender(df=test_part, model_preds_col="predict_satisfaction")

{'ndcg': 0.08464253875179696, 'recall': 0.03539089332964322}

### Попробуем предсказать по суммарному времени просмотров

In [59]:
# Watch time limited by the estimated movie length: movie_duration multiplied
# by the watched fraction, which an earlier cell already capped at 1.
train_part_dur['cut_whatch_time'] = train_part_dur['movie_duration'] * train_part_dur['satisfaction']

In [62]:
# Rank movies by total capped watch time, highest first, and keep the top 300 movie ids.
recomendations_by_whatch_time = (
    train_part_dur.groupby('movie_id')['cut_whatch_time']
    .sum()
    .sort_values(ascending=False)
    .index.values[:300]
)

In [64]:
def my_predict(user, recommendations, train_part, mapping_ages):
    """Return up to 10 movies from ``recommendations`` the user has not watched.

    Args:
        user: value matched against ``train_part['user_id']``.
        recommendations: ordered candidate movie ids, best first.
        train_part: interaction frame with ``user_id`` and ``movie_id`` columns.
        mapping_ages: not read by this variant; kept so the signature matches
            the other ``my_predict`` definitions in the notebook.

    Returns:
        list: the first 10 candidates, in their original order, that do not
        occur among the user's ``movie_id`` values in ``train_part``.
    """
    # Build the watched-set once; it used to be rebuilt for every candidate
    # inside the comprehension.
    watched = set(train_part.loc[train_part['user_id'] == user, 'movie_id'].values)

    return [movie for movie in recommendations if movie not in watched][:10]


def my_func(user):
    """Predict for one user from the watch-time-ranked list and notebook globals."""
    return my_predict(
        user=user,
        recommendations=recomendations_by_whatch_time,
        train_part=train_part,
        mapping_ages=mapping_ages,
    )

In [65]:
%%time
# Run the predictor defined in the previous cell for every test user and store
# the resulting lists (up to 10 movie ids each) in a new column.
test_part["predict_by_whatch_time"] = test_part['user_id'].apply(my_func)
test_part.head()

CPU times: total: 1min 13s
Wall time: 1min 24s


Unnamed: 0,user_id,movie_id,predict_income,predict_satisfaction,predict_by_whatch_time
0,0,"[12.0, 6201.0, 5542.0, 2025.0, 190.0, 5358.0, ...","[484.0, 6194.0, 641.0, 2679.0, 2820.0, 5876.0,...","[484.0, 6194.0, 2074.0, 2820.0, 1978.0, 5876.0...","[484.0, 6194.0, 5876.0, 2480.0, 2679.0, 2074.0..."
1,1,"[2515.0, 1540.0, 5210.0, 1608.0, 3590.0, 7215....","[6194.0, 641.0, 2679.0, 2820.0, 5336.0, 2074.0...","[6194.0, 2074.0, 2820.0, 1978.0, 5707.0, 1521....","[6194.0, 2480.0, 2679.0, 2074.0, 5707.0, 1978...."
2,2,"[5998.0, 190.0, 7327.0, 947.0, 3814.0, 876.0, ...","[484.0, 6194.0, 5336.0, 2074.0, 2679.0, 2480.0...","[484.0, 6194.0, 2074.0, 1978.0, 5876.0, 5707.0...","[484.0, 6194.0, 5876.0, 2480.0, 2679.0, 2074.0..."
3,3,"[4812.0, 3935.0, 802.0, 4459.0, 4340.0, 5975.0...","[484.0, 6194.0, 2679.0, 641.0, 5336.0, 5876.0,...","[484.0, 6194.0, 2074.0, 1978.0, 5876.0, 5707.0...","[484.0, 6194.0, 5876.0, 2480.0, 2679.0, 2074.0..."
4,4,"[152.0, 195.0, 800.0, 2266.0, 6634.0, 7412.0, ...","[6194.0, 641.0, 2679.0, 5876.0, 2074.0, 1978.0...","[6194.0, 2074.0, 1978.0, 5876.0, 5707.0, 2679....","[6194.0, 5876.0, 2679.0, 2074.0, 5707.0, 1978...."


In [66]:
# Mean NDCG and recall of the watch-time-ranked predictions.
evaluate_recommender(df=test_part, model_preds_col="predict_by_whatch_time")

{'ndcg': 0.07611247642258426, 'recall': 0.03251856388023836}

## Вернемся к предыдущему подходу сегментирования по категориям и попробуем отбирать видео, учитывая все характеристики пользователя

In [67]:
def my_predict(user, recommendations, train_part, mapping_ages):
    """Blend the five per-segment top lists into one ranking for a user.

    For each profile attribute (age category, kids flag, education, sex,
    income) the user's value selects a ranked list from the matching mapping.
    The general ``recommendations`` list is used when the value has no entry
    in the mapping or when the user is absent from ``users_df``. Each list
    awards ``len(list) - index`` points to its movies; the points are summed
    per movie and movies are ordered by total score, highest first. Ties keep
    the order in which movies were first seen. Movies the user already has in
    ``train_part`` are removed and the first 10 remaining ids are returned.

    Args:
        user: value matched against the ``user_id`` columns.
        recommendations: ordered fallback candidates, best first.
        train_part: interaction frame with ``user_id`` and ``movie_id`` columns.
        mapping_ages: dict from age category to ranked movie ids. The other
            mappings (``mapping_kids``, ``mapping_education``, ``mapping_sex``,
            ``mapping_income``) and ``users_df`` are read from module scope.

    Returns:
        list: at most 10 movie ids ordered by blended score.
    """
    # Look the user up once instead of once per attribute.
    profile = users_df[users_df['user_id'] == user]

    def segment_list(column, mapping):
        """Ranked list for the user's value in ``column``, else the general list."""
        values = profile[column].values
        if len(values) == 0:
            return recommendations
        return mapping.get(values[0], recommendations)

    # The order matters: it fixes the tie-break between equally scored movies.
    ranked_lists = [
        segment_list('age_category', mapping_ages),
        segment_list('kids_flg', mapping_kids),
        segment_list('education', mapping_education),
        segment_list('sex', mapping_sex),
        segment_list('income', mapping_income),
    ]

    movie_scores = {}
    for movie_list in ranked_lists:
        list_length = len(movie_list)
        for index, movie in enumerate(movie_list):
            # The first item gets len(list) points and the last item gets 1.
            movie_scores[movie] = movie_scores.get(movie, 0) + (list_length - index)

    # Stable sort: equal scores keep their insertion order.
    ranked_movies = sorted(movie_scores, key=movie_scores.get, reverse=True)

    # Build the watched-set once; it used to be rebuilt for every candidate.
    watched = set(train_part.loc[train_part['user_id'] == user, 'movie_id'].values)

    return [movie for movie in ranked_movies if movie not in watched][:10]


def my_func(user):
    """Predict for one user with the blended segment ranking and notebook globals."""
    return my_predict(
        user=user,
        recommendations=recommendations,
        train_part=train_part,
        mapping_ages=mapping_ages,
    )

In [68]:
%%time
# Run the predictor defined in the previous cell for every test user and store
# the resulting lists (up to 10 movie ids each) in a new column.
test_part["predict_by_scores"] = test_part['user_id'].apply(my_func)
test_part.head()

CPU times: total: 2min 53s
Wall time: 3min 20s


Unnamed: 0,user_id,movie_id,predict_income,predict_satisfaction,predict_by_whatch_time,predict_by_scores
0,0,"[12.0, 6201.0, 5542.0, 2025.0, 190.0, 5358.0, ...","[484.0, 6194.0, 641.0, 2679.0, 2820.0, 5876.0,...","[484.0, 6194.0, 2074.0, 2820.0, 1978.0, 5876.0...","[484.0, 6194.0, 5876.0, 2480.0, 2679.0, 2074.0...","[484.0, 6194.0, 641.0, 2679.0, 2820.0, 5876.0,..."
1,1,"[2515.0, 1540.0, 5210.0, 1608.0, 3590.0, 7215....","[6194.0, 641.0, 2679.0, 2820.0, 5336.0, 2074.0...","[6194.0, 2074.0, 2820.0, 1978.0, 5707.0, 1521....","[6194.0, 2480.0, 2679.0, 2074.0, 5707.0, 1978....","[6194.0, 641.0, 2679.0, 2820.0, 5336.0, 2074.0..."
2,2,"[5998.0, 190.0, 7327.0, 947.0, 3814.0, 876.0, ...","[484.0, 6194.0, 5336.0, 2074.0, 2679.0, 2480.0...","[484.0, 6194.0, 2074.0, 1978.0, 5876.0, 5707.0...","[484.0, 6194.0, 5876.0, 2480.0, 2679.0, 2074.0...","[484.0, 6194.0, 2679.0, 5336.0, 2074.0, 5876.0..."
3,3,"[4812.0, 3935.0, 802.0, 4459.0, 4340.0, 5975.0...","[484.0, 6194.0, 2679.0, 641.0, 5336.0, 5876.0,...","[484.0, 6194.0, 2074.0, 1978.0, 5876.0, 5707.0...","[484.0, 6194.0, 5876.0, 2480.0, 2679.0, 2074.0...","[484.0, 6194.0, 2679.0, 641.0, 5336.0, 5876.0,..."
4,4,"[152.0, 195.0, 800.0, 2266.0, 6634.0, 7412.0, ...","[6194.0, 641.0, 2679.0, 5876.0, 2074.0, 1978.0...","[6194.0, 2074.0, 1978.0, 5876.0, 5707.0, 2679....","[6194.0, 5876.0, 2679.0, 2074.0, 5707.0, 1978....","[6194.0, 641.0, 2679.0, 5876.0, 5336.0, 2074.0..."


In [69]:
# Mean NDCG and recall of the blended-score predictions.
evaluate_recommender(df=test_part, model_preds_col="predict_by_scores")

{'ndcg': 0.13037015044957553, 'recall': 0.06096128345309949}