In [1]:
from abc import ABC, abstractmethod
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from catboost import CatBoostRegressor

## Определим метрики:

In [2]:
# ACHTUNG! DO NOT TOUCH 

def ndcg_metric(gt_items: np.ndarray, predicted: np.ndarray) -> float:
    at = len(predicted)
    relevance = np.array([1 if x in predicted else 0 for x in gt_items])
    # DCG uses the relevance of the recommended items
    rank_dcg = dcg(relevance)
    if rank_dcg == 0.0:
        return 0.0

    # IDCG has all relevances to 1 (or the values provided), up to the number of items in the test set that can fit in the list length
    ideal_dcg = dcg(np.sort(relevance)[::-1][:at])

    if ideal_dcg == 0.0:
        return 0.0

    ndcg_ = rank_dcg / ideal_dcg

    return ndcg_


def dcg(scores: np.ndarray) -> float:
    return np.sum(
        np.divide(np.power(2, scores) - 1, np.log2(np.arange(scores.shape[0], dtype=np.float64) + 2)), dtype=np.float64
    )


def recall_metric(gt_items: np.ndarray, predicted: np.ndarray) -> float:
    n_gt = len(gt_items)
    intersection = len(set(gt_items).intersection(set(predicted)))
    return intersection / n_gt


def evaluate_recommender(df: pd.DataFrame, model_preds_col: str, gt_col: str = "movie_id") -> Dict[str, float]:
    metric_values = []

    for _, row in df.iterrows():
        metric_values.append(
            (ndcg_metric(row[gt_col], row[model_preds_col]), recall_metric(row[gt_col], row[model_preds_col]))
        )

    return {"ndcg": np.mean([x[0] for x in metric_values]), "recall": np.mean([x[1] for x in metric_values])}

In [6]:
data_folder = "data/"

users_df = pd.read_csv(data_folder + "users_df.csv")
items_df = pd.read_csv(data_folder + "items_df.csv")

countries = pd.read_csv(data_folder + "countries.csv")
genres = pd.read_csv(data_folder + "genres.csv")
staff = pd.read_csv(data_folder + "staff.csv")

train_part = pd.read_csv(data_folder + "train_data.csv")#, parse_dates=["datetime"])
test_part = pd.read_csv(data_folder + "test_data.csv")
test_part = test_part.groupby("user_id").agg({"movie_id": list}).reset_index()
movie_duration = pd.read_csv('data/movies_durations_5175.csv') # Таблица с продолжительностью фильмов

## Подготовим данные для вычисления матрицы схожестей фильмов

Из Кинопоиска я спарсил данные о продолжительности фильмов. Только позже я додумался, что их можно оценить по времени просмотров пользователей. Парсинг данных написан в другом нойтбуке. Через него мы получаем таблицу movies_durations_5175

In [7]:
import ast
items_df['genres'] = items_df['genres'].apply(lambda x: ast.literal_eval(x))
items_df['staff'] = items_df['staff'].apply(lambda x: ast.literal_eval(x))
items_df['countries'] = items_df['countries'].apply(lambda x: ast.literal_eval(x))

train_part['movie_id'] = train_part['movie_id'].astype(int)

items_df['year'] = items_df['year'].apply(lambda x: x[:4]).astype(int)

movies = movie_duration.merge(items_df[['id', 'title', 'genres', 'year', 'countries', 'staff']].drop_duplicates(subset=['title'], keep='first'), 
                     left_on = 'Название', 
                     right_on = 'title', 
                     how = 'left')[['id', 'genres', 'countries', 'staff', 'year', 'movies_duration']].drop_duplicates(subset=['id'], keep='first')

In [8]:
movies

Unnamed: 0,id,genres,countries,staff,year,movies_duration
0,0,[97],[238],"[1883, 33655, 25890, 1001, 12051, 10110, 16895]",2022,104
1,1,"[138, 97, 294]",[242],"[18168, 23444, 10850, 21847, 30555, 24469, 268...",2010,123
2,3,"[97, 303, 143, 319]","[188, 212, 0]","[16006, 12217, 30668, 28806, 16172, 5045, 1663...",2017,126
3,4,"[138, 97]","[1, 102, 0]","[23586, 8823, 8040, 34555, 32484, 18446]",2015,122
4,5,[234],[238],"[17317, 19228, 35448, 17666, 11726, 32044, 266...",2012,90
...,...,...,...,...,...,...
5170,7421,[46],[102],"[19959, 28032, 5725, 942, 17135]",2019,90
5171,7422,"[138, 97]","[49, 102]","[31732, 12514, 5845, 12650, 23433, 28088, 64, ...",1996,120
5172,7423,"[85, 303]","[121, 102]","[5166, 24667, 6277, 8266, 4854, 2186, 29049, 6...",2002,141
5173,7424,"[138, 85]",[0],"[5828, 24468, 5125, 19939, 12752, 21851, 22674...",2017,107


In [9]:
movies_with_duration = movies['id'].values
train_part_cutted = train_part[train_part['movie_id'].apply(lambda x: 1 if x in movies_with_duration else 0) == 1]

In [10]:
train_part_cutted = train_part_cutted.merge(movies[['id', 'movies_duration']], left_on = 'movie_id', right_on = 'id', how = 'left')

In [11]:
train_part_cutted['part_duration'] = train_part_cutted['duration']/(train_part_cutted['movies_duration'] * 60)

In [12]:
train_part_cutted

Unnamed: 0,id_x,datetime,user_id,movie_id,duration,is_train,id_y,movies_duration,part_duration
0,0,2023-04-06 15:00:00.071114+03:00,10250,427,485.0,True,427,141,0.057329
1,4,2023-04-06 15:00:10.495017+03:00,43069,4291,75.0,True,4291,100,0.012500
2,6,2023-04-06 15:00:12.927035+03:00,15060,3316,260.0,True,3316,97,0.044674
3,7,2023-04-06 15:00:13.580970+03:00,38702,1390,400.0,True,1390,118,0.056497
4,8,2023-04-06 15:00:16.885034+03:00,2019,799,1328.0,True,799,111,0.199399
...,...,...,...,...,...,...,...,...,...
920839,2477857,2023-06-14 23:58:12.962890+03:00,58226,475,2.0,True,475,158,0.000211
920840,2477876,2023-06-14 23:59:02.925097+03:00,31874,7367,770.0,True,7367,85,0.150980
920841,2477890,2023-06-14 23:59:33.756448+03:00,112950,2784,345.0,True,2784,104,0.055288
920842,2477898,2023-06-14 23:59:50.429716+03:00,34303,1520,2021.0,True,1520,93,0.362186


In [13]:
# staff = movies['staff'].explode()
# dummies = pd.get_dummies(staff).groupby(level=0).sum()
# staff_list = list(map(str, list(dummies.columns)))
# staff_list = ['staff' + str(i) for i in staff_list]
# dummies.columns = staff_list
# movies = movies.drop(columns='staff').join(dummies)

In [15]:
countries = movies['countries'].explode()
dummies = pd.get_dummies(countries).groupby(level=0).sum()
countries_list = list(map(str, list(dummies.columns)))
countries_list = ['countries' + str(i) for i in countries_list]
dummies.columns = countries_list
movies = movies.drop(columns='countries').join(dummies)

In [16]:
genres = movies['genres'].explode()
dummies = pd.get_dummies(genres).groupby(level=0).sum()
genres_list = list(map(str, list(dummies.columns)))
genres_list = ['genres' + str(i) for i in genres_list]
dummies.columns = genres_list
movies = movies.drop(columns='genres').join(dummies)

In [17]:
top_15_countries = movies[countries_list].sum().nlargest(15).index.to_list()
top_15_genres = movies[genres_list].sum().nlargest(15).index.to_list()
# top_500_staff = movies[staff_list].sum().nlargest(500).index.to_list()

In [18]:
features = top_15_countries + top_15_genres # + top_500_staff
row_sums = movies[features].sum(axis=1)
# movies = movies[features].div(row_sums, axis=0)

In [23]:
movies = movies[['year', 'movies_duration', 'id'] + features]

### Получаем представления фильмов в виде векторов по странам и жанрам

In [24]:
movies.head(3)

Unnamed: 0,year,movies_duration,id,countries102,countries238,countries122,countries205,countries0,countries188,countries121,...,genres294,genres158,genres326,genres266,genres302,genres72,genres130,genres261,genres100,genres223
0,2022,104,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2010,123,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2017,126,3,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
movies.drop(['year', 'movies_duration'], axis = 1)
movie_dict = {movies['id'].values[i] : movies.drop(['year', 'movies_duration', 'id'], axis = 1).iloc[0].values for i in range(len(movies))}

## Создаем матрицу схожестей

In [27]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def create_similarity_matrix(movie_dict):
    # Преобразуем словарь векторов в матрицу
    movie_ids = list(movie_dict.keys())
    matrix = np.array([movie_dict[movie_id] for movie_id in movie_ids])
    
    # Вычисляем косинусное сходство между всеми парами фильмов
    similarity_matrix = cosine_similarity(matrix)
    
    return similarity_matrix, movie_ids

similarity_matrix, movie_ids = create_similarity_matrix(movie_dict)

In [28]:
def get_recommendations(user_history, similarity_matrix, movie_ids, n=10):
    user_vector = np.zeros(len(movie_ids))
    for movie in user_history:
        if movie in movie_ids:
            idx = movie_ids.index(movie)
            user_vector[idx] = 1
    
    # Вычисляем оценки для всех фильмов
    scores = similarity_matrix.dot(user_vector)
    
    # Сортируем фильмы по оценкам
    sorted_indices = np.argsort(scores)[::-1]
    
    # Фильтруем уже просмотренные фильмы
    recommendations = [movie_ids[i] for i in sorted_indices 
                       if movie_ids[i] not in user_history][:n]
    
    return recommendations

In [29]:
def predict(user):
    user_all_history = train_part_cutted[train_part_cutted['user_id'] == user]['movie_id'].values  # ID фильмов, которые посмотрел пользователь
    user_good_history = train_part_cutted[(train_part_cutted['user_id'] == user) & 
                                          (train_part_cutted['part_duration'] >= 0.7)]['movie_id'].values  # ID фильмов, которые досмотрел пользователь

    if len(user_good_history) == 0:  # если нет фильмов, которые пользователь досмотрел, рекомендуем фильмы, которые остальные чаще всего смотрят
        filt_rec = [elem for elem in recommendations if elem not in set(user_all_history)]
        return filt_rec[:10]
        
    # print(user_all_history, user_good_history)
    
    recommended_movies = get_recommendations(user_history = user_good_history, similarity_matrix = similarity_matrix, movie_ids = movie_ids)
    return list(map(float, recommended_movies))
my_func = lambda user: predict(user)

In [30]:
test_part_20000 = test_part.iloc[0:300]

In [32]:
recommendations = train_part['movie_id'].value_counts().index.values[:600]

In [33]:
%%time
test_part_20000["my_predict"] = test_part_20000['user_id'].apply(my_func)
test_part_20000.head()

CPU times: total: 5.86 s
Wall time: 7.67 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,user_id,movie_id,my_predict
0,0,"[12.0, 6201.0, 5542.0, 2025.0, 190.0, 5358.0, ...","[7426.0, 2404.0, 2395.0, 2396.0, 2398.0, 2399...."
1,1,"[2515.0, 1540.0, 5210.0, 1608.0, 3590.0, 7215....","[484, 6194, 641, 2679, 2820, 5336, 2074, 1978,..."
2,2,"[5998.0, 190.0, 7327.0, 947.0, 3814.0, 876.0, ...","[484, 6194, 641, 2679, 2820, 5876, 5336, 2074,..."
3,3,"[4812.0, 3935.0, 802.0, 4459.0, 4340.0, 5975.0...","[7426.0, 2404.0, 2395.0, 2396.0, 2398.0, 2399...."
4,4,"[152.0, 195.0, 800.0, 2266.0, 6634.0, 7412.0, ...","[7426.0, 2404.0, 2395.0, 2396.0, 2398.0, 2399...."


In [34]:
evaluate_recommender(df=test_part_20000, model_preds_col="my_predict")

{'ndcg': 0.03157708935154681, 'recall': 0.003105807948014025}