# Рекомендательные системы - фильмы

Будем работать с данными о пользователях и оценках, которые они поставили различным фильмам.

На основе этих данных будем рекомендовать пользователям к просмотру новые для них фильмы.

In [154]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from typing import List, Dict
from tqdm import tqdm, tqdm_notebook

## Загрузка и обработка данных

In [155]:
ratings = pd.read_csv('https://raw.githubusercontent.com/aiedu-courses/stepik_applied_tasks/main/datasets/movies_ratings.csv')

In [156]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,31,2.5,1260759144,Dangerous Minds
1,7,31,3.0,851868750,Dangerous Minds
2,31,31,4.0,1273541953,Dangerous Minds
3,32,31,4.0,834828440,Dangerous Minds
4,36,31,3.0,847057202,Dangerous Minds
...,...,...,...,...,...
99845,664,64997,2.5,1343761859,H.G. Wells' War of the Worlds
99846,664,72380,3.5,1344435977,The Box
99847,665,129,3.0,995232528,Pie in the Sky
99848,665,4736,1.0,1010197684,Summer Catch


In [157]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

ratings['userId'] = user_encoder.fit_transform(ratings['userId'])
ratings['movieId'] = item_encoder.fit_transform(ratings['movieId'])

num_users, num_movies = ratings.userId.nunique(), ratings.movieId.nunique()
num_users, num_movies

(671, 9025)

In [158]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title
0,0,30,2.5,1260759144,Dangerous Minds
1,6,30,3.0,851868750,Dangerous Minds
2,30,30,4.0,1273541953,Dangerous Minds
3,31,30,4.0,834828440,Dangerous Minds
4,35,30,3.0,847057202,Dangerous Minds
...,...,...,...,...,...
99845,663,7082,2.5,1343761859,H.G. Wells' War of the Worlds
99846,663,7337,3.5,1344435977,The Box
99847,664,115,3.0,995232528,Pie in the Sky
99848,664,3704,1.0,1010197684,Summer Catch


Поделим выборку на train и test так, чтобы у каждого пользователя последние 10 фильмов оказались в тесте для подсчета метрики качества рекомендаций k=10.  

In [159]:
train, test = [], []
num_test_samples = 10

for user, data in ratings.groupby('userId'):
    train += [data[:-num_test_samples]]
    test += [data[-num_test_samples:]]

train = pd.concat(train)
test = pd.concat(test)
print(train.shape, test.shape)

(93140, 5) (6710, 5)


In [160]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,0,30,2.5,1260759144,Dangerous Minds
42,0,830,3.0,1260759179,Dumbo
84,0,856,3.0,1260759182,Sleepers
117,0,903,2.0,1260759185,Escape from New York
165,0,927,4.0,1260759205,Cinema Paradiso


Запишем данные в формате, где строка соответствует пользователю, а столбцы будут истинными метками и предсказаниями в виде списков.

In [161]:
interactions = pd.DataFrame(train.groupby('userId').movieId.apply(list))
interactions['test'] = test.groupby('userId').movieId.apply(list)
interactions.columns = ['train', 'true_test']

In [162]:
interactions.loc[2, 'true_test']

[5008, 5107, 5456, 5461, 5874, 6345, 6518, 6561, 6872, 7681]

Для оценки качества модели будем использовать метрику  precision@10 для каждого пользователя (доля угаданных рекомендаций). Усредним ее по всем пользователям (полученная метрика называется MAP@10).

In [163]:
def calc_precision(column):
    return (
        interactions
        .apply(
            lambda row:
            len(set(row['true_test']).intersection(
                set(row[column]))) /
            min(len(row['true_test']) + 0.001, 10.0),
            axis=1)).mean()

## Коллаборативная фильтрация

Составим матрицу "оценок" пользователей - `ratings`. Нули будут обозначать отсутствие взаимодействия.

In [164]:
ratings = pd.DataFrame(train.pivot_table(columns='movieId', index='userId', values='rating', fill_value=0))

In [165]:
ratings_m = ratings.values

In [166]:
ratings

movieId,0,1,2,3,4,5,6,7,8,9,...,8990,8992,8993,8995,8996,8998,9000,9004,9008,9010
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Посчитаем схожести пользователей с помощью корреляции Пирсона. Для каждой пары учитываем только ненулевые значения.

In [168]:
similarity_users = np.zeros((len(ratings_m), len(ratings_m)))

for user1 in tqdm_notebook(range(len(ratings_m) - 1)):
  for user2 in range(user1 + 1, len(ratings_m)) :
    mask_uv = (ratings_m[user1] != 0) & (ratings_m[user2] != 0)

    if np.sum(mask_uv) == 0:
      continue

    ratings_v = ratings_m[user1, mask_uv]
    ratings_u = ratings_m[user2, mask_uv]

    if len(np.unique(ratings_v)) < 2 or len(np.unique(ratings_u)) < 2:
      continue

    similarity_users[user1,user2] = np.corrcoef(ratings_v, ratings_u)[0, 1]
    similarity_users[user2, user1] = similarity_users[user1, user2]

  0%|          | 0/670 [00:00<?, ?it/s]

### User-based


Для каждого пользователя:

1. Найдём пользователей с похожестью больше $\alpha$ на нашего пользователя.
2. Посчитаем для каждого фильма сумму рейтингов пользователей (среди выделенных на первом шаге), которые взаимодействовали с этим фильмом.
3. Порекомендуем фильмы с наибольшими долями со второго шага (среди тех, которые пользователь ещё не видел).

В нашем примере данных не очень много, поэтому возьмём $\alpha = 0$.

Сделаем предсказания и запишем их в столбец
`prediction_user_based` таблицы `interactions`.

In [171]:
prediction_user_based = []

for user in tqdm_notebook(range(len(ratings_m))):
  selected_users = similarity_users[user] > 0
  if len(selected_users) == 0:
    prediction_user_based.append([])
  else:
    tmp_recommend = np.argsort(ratings_m[selected_users].sum(axis=0))[::-1]
    tmp_recommend = ratings.columns[tmp_recommend]
    recommend = np.array(tmp_recommend)[~np.in1d(tmp_recommend, interactions.iloc[user].train)][:10]
    prediction_user_based.append(list(recommend))
interactions['prediction_user_based'] = prediction_user_based

  0%|          | 0/671 [00:00<?, ?it/s]

Значение метрики MAP@10 для user-based подхода.


In [173]:
calc_precision('prediction_user_based')

np.float64(0.00834575260804769)

## SVD-разложение

Сделаем сингулярное разложение (svd в scipy.linalg), на выходе вы получите три матрицы - `U`,`sigma`,`V`.

In [174]:
ratings

movieId,0,1,2,3,4,5,6,7,8,9,...,8990,8992,8993,8995,8996,8998,9000,9004,9008,9010
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
from scipy.linalg import svd

In [176]:
U, sigma, V = svd(ratings)

In [177]:
U.shape

(671, 671)

Оставим только первые 150 компонент, чтобы получить скрытые представления размерности 150. Для этого необходимо оставить 150 столбцов в матрице U, оставить из sigma только первые 150 значений (и сделать из них диагональную матрицу) и 150 столбцов в матрице V. Перемножим преобразованные матрицы ($\hat{U}, \hat{sigma}, \hat{V^T}$), чтобы получить восстановленную матрицу оценок.

In [178]:
# your code here
U = U[:,:150]

In [179]:
U.shape, V.shape

((671, 150), (8044, 8044))

In [180]:
sigma = np.diag(sigma[:150])

In [182]:
V = V[:150,:]

In [None]:
U

In [None]:
sigma

In [183]:
ratings_restored = U.dot(sigma).dot(V)

Посчитаем качество аппроксимации матрицы по норме Фробениуса (среднеквадратичную ошибку между всеми элементами соответствующими элементами двух матриц). Сравним его с простым бейзлайном с константным значением, равным среднему значению исходной матрицы. У аппроксимации ошибка должна получиться ниже.

In [184]:
ratings_m

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [185]:
sum(sum((ratings_restored - ratings.values)**2))

np.float64(248017.78364373563)

In [186]:
sum(sum((ratings.values.mean() - ratings.values)**2)) / sum(sum((ratings_restored - ratings.values)**2))

np.float64(5.062829180029833)

Сделаем предсказания по матрице.

Для этого необходимо для каждого пользователя найти предметы с наибольшими оценками в восстановленной матрице.

In [270]:
top_rec = np.argsort(ratings_restored[4])[::-1]

In [275]:
top_rec[:20]

array([1291,  496,  424,  310, 1269, 1449,  303, 1412,  690, 3638,  504,
        787,  494,  195, 3822,  502, 3052, 1565,  313, 1321])

In [304]:
# your code here
pred_svd = []
new_ratings = pd.DataFrame(ratings_restored, index=ratings.index, columns=ratings.columns)

for user in tqdm_notebook(interactions.index):
  top_rec = (
      new_ratings.loc[user].sort_values(ascending=False).index.values
  )

  pred_svd.append(
      list(top_rec[~np.in1d(
          top_rec,
          interactions.loc[user, 'train'])])[:10])




  0%|          | 0/671 [00:00<?, ?it/s]

In [305]:
interactions['pred_svd'] = pred_svd

Посчитаем значение метрики MAP@10 для SVD-подхода.

In [306]:
calc_precision('pred_svd')

np.float64(0.022652757078986587)