In [552]:
import os
import sys
import typing as tp

import joblib
import numpy as np
import pandas as pd
import tqdm.notebook
from recsys.datasets import ml1m, ml100k
from sklearn.preprocessing import LabelEncoder

In [553]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [554]:
ratings, movies = ml100k.load()
ratings.head()

Unnamed: 0,userid,itemid,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [555]:
movies.head()

Unnamed: 0,itemid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Preprocessing

In [556]:
def ids_encoder(ratings):
    users = sorted(ratings["userid"].unique())
    items = sorted(ratings["itemid"].unique())

    # create users and items encoders
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()

    # fit users and items ids to the corresponding encoder
    uencoder.fit(users)
    iencoder.fit(items)

    # encode userids and itemids
    ratings.userid = uencoder.transform(ratings.userid.tolist())
    ratings.itemid = iencoder.transform(ratings.itemid.tolist())

    return ratings, uencoder, iencoder

In [557]:
# create the encoder
ratings, uencoder, iencoder = ids_encoder(ratings)
ratings.head(1100)

Unnamed: 0,userid,itemid,rating
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3
...,...,...,...
1095,6,577,3
1096,6,578,4
1097,6,579,3
1098,6,580,5


In [558]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings, test_size=0.2)
np_ratings = train_data.to_numpy()

## 1 Задание

In [559]:
from functools import lru_cache

@lru_cache(2000)
def ratings_for_user(i):
    return np_ratings[np_ratings[:, 0] == i]

### Коэффициент Жаккара

In [560]:
def jaccard_similarity(i: int, j: int) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_user(i), ratings_for_user(j)
    movies_user1 = ratings_i[:, 1]
    movies_user2 = ratings_j[:, 1]

    common_movies = np.intersect1d(movies_user1, movies_user2)
    all_movies = np.union1d(movies_user1, movies_user2)

    jaccard_index = len(common_movies) / len(all_movies)

    return jaccard_index

assert np.isclose(jaccard_similarity(0, 0), 1.0)
print(jaccard_similarity(0, 1))

0.052


### Скалярное произведение общих рейтингов

In [561]:
def dot_product_similarity(i: int, j: int) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_user(i), ratings_for_user(j)

    common_movies = np.intersect1d(ratings_i[:, 1], ratings_j[:, 1])

    common_ratings_i = ratings_i[np.isin(ratings_i[:, 1], common_movies)]
    common_ratings_j = ratings_j[np.isin(ratings_j[:, 1], common_movies)]

    x = common_ratings_i[:, 2]
    y = common_ratings_j[:, 2]

    return np.dot(x, y)

print(dot_product_similarity(0, 1))
assert np.isclose(dot_product_similarity(0, 0), 1.0)

241


### Скорректированная Кореляция Пирсона

In [562]:
def pearson_correlation(i: int, j: int) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_user(i), ratings_for_user(j)

    mean_user1 = np.mean(ratings_i[:, 2])
    mean_user2 = np.mean(ratings_j[:, 2])

    numerator = sum([(rating_user1 - mean_user1) * (rating_user2 - mean_user2) for rating_user1, rating_user2 in zip(ratings_i[:, 2], ratings_j[:, 2])])
    denominator = np.sqrt(sum([(rating_user1 - mean_user1)**2 for rating_user1 in ratings_i[:, 2]]) * sum([(rating_user2 - mean_user2)**2 for rating_user2 in ratings_j[:, 2]]))

    pearson_corr = numerator / denominator if denominator != 0 else 0

    common_movies = np.intersect1d(ratings_i[:, 1], ratings_j[:, 1])

    return pearson_corr * min(1, len(common_movies) / 50)

print(pearson_correlation(0, 1))
print(pearson_correlation(2, 3))
assert np.isclose(pearson_correlation(0, 0), 1.0)

-0.03308099874512352
-0.0029569952570800253


### 2 Задание

### Простое усреднение по ближайшим соседям

In [563]:
def get_neighbors(user, movie_id, similarity_func, k=5):
    users_similarity = []
    users = np.unique(np_ratings[np_ratings[:, 1] == movie_id][:, 0])
    for cur_user in users:
        if cur_user != user:
            similarity = similarity_func(cur_user, user)
            users_similarity.append((cur_user, similarity))

    users_similarity = sorted(users_similarity, key=lambda x: x[1], reverse=True)
    k = min(k, len(users_similarity))
    top_k_neighbors = users_similarity[:k]

    return top_k_neighbors

get_neighbors(0, 0, pearson_correlation)

[(898, 0.21885347117195997),
 (243, 0.18069608507431043),
 (762, 0.14604613788906762),
 (649, 0.1435233061387521),
 (918, 0.1261670155330647)]

In [564]:
np_ratings

array([[ 428,  120,    3],
       [  17,  691,    3],
       [ 315,  729,    4],
       ...,
       [ 531,  618,    5],
       [ 495, 1073,    2],
       [ 415,  328,    3]])

In [567]:
@lru_cache(2000)
def movie_rating_for_user(user, movie_id) -> float:
    ratings = ratings_for_user(user)
    return ratings[ratings[:, 1] == movie_id][0][2]

assert movie_rating_for_user(428, 128) == 3

AssertionError: 

In [568]:
def rating_predictions_1(user, movie_id, similarity_func, k=50):
   neighbors = get_neighbors(user, movie_id, similarity_func, k)

   numerator = sum([similarity * movie_rating_for_user(user1, movie_id) for user1, similarity in neighbors])
   denominator = sum([abs(similarity) for user1, similarity in neighbors])

   return numerator / denominator

rating_predictions_1(0, 0, pearson_correlation)

3.9624130102812782

### Усреднение с учётом коррекции среднего

In [569]:
def rating_predictions_2(user, movie_id, similarity_func, k=50):
   neighbors = get_neighbors(user, movie_id, similarity_func, k)

   ratings = ratings_for_user(user)
   mean_user = np.mean(ratings[:, 2])

   numerator = sum([similarity * (movie_rating_for_user(user1, movie_id) - np.mean(ratings_for_user(user1)[:, 2])) for user1, similarity in neighbors])
   denominator = sum([abs(similarity) for user1, similarity in neighbors])

   if denominator == 0.0:
       return mean_user

   return mean_user + (numerator / denominator)

rating_predictions_2(0, 0, pearson_correlation)

4.005903065238828

### 3 Задание

### Разделите датасет movielens на тренировочную и валидационную части. Постройте рекомендации для пользователей из валидационной части

In [570]:
test_data.head()


Unnamed: 0,userid,itemid,rating
2127,12,668,1
83808,783,343,4
44800,404,356,5
5789,56,408,4
24963,253,264,3


In [571]:
def recommend_movies(user_id, item_id):
    return rating_predictions_2(user_id, item_id, pearson_correlation, 5)

In [576]:
test_data['rec_rating'] = test_data.apply(lambda row: recommend_movies(row['userid'], row['itemid']), axis=1)
test_data['rating'] = test_data.apply(lambda row: float(row['rating']), axis=1)

test_data.head()

Unnamed: 0,userid,itemid,rating,rec_rating
2127,12,668,1.0,2.191378
83808,783,343,4.0,3.902259
44800,404,356,5.0,2.818094
5789,56,408,4.0,3.474553
24963,253,264,3.0,3.333324


### Метрики для предсказаний

In [592]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error


rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['rec_rating']))
mae = mean_absolute_error(test_data['rating'], test_data['rec_rating'])

threshold = 0.5
test_data['predicted_class'] = test_data['rec_rating'].round(0).astype(int)

precision = precision_score(test_data['rating'], test_data['predicted_class'], average = 'weighted')
recall = recall_score(test_data['rating'], test_data['predicted_class'], average = 'weighted')
f1 = f1_score(test_data['rating'], test_data['predicted_class'], average = 'weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
test_data

Precision: 0.40
Recall: 0.38
F1-score: 0.36
RMSE: 1.043448668821877
MAE: 0.818823600373041


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,userid,itemid,rating,rec_rating,predicted_class
2127,12,668,1.0,2.191378,2
83808,783,343,4.0,3.902259,4
44800,404,356,5.0,2.818094,3
5789,56,408,4.0,3.474553,3
24963,253,264,3.0,3.333324,3
...,...,...,...,...,...
95626,895,709,4.0,2.692360,3
58403,520,158,3.0,3.496400,3
19176,195,339,3.0,4.235709,4
13184,127,14,4.0,3.589139,4
