In [1393]:
import os
import sys
import typing as tp

import joblib
import numpy as np
import pandas as pd
import tqdm.notebook
from recsys.datasets import ml1m, ml100k
from sklearn.preprocessing import LabelEncoder

In [1394]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1395]:
ratings, movies = ml100k.load()
ratings.head()

Unnamed: 0,userid,itemid,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [1396]:
movies.head()

Unnamed: 0,itemid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [1397]:
my_favourite_movies = [1, 96, 100, 23, 29, 50, 67, 72, 89, 95, 98, 121, 127, 135, 151, 161, 178, 181, 196, 204, 210, 225, 222, 227, 228, 229, 230, 231, 250]

my_favourite_movies5 = [1, 29, 50, 67, 95, 98, 121, 127, 178, 181, 225, 222, 227, 231, 250]

my_favourite_movies4 = [96, 100, 23, 72, 89, 135, 151, 161, 196, 204, 210, 228, 229, 231]
assert len(my_favourite_movies4) + len(my_favourite_movies5) == len(my_favourite_movies)

my_user_index = max(ratings['userid']) + 1
for id in my_favourite_movies5:
    new_rows = pd.DataFrame({
        'userid': [my_user_index],
        'itemid': [id],
        'rating': [5]
    }, index=[0])
    ratings = pd.concat([ratings, new_rows], ignore_index=True)
for id in my_favourite_movies4:
    new_rows = pd.DataFrame({
        'userid': [my_user_index],
        'itemid': [id],
        'rating': [4]
    }, index=[0])
    ratings = pd.concat([ratings, new_rows], ignore_index=True)
ratings

Unnamed: 0,userid,itemid,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
100024,944,204,4
100025,944,210,4
100026,944,228,4
100027,944,229,4


### Preprocessing

In [1398]:
def ids_encoder(ratings):
    users = sorted(ratings["userid"].unique())
    items = sorted(ratings["itemid"].unique())

    # create users and items encoders
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()

    # fit users and items ids to the corresponding encoder
    uencoder.fit(users)
    iencoder.fit(items)

    # encode userids and itemids
    ratings.userid = uencoder.transform(ratings.userid.tolist())
    ratings.itemid = iencoder.transform(ratings.itemid.tolist())

    return ratings, uencoder, iencoder

In [1399]:
# create the encoder
ratings, uencoder, iencoder = ids_encoder(ratings)
ratings

Unnamed: 0,userid,itemid,rating
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3
...,...,...,...
100024,943,203,4
100025,943,209,4
100026,943,227,4
100027,943,228,4


In [1400]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings, test_size=0.2)
np_ratings = train_data.to_numpy()
all_ratings = ratings.to_numpy()

## 1 Задание

In [1401]:
from functools import lru_cache

@lru_cache(5000)
def ratings_for_user(i):
    return all_ratings[all_ratings[:, 0] == i]

### Коэффициент Жаккара

In [1402]:
def jaccard_similarity(np_ratings, i: int, j: int) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_user(i), ratings_for_user(j)
    movies_user1 = ratings_i[:, 1]
    movies_user2 = ratings_j[:, 1]

    common_movies = np.intersect1d(movies_user1, movies_user2)
    all_movies = np.union1d(movies_user1, movies_user2)

    jaccard_index = len(common_movies) / len(all_movies)

    return jaccard_index

assert np.isclose(jaccard_similarity(np_ratings, 0, 0), 1.0)
print(jaccard_similarity(np_ratings, 0, 1))

0.056962025316455694


### Скалярное произведение общих рейтингов

In [1403]:
def dot_product_similarity(np_ratings, i: int, j: int) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_user(i), ratings_for_user(j)

    common_movies = np.intersect1d(ratings_i[:, 1], ratings_j[:, 1])

    common_ratings_i = ratings_i[np.isin(ratings_i[:, 1], common_movies)]
    common_ratings_j = ratings_j[np.isin(ratings_j[:, 1], common_movies)]

    x = common_ratings_i[:, 2]
    y = common_ratings_j[:, 2]

    return np.dot(x, y)

print(dot_product_similarity(np_ratings, 0, 1))
assert np.isclose(dot_product_similarity(np_ratings, 0, 0), 1.0)

319


### Скорректированная Кореляция Пирсона

In [1404]:
def pearson_correlation(np_ratings, i: int, j: int) -> float:
    if i == j:
        return 1.0

    ratings_i, ratings_j = ratings_for_user(i), ratings_for_user(j)

    mean_user1 = np.mean(ratings_i[:, 2])
    mean_user2 = np.mean(ratings_j[:, 2])

    numerator = sum([(rating_user1 - mean_user1) * (rating_user2 - mean_user2) for rating_user1, rating_user2 in zip(ratings_i[:, 2], ratings_j[:, 2])])
    denominator = np.sqrt(sum([(rating_user1 - mean_user1)**2 for rating_user1 in ratings_i[:, 2]]) * sum([(rating_user2 - mean_user2)**2 for rating_user2 in ratings_j[:, 2]]))

    pearson_corr = numerator / denominator if denominator != 0 else numerator

    common_movies = np.intersect1d(ratings_i[:, 1], ratings_j[:, 1])
    coef = min(1, len(common_movies) / 50)
    if np.isclose(coef, 0.0):
        coef = 1
    return pearson_corr * coef

print(pearson_correlation(np_ratings, 0, 1))
print(pearson_correlation(np_ratings, 2, 3))
assert np.isclose(pearson_correlation(np_ratings, 0, 0), 1.0)

-0.0017341664853389787
-0.055524917387262014


### 2 Задание

### Простое усреднение по ближайшим соседям

In [1405]:
def get_neighbors(np_ratings, user, movie_id, similarity_func, k=50):
    users_similarity = []
    users = np.unique(np_ratings[np_ratings[:, 1] == movie_id][:, 0])
    for cur_user in users:
        if cur_user != user:
            similarity = similarity_func(np_ratings, cur_user, user)
            users_similarity.append((cur_user, similarity))

    users_similarity = sorted(users_similarity, key=lambda x: x[1], reverse=True)
    k = min(k, len(users_similarity))
    top_k_neighbors = users_similarity[:k]

    return top_k_neighbors

get_neighbors(np_ratings, 0, 0, pearson_correlation)

[(215, 0.19198244073602577),
 (289, 0.15562120535833365),
 (902, 0.14563465544661258),
 (880, 0.13499841828857773),
 (159, 0.12701750601913794),
 (338, 0.11971270022230987),
 (762, 0.11821862633443757),
 (304, 0.11219384099355088),
 (842, 0.11190864773468465),
 (291, 0.11101521774217658),
 (5, 0.10884336960670629),
 (882, 0.09855213929887945),
 (886, 0.0962804110066975),
 (649, 0.09481845681661784),
 (57, 0.0933495883242139),
 (587, 0.09077879151430997),
 (535, 0.08639127789745854),
 (477, 0.08456780665622526),
 (683, 0.08150522168254352),
 (349, 0.0765146463935229),
 (664, 0.07640379163899759),
 (544, 0.07526396656320619),
 (772, 0.07500489359490821),
 (748, 0.07480211457266728),
 (793, 0.07247636462903895),
 (647, 0.07030212871394699),
 (342, 0.06901479361953161),
 (326, 0.06825960537811308),
 (660, 0.06814210394356963),
 (720, 0.06798551871960856),
 (641, 0.06692259858810187),
 (531, 0.06580252835617872),
 (653, 0.06465761868971755),
 (293, 0.06444432619905027),
 (926, 0.06362401085

In [1406]:
np_ratings

array([[238, 227,   2],
       [  0, 266,   4],
       [157, 297,   3],
       ...,
       [676,   6,   4],
       [ 10, 214,   3],
       [416, 746,   3]])

In [1407]:
from functools import lru_cache

@lru_cache(5000)
def movie_rating_for_user(user, movie_id) -> float:
    ratings = ratings_for_user(user)
    return ratings[ratings[:, 1] == movie_id][0][2]

assert movie_rating_for_user(917, 198) == 3

In [1408]:
def rating_predictions_1(np_ratings, user, movie_id, similarity_func, k=50):
   neighbors = get_neighbors(np_ratings, user, movie_id, similarity_func, k)

   numerator = sum([similarity * movie_rating_for_user(user1, movie_id) for user1, similarity in neighbors])
   denominator = sum([abs(similarity) for user1, similarity in neighbors])

   return numerator / denominator

rating_predictions_1(np_ratings, 0, 0, pearson_correlation)

4.111112970231002

### Усреднение с учётом коррекции среднего

In [1409]:
def rating_predictions_2(np_ratings, user, movie_id, similarity_func, k=50):
   neighbors = get_neighbors(np_ratings, user, movie_id, similarity_func, k)

   ratings = ratings_for_user(user)
   mean_user = np.mean(ratings[:, 2])

   numerator = sum([similarity * (movie_rating_for_user(user1, movie_id) - np.mean(ratings_for_user(user1)[:, 2])) for user1, similarity in neighbors])
   denominator = sum([abs(similarity) for user1, similarity in neighbors])

   if denominator == 0.0:
       return mean_user

   return mean_user + (numerator / denominator)

rating_predictions_2(np_ratings, 0, 0, pearson_correlation)

4.087617000136576

### 3 Задание

### Разделите датасет movielens на тренировочную и валидационную части. Постройте рекомендации для пользователей из валидационной части

In [1410]:
test_data.head()

Unnamed: 0,userid,itemid,rating
69343,633,146,2
476,4,225,3
6125,58,217,5
21196,215,128,4
67723,617,722,3


In [1411]:
def recommend_movies(np_ratings, user_id, item_id):
    return rating_predictions_2(np_ratings, user_id, item_id, pearson_correlation)

In [None]:
test_data['rec_rating'] = test_data.apply(lambda row: recommend_movies(np_ratings, row['userid'], row['itemid']), axis=1)
test_data['rating'] = test_data.apply(lambda row: float(row['rating']), axis=1)

test_data.head()

### Метрики для предсказаний

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error


rmse = np.sqrt(mean_squared_error(test_data['rating'], test_data['rec_rating']))
mae = mean_absolute_error(test_data['rating'], test_data['rec_rating'])

test_data['predicted_class'] = test_data['rec_rating'].round(0).astype(int)

precision = precision_score(test_data['rating'], test_data['predicted_class'], average = 'weighted')
recall = recall_score(test_data['rating'], test_data['predicted_class'], average = 'weighted')
f1 = f1_score(test_data['rating'], test_data['predicted_class'], average = 'weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
test_data

### 4 Задание

### Выберите от 10 до 50 своих любимых фильмов

In [None]:
ratings, movies = ml100k.load()
my_user_index
ratings

In [None]:
my_favourite_movies = [1, 96, 100, 23, 29, 50, 67, 72, 89, 95, 98, 121, 127, 135, 151, 161, 178, 181, 196, 204, 210, 225, 222, 227, 228, 229, 230, 231, 250]

In [None]:
np_movies = movies.to_numpy()
np_movies

In [None]:
ratings, uencoder, iencoder = ids_encoder(ratings)
np_ratings = ratings.to_numpy()
np_ratings

### Топ-10 рекомендаций по каждому из 6 методов

In [None]:
def candidate_items(
    np_ratings: np.array, userid: int, k=-1
) -> tp.Tuple[np.array, np.array]:
    user_movies = np_ratings[np_ratings[:, 0] == userid][:, 1]
    all_movies = np.unique(np_ratings[:, 1])

    return np.setdiff1d(all_movies, user_movies)

In [None]:
candidates = candidate_items(np_ratings, my_user_index - 1)

print("Candidates:", len(candidates))
candidates

In [None]:
def topn_recommendations(rating_predictions, similarity_func, k = 10):
    candidates_with_ratings = []
    for id in candidates:
        candidates_with_ratings.append((rating_predictions(np_ratings, my_user_index - 1, id, similarity_func), id))
    candidates_with_ratings = sorted(candidates_with_ratings, reverse=True)
    return candidates_with_ratings[:k]

In [None]:
jaccard_similarity(np_ratings, my_user_index - 1, 6)

In [None]:
top1 = topn_recommendations(rating_predictions_1, jaccard_similarity)
top2 = topn_recommendations(rating_predictions_1, dot_product_similarity)
top3 = topn_recommendations(rating_predictions_1, pearson_correlation)
top4 = topn_recommendations(rating_predictions_2, jaccard_similarity)
top5 = topn_recommendations(rating_predictions_2, dot_product_similarity)
top6 = topn_recommendations(rating_predictions_2, pearson_correlation)