In [1]:
import os
import sys
import typing as tp

import joblib
import numpy as np
import pandas as pd
import tqdm.notebook
from recsys.datasets import ml1m, ml100k
from sklearn.preprocessing import LabelEncoder

In [2]:
# Enable IPython's autoreload extension so that edits to imported local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
# Load the ratings and movies tables (presumably MovieLens 100k, judging by the
# loader's name - confirm against recsys.datasets).
ratings, movies = ml100k.load()
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userid,itemid,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [4]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,itemid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Preprocessing

In [5]:
def ids_encoder(ratings):
    """Re-index user and item ids to contiguous integers starting at 0.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Ratings table with at least the columns ``userid`` and ``itemid``.

    Returns
    -------
    tuple
        ``(encoded, uencoder, iencoder)`` where ``encoded`` is a copy of
        ``ratings`` whose ``userid`` and ``itemid`` columns hold the encoded
        ids, and ``uencoder`` / ``iencoder`` are the fitted ``LabelEncoder``
        objects for users and items respectively.
    """
    # Work on a copy so the caller's frame keeps its original ids; re-running
    # the calling cell then never depends on an already-mutated input.
    encoded = ratings.copy()

    # create users and items encoders
    uencoder = LabelEncoder()
    iencoder = LabelEncoder()

    # LabelEncoder collects and sorts the distinct values itself, so the ids
    # can be fitted and encoded in a single pass without pre-sorting.
    encoded["userid"] = uencoder.fit_transform(encoded["userid"].to_numpy())
    encoded["itemid"] = iencoder.fit_transform(encoded["itemid"].to_numpy())

    return encoded, uencoder, iencoder

In [12]:
# Fit the user/item id encoders and rebind `ratings` to the table returned by
# ids_encoder (ids replaced by their encoded values), then preview it.
ratings, uencoder, iencoder = ids_encoder(ratings)
ratings.head()

Unnamed: 0,userid,itemid,rating
0,0,0,5
1,0,1,3
2,0,2,4
3,0,3,3
4,0,4,3


## 1 Задание

### Коэффициент Жаккара

In [6]:
def jaccard_similarity(user1, user2, user_ratings=None):
    """Jaccard index of the sets of movies rated by two users.

    Parameters
    ----------
    user1, user2 : hashable
        User ids to compare.
    user_ratings : mapping or pandas.DataFrame, optional
        Rating data. Either a nested mapping ``{user: {movie: rating}}`` or a
        long-format frame with ``userid`` and ``itemid`` columns. When omitted,
        the notebook-level ``ratings`` variable is used.

    Returns
    -------
    float
        ``|common| / |union|`` of the two users' rated-movie sets, or ``0.0``
        when neither user has rated anything.
    """
    # Only touch the notebook-level variable when no data was passed in.
    source = ratings if user_ratings is None else user_ratings

    def rated_movies(user):
        # Long-format frame: one row per (userid, itemid) pair.
        if isinstance(source, pd.DataFrame) and {"userid", "itemid"}.issubset(source.columns):
            return set(source.loc[source["userid"] == user, "itemid"])
        # Nested mapping: source[user] maps movie id -> rating.
        return set(source[user].keys())

    movies_user1 = rated_movies(user1)
    movies_user2 = rated_movies(user2)

    all_movies = movies_user1 | movies_user2
    # Two users without any ratings would otherwise divide by zero.
    if not all_movies:
        return 0.0

    return len(movies_user1 & movies_user2) / len(all_movies)

### Скалярное произведение общих рейтингов

In [None]:
def dot_product_similarity(user1, user2, user_ratings=None):
    """Dot product of two users' ratings over the movies both have rated.

    Parameters
    ----------
    user1, user2 : hashable
        User ids to compare.
    user_ratings : mapping or pandas.DataFrame, optional
        Rating data. Either a nested mapping ``{user: {movie: rating}}`` or a
        long-format frame with ``userid``, ``itemid`` and ``rating`` columns.
        When omitted, the notebook-level ``ratings`` variable is used.

    Returns
    -------
    number
        Sum of ``rating1 * rating2`` over the co-rated movies; ``0`` when the
        users share no movie.
    """
    # Only touch the notebook-level variable when no data was passed in.
    source = ratings if user_ratings is None else user_ratings

    def ratings_of(user):
        # Long-format frame: collect this user's rows into {movie: rating}.
        if isinstance(source, pd.DataFrame) and {"userid", "itemid", "rating"}.issubset(source.columns):
            rows = source.loc[source["userid"] == user]
            return dict(zip(rows["itemid"], rows["rating"]))
        # Nested mapping: source[user] already maps movie id -> rating.
        return source[user]

    ratings_user1 = ratings_of(user1)
    ratings_user2 = ratings_of(user2)

    common_movies = set(ratings_user1.keys()).intersection(ratings_user2.keys())

    return sum(ratings_user1[movie] * ratings_user2[movie] for movie in common_movies)

### Скорректированная корреляция Пирсона

In [None]:
def pearson_correlation(user1, user2, user_ratings=None, min_common=50):
    """Pearson correlation of two users' co-rated movies, damped for small overlap.

    The correlation is computed only over movies both users have rated (each
    user's mean is taken over those movies), then multiplied by
    ``min(1, n_common / min_common)`` so that pairs with few co-rated movies
    get a proportionally smaller similarity.

    Parameters
    ----------
    user1, user2 : hashable
        User ids to compare.
    user_ratings : mapping or pandas.DataFrame, optional
        Rating data. Either a nested mapping ``{user: {movie: rating}}`` or a
        long-format frame with ``userid``, ``itemid`` and ``rating`` columns.
        When omitted, the notebook-level ``ratings`` variable is used.
    min_common : int, optional
        Number of co-rated movies at which the damping factor reaches 1.
        Must be positive. Defaults to 50.

    Returns
    -------
    float
        Damped correlation in ``[-1, 1]``; ``0.0`` when the users share no
        movie or when either user's co-rated ratings have zero variance.
    """
    # Only touch the notebook-level variable when no data was passed in.
    source = ratings if user_ratings is None else user_ratings

    def ratings_of(user):
        # Long-format frame: collect this user's rows into {movie: rating}.
        if isinstance(source, pd.DataFrame) and {"userid", "itemid", "rating"}.issubset(source.columns):
            rows = source.loc[source["userid"] == user]
            return dict(zip(rows["itemid"], rows["rating"]))
        # Nested mapping: source[user] already maps movie id -> rating.
        return source[user]

    all_ratings_user1 = ratings_of(user1)
    all_ratings_user2 = ratings_of(user2)

    common_movies = [movie for movie in all_ratings_user1 if movie in all_ratings_user2]
    # Without overlap there is nothing to correlate (and the mean of an empty
    # array would be NaN).
    if not common_movies:
        return 0.0

    ratings_user1 = np.array([all_ratings_user1[movie] for movie in common_movies], dtype=float)
    ratings_user2 = np.array([all_ratings_user2[movie] for movie in common_movies], dtype=float)

    centered_user1 = ratings_user1 - ratings_user1.mean()
    centered_user2 = ratings_user2 - ratings_user2.mean()

    denominator = np.sqrt(np.dot(centered_user1, centered_user1) * np.dot(centered_user2, centered_user2))
    # Zero variance on either side leaves the correlation undefined; treat as 0.
    if denominator == 0:
        return 0.0

    pearson_corr = float(np.dot(centered_user1, centered_user2) / denominator)

    # The damping uses the NUMBER of co-rated movies, not the collection itself.
    return pearson_corr * min(1, len(common_movies) / min_common)