In [1]:
import os
import shutil
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import mean_squared_error
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
movie_data = pd.read_csv('movielensTest/movies.csv')
display(movie_data.head())
link_data = pd.read_csv('movielensTest/links.csv')
display(link_data.head())
rating_data = pd.read_csv('movielensTest/ratings.csv')
display(rating_data.head())
tags_data = pd.read_csv('movielensTest/tags.csv')
display(tags_data.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


# 1.Collaborative Filtering with Matrix Factorization (from Scratch)

На этом этапе мы собираемся выбрать пользователей, которые взаимодействовали как минимум с 2000 фильмами и фильмы были оценены 1000 пользователями (это поможет уменьшить размер таблицы, потому что у меня ограничены ресурсы для вычисления общей таблицы).

In [7]:
n_interacted = 2000
user_movie_data_temp = pd.pivot_table(rating_data, index = ['userId'], values='movieId', aggfunc='count')
user_movie_data_temp[user_movie_data_temp.movieId>=n_interacted]
selected_user_ids = user_movie_data_temp[user_movie_data_temp.movieId>=n_interacted].index
print('number of userIds: ', str(len(selected_user_ids)))

n_rated = 1000
get_rated_movie = pd.pivot_table(rating_data, index=['movieId'], values='userId', aggfunc='count')
get_rated_movie[get_rated_movie.userId>=n_rated]
selected_movie_ids = get_rated_movie[get_rated_movie.userId>=n_rated].index

print('numbser of movieIds: ', str(len(selected_movie_ids)))

filtered_rating_data = rating_data[(rating_data['userId'].isin(selected_user_ids)) &(rating_data['movieId'].isin(selected_movie_ids))]
filtered_rating_data['movieId'] = filtered_rating_data['movieId'].apply(lambda x: 'm_'+str(x))
# filtered_rating_data['user_movie'] = filtered_rating_data['userId'].astype(str) + '_' + filtered_rating_data['movieId'].astype(str)

print('raw data shape.  : ',str(filtered_rating_data.shape))

number of userIds:  255
numbser of movieIds:  3159
raw data shape.  :  (409728, 4)


In [12]:
filtered_rating_data

Unnamed: 0,datetime,userId,movieId,rating
0,2022-01-12 06:15:00,802,1,1
1,2022-01-19 09:44:00,1350,1,1
2,2022-01-21 17:00:00,914,1,1
3,2022-01-25 17:35:00,914,1,1
4,2022-01-26 19:17:00,1350,1,1
...,...,...,...,...
3712,2024-09-05 15:11:00,16447,131,1
3713,2024-09-05 17:18:00,23917,131,1
3714,2024-09-05 17:18:00,23917,129,1
3715,2024-09-06 11:45:00,10948,129,1


In [13]:
print(filtered_rating_data['userId'].value_counts())
# Filter out userIds with only one occurrence
user_counts = filtered_rating_data['userId'].value_counts()
valid_users = user_counts[user_counts > 1].index
filtered_rating_data = filtered_rating_data[filtered_rating_data['userId'].isin(valid_users)]

userId
3443     152
16294    118
11341     83
11978     82
11668     74
        ... 
9520       1
9429       1
10907      1
9511       1
10948      1
Name: count, Length: 370, dtype: int64


In [14]:
filtered_rating_data

Unnamed: 0,datetime,userId,movieId,rating
0,2022-01-12 06:15:00,802,1,1
1,2022-01-19 09:44:00,1350,1,1
2,2022-01-21 17:00:00,914,1,1
3,2022-01-25 17:35:00,914,1,1
4,2022-01-26 19:17:00,1350,1,1
...,...,...,...,...
3711,2024-09-05 15:11:00,16447,132,1
3712,2024-09-05 15:11:00,16447,131,1
3713,2024-09-05 17:18:00,23917,131,1
3714,2024-09-05 17:18:00,23917,129,1


In [15]:
#train_df, test_df =  train_test_split(filtered_rating_data, 
#                                   stratify = filtered_rating_data['userId'],
#                                   test_size = 0.2,
#                                   random_state = 42)

split_index = int(len(filtered_rating_data) * 0.9)

train_df = filtered_rating_data.iloc[:split_index]
test_df = filtered_rating_data.iloc[split_index:]
print(f"Всего при разделении Train: {len(train_df)}, Test: {len(test_df)}")

train_users = set(train_df['userId'])

test_df = test_df[test_df['userId'].isin(train_users)]

missing_users = set(test_df['userId']) - train_users

if missing_users:
    missing_data = test_df[test_df['userId'].isin(missing_users)]
    train_df = pd.concat([train_df, missing_data])
    test_df = test_df[~test_df['userId'].isin(missing_users)]

print(f"При проверке Train: {len(train_df)}, Test: {len(test_df)}")


print('train_df size:{}'.format(len(train_df)))
print('test_df size:{}'.format(len(test_df)))

Всего при разделении Train: 3246, Test: 361
При проверке Train: 3246, Test: 237
train_df size:3246
test_df size:237


In [16]:
duplicates = train_df[train_df.duplicated(subset=['userId', 'movieId'], keep=False)]
print(duplicates)
# Aggregate duplicates by taking the mean rating
train_df_agg = train_df.groupby(['userId', 'movieId'], as_index=False).agg({'rating': 'mean'})

                 datetime  userId  movieId  rating
0     2022-01-12 06:15:00     802        1       1
1     2022-01-19 09:44:00    1350        1       1
2     2022-01-21 17:00:00     914        1       1
3     2022-01-25 17:35:00     914        1       1
4     2022-01-26 19:17:00    1350        1       1
...                   ...     ...      ...     ...
3347  2024-07-12 12:15:00   16599       95       1
3348  2024-07-12 14:16:00   16294      131       1
3349  2024-07-13 00:51:00   16294       95       1
3350  2024-07-13 21:17:00   16495      123       1
3351  2024-07-13 22:34:00   16294      131       1

[1903 rows x 4 columns]


In [17]:
user_movie_data_train = train_df_agg.pivot(index='userId', columns='movieId', values='rating').fillna(0.0)
user_movie_data_train

movieId,1,2,4,5,6,8,9,10,11,12,...,126,127,128,129,130,131,132,133,134,135
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
914,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def matrix_factorization(R, K, steps=5, alpha=0.002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter
    
    '''
 
    P = np.random.rand(len(R),K)
    Q = np.random.rand(len(R[0]),K)
    Q = Q.T

    for step in range(steps):
        print('Processing epoch {}'.format(step))
        
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)

        e = 0

        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        if e < 0.001:

            break

    return P, Q.T

In [19]:
R = np.array(user_movie_data_train)
nP, nQ = matrix_factorization(R, K=10)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


In [20]:
pred_R = np.dot(nP, nQ.T)

# Transforming prediction to reconstructed matrix back into a Pandas dataframe in cross-tabural format
user_movie_pred = pd.DataFrame(pred_R, columns=user_movie_data_train.columns, index=list(user_movie_data_train.index))
print(user_movie_pred.shape)
user_movie_pred.head(10)

(243, 108)


movieId,1,2,4,5,6,8,9,10,11,12,...,126,127,128,129,130,131,132,133,134,135
802,1.256643,1.744846,2.189877,1.745471,1.44049,1.20576,1.740896,1.705018,1.262374,1.491976,...,2.301973,1.584791,1.104141,1.835873,1.504608,1.578356,1.715002,0.990336,2.121979,1.715448
836,2.430435,2.521962,2.54813,2.146609,2.670429,1.742039,2.526218,2.876885,1.831415,1.831426,...,3.010173,2.136888,2.099231,2.246414,2.222929,1.915745,2.427823,1.474284,2.454963,2.384075
914,2.121235,2.045232,2.07031,1.745324,2.133,1.482403,2.225934,2.895881,1.882293,1.327142,...,2.277155,2.020391,1.488671,1.682259,2.17179,1.42126,1.687406,1.067748,1.748012,1.556484
927,1.947706,2.727987,2.268665,2.703566,2.783142,1.722119,2.10732,2.98528,1.53266,1.515528,...,2.552504,2.87529,1.583715,1.976691,2.30455,2.07244,2.038914,1.657043,2.753821,2.214285
1016,1.693964,1.880113,2.345785,2.08394,1.598261,1.211244,1.629232,2.162396,1.542561,1.543318,...,2.340535,2.012599,1.082994,1.859832,1.972193,1.48867,1.353156,0.822253,2.258061,1.461458
1025,1.381906,1.967747,2.156805,1.954647,1.784123,1.365472,2.165592,2.746797,1.673136,1.235388,...,2.022664,2.191454,0.902032,1.664168,2.063227,1.689788,1.58728,1.064264,2.009219,1.649788
1305,2.198488,2.494468,2.870787,2.353823,2.517686,1.925141,2.217505,3.176698,1.946967,2.016375,...,2.698196,2.556303,1.479233,2.429827,2.45687,1.660973,2.179862,1.209728,2.734832,1.867874
1307,1.835849,2.138658,2.329773,2.218267,2.065705,1.347615,1.58831,2.286623,1.279007,1.616591,...,2.286987,2.435306,1.092491,2.028476,2.040442,1.49935,1.570256,0.999197,2.69708,1.794007
1313,1.40309,1.689813,1.556603,1.208685,1.465538,1.586297,1.422709,1.502979,1.379707,1.204843,...,2.081396,1.383731,1.271149,1.485398,1.197114,0.893221,1.622012,1.120088,1.296565,0.733321
1350,1.719256,2.13714,2.287836,2.320869,2.367235,1.162852,2.253797,3.261367,1.507575,1.35954,...,1.975548,2.43587,1.172926,1.696065,2.376217,1.993462,1.701761,1.109816,2.421191,2.345671


In [21]:
# User Matrix
Pu = pd.DataFrame(nP, index=list(user_movie_data_train.index))
# Movie Matrix
Qu = pd.DataFrame(nQ, index=user_movie_data_train.columns)

# This matrix can be used independently to predict testing dataset

In [22]:
def predict_rating(data):
    try:
        pred_rating = np.dot(Pu.loc[data.userId], Qu.loc[data.movieId].T)
    except Exception as e:
        pred_rating = np.nan
        print('Unknown user: {} or movieId: {}'.format(data.userId,data.movieId))
    return pred_rating

In [23]:
test_df['pred_rating'] = test_df.apply(predict_rating, axis=1)
test_df

Unnamed: 0,datetime,userId,movieId,rating,pred_rating
3352,2024-07-13 22:43:00,16156,123,1,1.292084
3353,2024-07-14 12:03:00,11506,130,1,2.025847
3354,2024-07-14 12:04:00,11506,127,1,2.277971
3355,2024-07-14 12:04:00,11506,113,1,1.189858
3356,2024-07-14 15:06:00,802,119,1,1.355565
...,...,...,...,...,...
3708,2024-09-03 14:25:00,23917,129,1,1.555607
3710,2024-09-04 18:07:00,23917,129,1,1.555607
3713,2024-09-05 17:18:00,23917,131,1,1.092122
3714,2024-09-05 17:18:00,23917,129,1,1.555607


In [24]:
print(test_df['rating'].isna().sum())      # Count NaNs in actual ratings
print(test_df['pred_rating'].isna().sum())  # Count NaNs in predicted ratings
test_df_cleaned = test_df.dropna(subset=['rating', 'pred_rating'])

0
0


In [25]:
rmse_test = mean_squared_error(test_df_cleaned['rating'], test_df_cleaned['pred_rating'], squared=False)
rmse_test



0.66845303587741

In [26]:
from sklearn.metrics import precision_score

In [35]:
test_df['match'] = test_df['rating'].eq(round(test_df['pred_rating']))

count = test_df['match'].sum()
print(len(test_df), count)
print((count / len(test_df)) * 100)

237 132
55.69620253164557


# 2.Collaborative Filtering using Surprise Package

In [20]:
#pip install scikit-surprise

In [39]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split

In [40]:
filtered_rating_data = pd.read_csv('data_out.csv')

In [41]:
filtered_rating_data = filtered_rating_data.rename(columns={"user_id": "userId", "item_id": "movieId", "weight":"rating"})
filtered_rating_data

Unnamed: 0,datetime,userId,movieId,rating
0,2022-01-12 06:15:00,802,1,1
1,2022-01-19 09:44:00,1350,1,1
2,2022-01-21 17:00:00,914,1,1
3,2022-01-25 17:35:00,914,1,1
4,2022-01-26 19:17:00,1350,1,1
...,...,...,...,...
3712,2024-09-05 15:11:00,16447,131,1
3713,2024-09-05 17:18:00,23917,131,1
3714,2024-09-05 17:18:00,23917,129,1
3715,2024-09-06 11:45:00,10948,129,1


In [79]:
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(filtered_rating_data[['userId','movieId','rating']], reader)

trainset, testset = train_test_split(data, test_size=0.25, shuffle=False)

# We'll use the famous SVD (one of matrix factorization) algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
evaluation = cross_validate(algo, data, measures=['RMSE','MAE'], cv= 5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2828  0.2980  0.3010  0.3100  0.2889  0.2961  0.0095  
MAE (testset)     0.1728  0.1800  0.1797  0.1869  0.1726  0.1784  0.0053  
Fit time          0.03    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.01    0.00    0.01    0.01    0.00    0.01    0.00    


In [80]:
def get_top_n(predictions, n=5):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [81]:
top_n = get_top_n(predictions, 10)

for uid, user_ratings in top_n.items():
    print(uid, [mid for (mid, _) in user_ratings])

12016 [116, 116, 95, 115, 115, 115, 115, 115]
2722 [90, 108, 108, 107, 89, 84, 84, 84, 92, 131]
11110 [108, 81, 117]
11109 [89, 89, 89, 89, 89]
10669 [89]
3327 [113, 113, 112]
8735 [117, 82, 82, 107, 107, 104, 100, 113, 116, 116]
19011 [117, 116, 116, 116]
3363 [116, 113, 117]
23693 [113]
10701 [95, 108, 107]
11021 [116, 95, 95, 101, 101, 86, 120, 108, 108, 108]
21589 [110]
16259 [116, 116, 116, 116, 116, 116, 116, 116, 116, 119]
15882 [122, 121, 88, 98, 103, 84]
9560 [116, 109]
1305 [5, 123, 114, 48]
914 [72, 48]
23772 [103]
18314 [116, 101, 86, 82, 91, 110, 100, 113, 115, 81]
17569 [116, 107, 115, 117]
16683 [95, 95, 86, 86, 86, 119, 120, 130, 120, 108]
23854 [118]
836 [45]
12461 [95, 95]
3435 [83, 107, 115]
8548 [108, 81]
1313 [119, 115, 81, 48]
23858 [121, 122, 122, 122, 122, 122, 121, 122]
23859 [123, 123, 123, 123, 123]
3432 [116, 108, 95, 122, 123, 125, 125, 124, 124, 131]
10952 [100]
22684 [116, 116, 119, 120, 122, 121, 127, 102, 110, 113]
23915 [125, 124]
16722 [119]
16995 [11

In [82]:
elements = {}
for uid, user_ratings in top_n.items():
    elements[uid] = [mid for (mid, _) in user_ratings]

elements

{12016: [116, 116, 95, 115, 115, 115, 115, 115],
 2722: [90, 108, 108, 107, 89, 84, 84, 84, 92, 131],
 11110: [108, 81, 117],
 11109: [89, 89, 89, 89, 89],
 10669: [89],
 3327: [113, 113, 112],
 8735: [117, 82, 82, 107, 107, 104, 100, 113, 116, 116],
 19011: [117, 116, 116, 116],
 3363: [116, 113, 117],
 23693: [113],
 10701: [95, 108, 107],
 11021: [116, 95, 95, 101, 101, 86, 120, 108, 108, 108],
 21589: [110],
 16259: [116, 116, 116, 116, 116, 116, 116, 116, 116, 119],
 15882: [122, 121, 88, 98, 103, 84],
 9560: [116, 109],
 1305: [5, 123, 114, 48],
 914: [72, 48],
 23772: [103],
 18314: [116, 101, 86, 82, 91, 110, 100, 113, 115, 81],
 17569: [116, 107, 115, 117],
 16683: [95, 95, 86, 86, 86, 119, 120, 130, 120, 108],
 23854: [118],
 836: [45],
 12461: [95, 95],
 3435: [83, 107, 115],
 8548: [108, 81],
 1313: [119, 115, 81, 48],
 23858: [121, 122, 122, 122, 122, 122, 121, 122],
 23859: [123, 123, 123, 123, 123],
 3432: [116, 108, 95, 122, 123, 125, 125, 124, 124, 131],
 10952: [100],

In [83]:
count_of_right = 0

for iteraction in testset:
    if iteraction[1] in (elements.get(iteraction[0])):
        count_of_right+=1

print(len(testset), count_of_right)
print((count_of_right / len(testset)) * 100)

930 608
65.3763440860215
