In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
import math

In [None]:
# Column layout of the ratings table, kept for reference only: it is not
# passed to read_csv below.
column_names1 = ['userId','movieId','rating','timestamp']
# Load the user/movie ratings from the working directory.
ratings = pd.read_csv('ratings.csv')
ratings.head() 

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
ratings['title'] = None

In [None]:
movies = pd.read_csv('movies.csv')

In [None]:
# Attach each rating's movie title with a single vectorized lookup.
# The previous row-by-row loop scanned the whole `movies` frame once per rating
# and wrote a one-element Series into a scalar cell.
# drop_duplicates guarantees a unique movieId index, which Series.map requires.
title_by_movie_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
# Ratings whose movieId is absent from `movies` get NaN as their title.
ratings['title'] = ratings['movieId'].map(title_by_movie_id)

  0%|          | 0/100836 [00:00<?, ?it/s]

In [None]:
users = list(set(ratings['userId']))

In [None]:
train_df = pd.DataFrame(columns=list(ratings.columns))
test_df = pd.DataFrame(columns=list(ratings.columns))

In [None]:
# Hold out 20% of every user's ratings so each user appears in both splits.
# The per-user pieces are collected in lists and concatenated once at the end:
# concatenating inside the loop copies the growing frame on every iteration,
# and concatenating onto an empty frame can degrade the column dtypes.
train_parts = []
test_parts = []
for user_id in tqdm(users):
    user_ratings = ratings[ratings['userId'] == user_id]
    # Fixed seed so that Restart & Run All reproduces the same split.
    user_train, user_test = train_test_split(user_ratings, test_size=0.2, random_state=42)
    train_parts.append(user_train)
    test_parts.append(user_test)

train_df = pd.concat(train_parts)
test_df = pd.concat(test_parts)

  0%|          | 0/610 [00:00<?, ?it/s]

In [None]:
# Replace the row labels inherited from `ratings` with a fresh 0..n-1 index
# (the old labels are discarded, not kept as a column).
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Persist both splits to CSV in the working directory; index=False leaves the
# row index out of the files.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [None]:
# Keep only the three columns the collaborative filter needs.
train_data, test_data = (
    frame[['userId', 'movieId', 'rating']] for frame in (train_df, test_df)
)

User to User Collaborative Filtering

In [None]:
# NOTE(review): the similarity matrix here is built from the held-out ratings
# themselves, and the predictions derived from it are later scored against
# those same ratings -- confirm this is intended rather than a train-derived
# similarity.
# User x movie rating matrix from the held-out ratings; unrated cells become 0.
test_user_features = test_data.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
# Pairwise cosine similarity between the user rows.
test_user_similarity = cosine_similarity(test_user_features)
# Replace any NaN similarity with 0.
test_user_similarity[np.isnan(test_user_similarity)] = 0

In [None]:
print(test_user_similarity)

[[1.         0.         0.         ... 0.04637102 0.         0.03751829]
 [0.         1.         0.         ... 0.         0.         0.01724007]
 [0.         0.         1.         ... 0.00519147 0.         0.00415691]
 ...
 [0.04637102 0.         0.00519147 ... 1.         0.02129767 0.02517411]
 [0.         0.         0.         ... 0.02129767 1.         0.        ]
 [0.03751829 0.01724007 0.00415691 ... 0.02517411 0.         1.        ]]


In [None]:
print(test_user_similarity.shape)


(610, 610)


In [None]:
print(test_user_features.shape)

(610, 5174)


In [None]:
user_predicted_ratings_test = np.dot(test_user_similarity, test_user_features)
user_predicted_ratings_test

array([[ 6.87862294,  1.6285706 ,  1.6542852 , ...,  0.        ,
         0.        ,  0.        ],
       [ 2.47464845,  0.33348657,  0.19169513, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.67992257,  0.25725776,  0.06058216, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [12.12760442,  2.11360291,  1.75897911, ...,  0.05818081,
         0.        ,  0.        ],
       [ 3.41872279,  3.0520592 ,  1.3265222 , ...,  0.        ,
         0.        ,  0.        ],
       [ 6.57980182,  2.48890894,  1.32380927, ...,  0.03194502,
         0.04023389,  0.04598159]])

In [None]:
print(user_predicted_ratings_test.shape)

(610, 5174)


Calculation of RMSE and MAE:

In [None]:
# Indicator matrix over the held-out data: 1 where the user has a positive
# rating for the movie, 0 everywhere else (including unrated cells).
temp_test = (
    test_data
    .assign(rating=(test_data['rating'] > 0).astype(int))
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
temp_test.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,185585,187031,187593,188797,188833,190215,190219,190221,193565,193581
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
test_user_final_rating = np.multiply(user_predicted_ratings_test, temp_test)
test_user_final_rating.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,185585,187031,187593,188797,188833,190215,190219,190221,193565,193581
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
train_data['rating'].describe()

count    80419.000000
mean         3.501741
std          1.042938
min          0.500000
25%          3.000000
50%          3.500000
75%          4.000000
max          5.000000
Name: rating, dtype: float64

In [None]:
# Blank out (NaN) every non-positive score so only real predictions remain,
# then fit a scaler that maps them onto the 0.5-5 rating range.
# NOTE: MinMaxScaler rescales each column (movie) independently.
X = test_user_final_rating.where(test_user_final_rating > 0)
scaler = MinMaxScaler(feature_range=(0.5, 5))
scaler.fit(X)

MinMaxScaler(feature_range=(0.5, 5))

In [None]:
pred = scaler.transform(X)

In [None]:
print(pred)

[[       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 ...
 [1.16489033        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]
 [       nan        nan        nan ...        nan        nan        nan]]


In [None]:
total_non_nan = np.count_nonzero(~np.isnan(pred))
total_non_nan

20417

In [None]:
test = test_data.pivot(index = 'userId', columns = 'movieId', values = 'rating')
test.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,185585,187031,187593,188797,188833,190215,190219,190221,193565,193581
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Root mean squared error between held-out ratings and scaled predictions.
# Cells where either side is NaN produce NaN and are skipped by DataFrame.sum().
diff_sqr_matrix = np.square(test - pred)
sum_of_squares_err = diff_sqr_matrix.sum().sum()

# Average over the number of non-NaN predictions, then take the square root.
rmse = np.sqrt(sum_of_squares_err / total_non_nan)
print(rmse)

1.673383734160549


In [None]:
# Mean absolute error over the non-NaN predictions
# (NaN cells are skipped by DataFrame.sum()).
mae = (pred - test).abs().sum().sum() / total_non_nan
print(mae)

1.2980477984442642


In [None]:
# Inverse indicator over the training data: 0 where the user already has a
# positive rating for the movie, 1 everywhere else (including unrated cells).
temp_train = (
    train_data
    .assign(rating=(~(train_data['rating'] > 0)).astype(int))
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(1)
)
temp_train.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190213,191005,193567,193571,193573,193579,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# User x movie rating matrix from the training ratings; unrated cells become 0.
train_user_features = train_data.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
# Pairwise cosine similarity between the user rows.
train_user_similarity = cosine_similarity(train_user_features)
# Replace any NaN similarity with 0.
train_user_similarity[np.isnan(train_user_similarity)] = 0

In [None]:
print(train_user_similarity)

[[1.         0.01709927 0.06731982 ... 0.21805126 0.0796254  0.08487675]
 [0.01709927 1.         0.         ... 0.0451059  0.03529496 0.08278886]
 [0.06731982 0.         1.         ... 0.01635551 0.         0.02768623]
 ...
 [0.21805126 0.0451059  0.01635551 ... 1.         0.09452169 0.26058112]
 [0.0796254  0.03529496 0.         ... 0.09452169 1.         0.05086331]
 [0.08487675 0.08278886 0.02768623 ... 0.26058112 0.05086331 1.        ]]


In [None]:
user_predicted_ratings_train = np.dot(train_user_similarity, train_user_features)
user_predicted_ratings_train

array([[9.05476986e+01, 4.62131594e+01, 2.15596558e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.25757830e-01],
       [2.81013419e+01, 1.41397818e+01, 3.11507084e+00, ...,
        3.60625691e-01, 3.60625691e-01, 5.45427516e-01],
       [6.31668870e+00, 3.52782538e+00, 1.86522987e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.21668848e+02, 7.05076450e+01, 2.58351878e+01, ...,
        7.14726293e-02, 7.14726293e-02, 8.03482581e-01],
       [7.77046886e+01, 3.93074248e+01, 1.43768324e+01, ...,
        0.00000000e+00, 0.00000000e+00, 1.05520055e-01],
       [9.04573352e+01, 4.75393229e+01, 1.30666232e+01, ...,
        4.10483814e-01, 4.10483814e-01, 9.37717974e-01]])

In [None]:
train_user_final_rating = np.multiply(user_predicted_ratings_train, temp_train)
train_user_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190213,191005,193567,193571,193573,193579,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,46.213159,0.0,1.281519,12.358468,0.0,16.657545,3.109301,4.070678,57.030939,...,0.055448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125758
2,28.101342,14.139782,3.115071,0.258876,3.343275,13.465313,3.479974,0.769832,0.783579,13.537865,...,0.050164,0.463662,0.309108,0.412144,0.412144,0.360626,0.360626,0.360626,0.360626,0.545428
3,6.316689,3.527825,1.86523,0.023433,0.684632,4.843206,1.059532,0.187568,0.429189,4.134524,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,62.0403,30.153723,11.606954,0.674316,8.285952,34.29379,13.656827,1.801886,2.084321,36.359902,...,0.032245,0.054217,0.036145,0.048193,0.048193,0.042169,0.042169,0.042169,0.042169,0.191793
5,0.0,47.182982,16.263206,2.472238,16.698745,44.480448,23.050877,4.178489,3.837251,61.313889,...,0.15913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23774


In [None]:
# Blank out (NaN) every non-positive score, then fit a scaler that maps the
# remaining scores onto the 0.5-5 rating range.
# NOTE: MinMaxScaler rescales each column (movie) independently.
X = train_user_final_rating.where(train_user_final_rating > 0)
scaler = MinMaxScaler(feature_range=(0.5, 5))
scaler.fit(X)

MinMaxScaler(feature_range=(0.5, 5))

In [None]:
pred = scaler.transform(X)

In [None]:
pred

array([[       nan, 3.31195362,        nan, ...,        nan,        nan,
        0.9326694 ],
       [1.38949021, 1.24797065, 0.9434369 , ..., 2.79861442, 2.79861442,
        2.46227828],
       [0.63665121, 0.56507103, 0.73243233, ...,        nan,        nan,
               nan],
       ...,
       [4.62301734,        nan,        nan, ..., 0.9280401 , 0.9280401 ,
        3.40283541],
       [       nan, 2.86755643, 2.84470527, ...,        nan,        nan,
        0.85890691],
       [       nan, 3.39729477, 2.62350904, ..., 3.12115406, 3.12115406,
        3.89209555]])

In [None]:
total_non_nan = np.count_nonzero(~np.isnan(pred))
total_non_nan

5302554

In [None]:
pred.shape

(610, 8957)

In [None]:
movie_id = list(train_user_final_rating.columns)

In [None]:
user_list = []

In [None]:
# For every user row of `pred`, collect {movieId: predicted score} for the
# movies that actually have a prediction (non-NaN), preserving column order.
for user_scores in tqdm(pred):
    user_list.append(
        defaultdict(
            float,
            (
                (mid, score)
                for mid, score in zip(movie_id, user_scores)
                if not np.isnan(score)
            ),
        )
    )

  0%|          | 0/610 [00:00<?, ?it/s]

In [None]:
predictions = []
pred_ratings = []

In [None]:
# Rank each user's predicted scores and keep the 10 best movies.
# A dedicated loop variable is used on purpose: the previous version rebound
# the global `pred` (the scaled prediction matrix) to a per-user dict, so the
# cells that read `pred` could not be re-run afterwards.
for user_scores in tqdm(user_list):
    # sorted() is stable, so equal scores keep their original column order.
    top_ranked = sorted(user_scores.items(), key=lambda item: item[1], reverse=True)[:10]
    predictions.append([movie for movie, _ in top_ranked])
    pred_ratings.append([score for _, score in top_ranked])

  0%|          | 0/610 [00:00<?, ?it/s]

In [None]:
true_set = []
true_set_rating = []

In [None]:
users = list(test_df['userId'].unique())

In [None]:
# Ground truth per user: the (up to) 10 highest-rated held-out movies and
# their ratings, in descending rating order.
for uid in users:
    user_test = test_df[test_df['userId'] == uid].sort_values(by='rating', ascending=False)
    true_set.append(user_test['movieId'].head(10).tolist())
    true_set_rating.append(user_test['rating'].head(10).tolist())

In [None]:
# Precision & recall
# A hit is a recommended movie that is also in the user's top-10 held-out set.
# Precision divides the hits by the 10 recommendation slots; recall divides by
# the user's total number of held-out ratings.
# NOTE(review): this assumes predictions[i], true_set[i] and users[i] all refer
# to the same user -- verify the ordering of `users` against the pivot index.
precision = []
recall = []
for i in range(len(predictions)):
    relevant = set(true_set[i])
    count = sum(1 for mid in predictions[i] if mid in relevant)
    precision.append(count/10)
    # Only the row count is needed, so no per-user filter-and-sort is done.
    den = int((test_df['userId'] == users[i]).sum())
    # A user without held-out ratings contributes 0 instead of dividing by zero.
    recall.append(count/den if den else 0.0)

In [None]:
print('The precision of the user based collaborative filter is:', sum(precision)/len(precision))

The precision of the user based collaborative filter is: 0.026557377049180302


In [None]:
print('The recall of the user based collaborative filter is:', sum(recall)/len(recall))

The recall of the user based collaborative filter is: 0.027731599785739944


In [None]:
# F1 score: 2 * precision * recall / (precision + recall)
p = sum(precision)/len(precision)
r = sum(recall)/len(recall)
# F1 is defined as 0 when both precision and recall are 0 (avoids 0/0).
f1 = 2*p*r/(p+r) if (p+r) > 0 else 0.0

In [None]:
print('The F1 score of the user based is:', f1)

The F1 score of the user based is: 0.027131789715850344


In [None]:
# NDCG@10
# DCG must accumulate the TRUE relevance (held-out rating) of each recommended
# movie at its rank; a recommended movie the user did not rate in the test set
# has relevance 0. The previous version summed the model's own predicted
# scores, which measures nothing about ranking quality.
# IDCG is the DCG of the ideal ordering: the user's best held-out ratings.
# NOTE(review): this assumes predictions[i], true_set_rating[i] and users[i]
# all refer to the same user -- verify the ordering of `users`.
ndcg = []
for i in tqdm(range(len(predictions))):
    user_test = test_df[test_df['userId'] == users[i]]
    true_rating = dict(zip(user_test['movieId'], user_test['rating']))
    dcg_i = sum(
        true_rating.get(mid, 0.0)/math.log2(rank+2)
        for rank, mid in enumerate(predictions[i])
    )
    idcg_i = sum(
        rating/math.log2(rank+2)
        for rank, rating in enumerate(true_set_rating[i])
    )
    # A user with no held-out ratings has no ideal ranking; score them 0.
    ndcg.append(dcg_i/idcg_i if idcg_i > 0 else 0.0)

  0%|          | 0/610 [00:00<?, ?it/s]

In [None]:
print('The ndcg score of the user based is:', sum(ndcg)/len(ndcg))

The ndcg score of the user based is: 0.9930488311907384


User to User Collaborative Filtering - 2

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
train_df['title'].value_counts().describe()

count    8961.000000
mean        8.974333
std        18.454983
min         1.000000
25%         1.000000
50%         2.000000
75%         8.000000
max       265.000000
Name: title, dtype: float64

In [None]:
train_df['count'] = 1

In [None]:
train_df

Unnamed: 0,userId,movieId,rating,timestamp,title,count
0,1,3,4.0,964981247,Grumpier Old Men (1995),1
1,1,1580,3.0,964981125,Men in Black (a.k.a. MIB) (1997),1
2,1,1270,5.0,964983705,Back to the Future (1985),1
3,1,1275,5.0,964982290,Highlander (1986),1
4,1,1208,4.0,964983250,Apocalypse Now (1979),1
...,...,...,...,...,...,...
80414,610,57669,5.0,1493845166,In Bruges (2008),1
80415,610,3717,2.5,1493845921,Gone in 60 Seconds (2000),1
80416,610,66509,3.5,1479542766,Funny People (2009),1
80417,610,96832,5.0,1479545188,Holy Motors (2012),1


In [None]:
# Per-title popularity summary: number of training ratings ('count') and mean
# rating, ordered from the most-rated title downwards.
movie_count_rating = (
    train_df
    .groupby('title')[['count', 'rating']]
    .agg({'count': 'sum', 'rating': 'mean'})
    .sort_values(by='count', ascending=False)
)
movie_count_rating

Unnamed: 0_level_0,count,rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),265,4.149057
"Shawshank Redemption, The (1994)",254,4.431102
Pulp Fiction (1994),248,4.219758
"Silence of the Lambs, The (1991)",230,4.176087
"Matrix, The (1999)",213,4.197183
...,...,...
Reality (2014),1,4.000000
"Garden of Words, The (Koto no ha no niwa) (2013)",1,5.000000
Garam Masala (2005),1,4.500000
Garage (2007),1,3.500000


In [None]:
movie_data = train_df.drop(['count'], axis = 'columns').pivot_table(index = 'userId', columns = 'movieId', values = 'rating')
movie_data

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,190221,191005,193565,193567,193571,193573,193581,193583,193585
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,,,,,,,,,,4.0,...,,,,,,,,,,


In [None]:
n_movie = movie_data.shape[1]
n_user = movie_data.shape[0]

print('The number of movies:', n_movie)
print('The number of users:', n_user)

The number of movies: 8965
The number of users: 610


In [None]:
# Define Correlation
def calculate_correlation(c1, c2):
    """Correlation-style similarity between two rating vectors with gaps.

    Each vector is centred on the mean of its own observed (non-NaN) entries;
    missing entries then contribute 0 to every dot product. The result is the
    cosine of the two centred, zero-filled vectors.

    Parameters
    ----------
    c1, c2 : array-like of float (e.g. pandas Series or numpy array)
        Equal-length rating vectors; NaN marks a missing rating.

    Returns
    -------
    float
        Similarity in [-1, 1]. Returns 0.0 when either vector has no variance
        or no observed entries, so callers never receive NaN (a NaN would make
        any later sort on the similarities ill-defined).
    """
    def _centered_filled(values):
        # Centre on the mean of the observed entries, then zero-fill the gaps.
        arr = np.asarray(values, dtype=float)
        observed = ~np.isnan(arr)
        if not observed.any():
            return np.zeros_like(arr)
        return np.where(observed, arr - arr[observed].mean(), 0.0)

    c1_centered = _centered_filled(c1)
    c2_centered = _centered_filled(c2)
    numerator = np.dot(c1_centered, c2_centered)
    denominator = np.sqrt(np.dot(c1_centered, c1_centered) * np.dot(c2_centered, c2_centered))
    if denominator == 0:
        # Constant or empty vector: correlation is undefined, treat as "no similarity".
        return 0.0
    return numerator / denominator

In [None]:
# Define Euclidean distance
def calculate_euclidean_dist(c1, c2):
    """Negated Euclidean distance between two rating vectors.

    Missing ratings (NaN) are treated as 0 before the distance is taken. The
    sign is flipped so that a larger value means "more similar".
    """
    left = np.where(np.isnan(c1), 0, c1)
    right = np.where(np.isnan(c2), 0, c2)
    gap = left - right
    return -np.sqrt((gap * gap).sum())

In [None]:
# Define Cosine similarity
def calculate_cosine_similarity(c1, c2):
    """Cosine similarity between two rating vectors, treating NaN as 0.

    Parameters
    ----------
    c1, c2 : array-like of float (e.g. pandas Series or numpy array)
        Equal-length rating vectors; NaN marks a missing rating.

    Returns
    -------
    float
        Cosine of the angle between the zero-filled vectors. Returns 0.0 when
        either vector is all zeros/NaN, instead of the NaN that 0/0 would give.
    """
    # Zero-fill once and reuse the arrays for all three dot products.
    c1_filled = np.where(np.isnan(c1), 0, c1)
    c2_filled = np.where(np.isnan(c2), 0, c2)
    numerator = np.dot(c1_filled, c2_filled)
    denominator = np.sqrt(np.dot(c1_filled, c1_filled) * np.dot(c2_filled, c2_filled))
    if denominator == 0:
        # A zero-norm vector has no direction; report "no similarity".
        return 0.0
    return numerator / denominator

In [None]:
# Registry mapping a similarity-method name to the function implementing it.
function_dict = {
    'corr': calculate_correlation,
    'euclidean': calculate_euclidean_dist,
    'cosine': calculate_cosine_similarity,
}

In [None]:
list_userId = movie_data.index

In [None]:
movies = pd.read_csv('movies.csv')

In [None]:
test_df

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,2797,4.0,964981710,Big (1988)
1,1,736,3.0,964982653,Twister (1996)
2,1,3053,5.0,964984086,"Messenger: The Story of Joan of Arc, The (1999)"
3,1,2858,5.0,964980868,American Beauty (1999)
4,1,1265,4.0,964983599,Groundhog Day (1993)
...,...,...,...,...,...
20412,610,85179,4.0,1493850183,Summer Wars (Samâ wôzu) (2009)
20413,610,36931,3.5,1479542370,New Police Story (Xin jing cha gu shi) (2004)
20414,610,5136,3.5,1493848655,Wendigo (2001)
20415,610,8526,3.0,1479542323,Around the World in 80 Days (2004)


In [None]:
# Ground truth per user: titles of the (up to) 10 highest-rated held-out
# movies, listed in the order users first appear in test_df.
users = list(test_df['userId'].unique())
true_set = [
    list(
        test_df[test_df['userId'] == uid]
        .sort_values(by='rating', ascending=False)['title']
    )[:10]
    for uid in users
]

In [None]:
def recommend_movie_by_similar_user(user, method, movies_data = movies, n_recommend = 1, accept_rating = 4.5):
    """Recommend titles liked by the users most similar to ``user`` and score them.

    Neighbours are visited from most to least similar. From each neighbour,
    movies rated at least ``accept_rating`` that ``user`` has not rated in the
    global ``movie_data`` matrix are collected (highest rating first) until
    ``n_recommend`` distinct movies are found or the neighbours run out.

    Parameters
    ----------
    user : label of the target user in the index of the global ``movie_data``.
    method : key of the global ``function_dict`` selecting the similarity
        function ('corr', 'euclidean' or 'cosine').
    movies_data : DataFrame with 'movieId' and 'title' columns used to turn
        movie ids into titles.
    n_recommend : maximum number of titles to recommend.
    accept_rating : minimum neighbour rating for a movie to be recommended.

    Returns
    -------
    (precision, recall) : two lists with one entry per recommended title.
        An entry is ``hit / n_recommend`` (precision) or
        ``hit / number of held-out ratings of user`` (recall), where ``hit`` is
        1 when the title is in the user's entry of the global ``true_set``.
        Summing each list gives the user's precision and recall.
    """
    target_ratings = movie_data.loc[user, :]
    similarity_fn = function_dict[method]

    # Similarity of every OTHER user to the target. The target is excluded
    # explicitly rather than by assuming it sorts first, and NaN scores are
    # pushed to the bottom so the ranking below is well-defined.
    rating_simility = {}
    for userId in list_userId:
        if userId == user:
            continue
        score = similarity_fn(target_ratings, movie_data.loc[userId, :])
        rating_simility[userId] = float('-inf') if np.isnan(score) else score

    # Rank once; the ordering does not change while recommendations are gathered.
    ranked_userIds = sorted(rating_simility, key = rating_simility.get, reverse = True)

    not_yet_rated = target_ratings.isna()
    recommend_movieId = []
    seen_movieId = set()
    for similar_userId in ranked_userIds:
        if len(recommend_movieId) >= n_recommend:
            break
        similar_ratings = movie_data.loc[similar_userId, :]
        # Movies this neighbour liked that the target has not rated yet.
        liked = similar_ratings[(similar_ratings >= accept_rating) & not_yet_rated]
        # Best-rated tier first; within a tier keep the column (movieId) order.
        for rate_score in sorted(liked.unique(), reverse = True):
            for movieid in liked.index[liked == rate_score]:
                # The same movie may be liked by several neighbours; keep it once.
                if movieid not in seen_movieId:
                    seen_movieId.add(movieid)
                    recommend_movieId.append(movieid)

    titles_by_movieId = movies_data.set_index(['movieId'])['title']
    predictions = list(titles_by_movieId.loc[recommend_movieId[:n_recommend]])

    # Precision & recall for THIS user.
    # Look the user up by position in `users` (the order `true_set` was built
    # in) instead of assuming user ids are the contiguous range 1..N.
    yt = set(true_set[users.index(user)]) if user in users else set()
    # Recall denominator: how many held-out ratings the target user has.
    n_test_ratings = int((test_df['userId'] == user).sum())

    precision = []
    recall = []
    for yp in predictions:
        hit = 1 if yp in yt else 0
        precision.append(hit / n_recommend)
        recall.append(hit / n_test_ratings if n_test_ratings else 0.0)

    return precision, recall

In [None]:
# Evaluate the cosine-similarity recommender: one precision and one recall
# value per user, each the sum of that user's per-title contributions.
u_p, u_r = [], []
for uid in tqdm(users):
    p, r = recommend_movie_by_similar_user(user=uid, method='cosine', n_recommend=10)
    p, r = sum(p), sum(r)
    u_p.append(p)
    u_r.append(r)

  0%|          | 0/610 [00:00<?, ?it/s]

In [None]:
print('Precision of this method is', sum(u_p)/(len(u_p)))

Precision of this method is 0.08459016393442659


In [None]:
print('Recall of this method is', sum(u_r)/(len(u_r)))

Recall of this method is 0.06300082388454525


In [None]:
# F1 = 2 * precision * recall / (precision + recall), from the mean per-user values.
p = sum(u_p)/len(u_p)
r = sum(u_r)/len(u_r)
# F1 is defined as 0 when both precision and recall are 0 (avoids 0/0).
f1 = 2*p*r / (p+r) if (p+r) > 0 else 0.0

In [None]:
print('F1 score of this method is', f1)

F1 score of this method is 0.07221646930006631


In [None]:
# Evaluate the correlation-based recommender: one precision and one recall
# value per user, each the sum of that user's per-title contributions.
u_p, u_r = [], []
for uid in tqdm(users):
    p, r = recommend_movie_by_similar_user(user=uid, method='corr', n_recommend=10)
    p, r = sum(p), sum(r)
    u_p.append(p)
    u_r.append(r)

  0%|          | 0/610 [00:00<?, ?it/s]

  if __name__ == '__main__':


In [None]:
print('Precision of this method is', sum(u_p)/(len(u_p)))

Precision of this method is 0.06442622950819693


In [None]:
print('Recall of this method is', sum(u_r)/(len(u_r)))

Recall of this method is 0.04707666866352467


In [None]:
# F1 = 2 * precision * recall / (precision + recall), from the mean per-user values.
p = sum(u_p)/len(u_p)
r = sum(u_r)/len(u_r)
# F1 is defined as 0 when both precision and recall are 0 (avoids 0/0).
f1 = 2*p*r / (p+r) if (p+r) > 0 else 0.0

In [None]:
print('F1 score of this method is', f1)

F1 score of this method is 0.05440167582239184


In [None]:
# Evaluate the Euclidean-distance recommender: one precision and one recall
# value per user, each the sum of that user's per-title contributions.
u_p, u_r = [], []
for uid in tqdm(users):
    p, r = recommend_movie_by_similar_user(user=uid, method='euclidean', n_recommend=10)
    p, r = sum(p), sum(r)
    u_p.append(p)
    u_r.append(r)

  0%|          | 0/610 [00:00<?, ?it/s]

In [None]:
print('Precision of this method is', sum(u_p)/(len(u_p)))

Precision of this method is 0.09278688524590195


In [None]:
print('Recall of this method is', sum(u_r)/(len(u_r)))

Recall of this method is 0.06747288914989788


In [None]:
# F1 = 2 * precision * recall / (precision + recall), from the mean per-user values.
p = sum(u_p)/len(u_p)
r = sum(u_r)/len(u_r)
# F1 is defined as 0 when both precision and recall are 0 (avoids 0/0).
f1 = 2*p*r / (p+r) if (p+r) > 0 else 0.0

In [None]:
print('F1 score of this method is', f1)

F1 score of this method is 0.0781306381637477
