In [1]:
import numpy as np
import pandas as pd

In [2]:
complete_df = pd.read_csv(r'complete_dataset.csv')
complete_df = complete_df.drop(columns='Unnamed: 0',axis=1)
complete_df

Unnamed: 0,userId,movieId,rating,original_title,nltk_rating,diff,leniency_score_for_user
0,1,110,1.0,Trois couleurs : Rouge,4.571285,-3.571285,0.791561
1,1,147,4.5,Les Quatre Cents Coups,3.709128,0.790872,0.791561
2,1,858,5.0,Sleepless in Seattle,4.667067,0.332933,0.791561
3,1,1246,5.0,Rocky Balboa,4.266597,0.733403,0.791561
4,1,1968,4.0,Fools Rush In,4.756418,-0.756418,0.791561
...,...,...,...,...,...,...,...
5001,1219,54503,5.0,Shuang ma lian huan,2.425867,2.574133,0.620946
5002,1219,58559,1.0,Confession of a Child of the Century,2.535798,-1.535797,0.620946
5003,1219,260,1.0,The 39 Steps,4.341973,-3.341973,0.620946
5004,1219,780,1.0,La passion de Jeanne d'Arc,4.034362,-3.034362,0.620946


In [3]:
ratings_matrix = complete_df.pivot_table(index='userId',columns='movieId',values='rating').fillna(0)
ratings_matrix

movieId,5,25,58,64,79,110,141,147,223,260,...,1552,1968,2762,2959,3101,4226,4474,54503,58559,96821
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.5,0.0,0.0,...,0.0,4.0,4.5,4.0,0.0,4.0,0.0,3.5,4.0,5.0
2,3.0,3.0,3.0,4.0,4.0,0.0,3.0,0.0,0.0,4.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0
1214,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
1216,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Test Method 1: Caculating precision,recall,accuracy,mean squared error and f1 score by randomly masking indices of rated movies and predicting those ratings based on user's cosine similarity with other users and comparing the predicted ratings with the actual one's:

## -> Masking 3 indices (ratings being assigned 0 value) per user: 

In [4]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# creating a masked matrix.Corresponding to every user 3 movie ratings will be masked and there updated values will be stored 
# in the masked matrix.
mask_matrix = ratings_matrix.copy()

# will hold the indices(y labels)of the masked ratings corresponding to every row(userId).
masked_indices_dict = {}

# loop over each user:
for user in ratings_matrix.index:
    
    # get the indices of non-zero ratings for the user
    non_zero_indices = np.where(ratings_matrix.loc[user] != 0)[0]
    
    non_zero_indices_labels = ratings_matrix.columns[non_zero_indices]
    
    # randomly choose the indices to mask
    if len(non_zero_indices) >= 3:
        masked_indices = np.random.choice(non_zero_indices, size=3, replace=False)
    else:
        masked_indices = np.concatenate((non_zero_indices, np.random.choice(np.where(ratings_matrix.loc[user] == 0)[0], size=3-len(non_zero_indices), replace=False)))
    
    # set the masked ratings to 0
    mask_matrix.loc[user, ratings_matrix.columns[masked_indices]] = 0
    masked_indices_dict[user]=ratings_matrix.columns[masked_indices]


In [5]:
# Let's see how our mask matrix looks like:
mask_matrix

movieId,5,25,58,64,79,110,141,147,223,260,...,1552,1968,2762,2959,3101,4226,4474,54503,58559,96821
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,4.0,0.0,4.0,0.0,3.5,0.0,5.0
2,3.0,3.0,3.0,4.0,4.0,0.0,0.0,0.0,0.0,4.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1214,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
1216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## -> Our new training set is mask matrix . Now we'll be computing the cosine similarity b/w each pair of user but based on the mask matrix ratings:

In [6]:
# Compute the cosine similarity matrix between all pairs of users in the training set
cosine_sim_matrix = cosine_similarity(mask_matrix)
cosine_sim_matrix_df = pd.DataFrame(index=complete_df['userId'].unique(),columns=complete_df['userId'].unique())
cosine_sim_matrix_df.values[:,:] = cosine_sim_matrix
cosine_sim_matrix_df

Unnamed: 0,1,2,3,4,5,7,8,9,11,12,...,1207,1208,1209,1211,1212,1213,1214,1215,1216,1219
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132666,0.233686,...,0.558183,0.085671,0.0,0.0,0.0,0.04926,0.0,0.342682,0.0,0.22111
2,0.0,1.0,0.0,0.176604,0.0,0.0,0.0,0.0,0.105263,0.166875,...,0.0,0.0,0.0,0.0,0.502412,0.273594,0.152944,0.0,0.0,0.035088
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.610713,0.318304,...,0.0,0.0,0.0,0.0,0.0,0.167742,0.218797,0.0,0.0,0.334637
4,0.0,0.176604,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.288675,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.04926,0.273594,0.167742,0.0,0.0,0.0,0.0,0.0,0.560624,0.278829,...,0.0,0.574989,0.0,0.0,0.451058,1.0,0.191663,0.0,0.0,0.388406
1214,0.0,0.152944,0.218797,0.288675,0.0,0.0,0.0,0.0,0.200739,0.0,...,0.0,0.0,0.0,0.0,0.0,0.191663,1.0,0.0,0.0,0.0
1215,0.342682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28677,0.0,...,0.552158,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## -> Now that we have got the movieId of the ratings that have been masked,we'll have to calculate the leniency score for each user again. Why? Leniency Score for each user is basically the combined summation of the differences b/w the rating user has given to a movie and the rating that has been assigned to the movie via nlp(analyzing movie reviews). Note that now our new training matrix is the mask_matrix and no more the rating_matrix and so in the mask_matrix, rating corresponding to the masked_indices is 0 i.e it has been assumed that the user has not seen the movie and thus in the nltk_ratings also,since the user has not seen those movies ratings against those movie for that user should be 0.Since ratings in nltk_ratings will get modified too the final difference b/w (mask_matrix - nltk_ratings) will also get modified and thus the leniency score will also get modified.

In [7]:
# Constructing the nltk_ratings_matrix:

nltk_ratings_matrix = pd.pivot_table(complete_df, values='nltk_rating', index='userId', columns='movieId').fillna(0)
nltk_ratings_matrix

movieId,5,25,58,64,79,110,141,147,223,260,...,1552,1968,2762,2959,3101,4226,4474,54503,58559,96821
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.00000,0.000000,0.000000,0.000000,0.00000,4.571285,0.000000,3.709128,0.000000,0.000000,...,0.00000,4.756418,3.598082,3.616755,0.00000,2.65913,0.000000,2.425867,2.535798,3.870065
2,3.64357,2.335647,4.063033,3.930747,2.87473,0.000000,3.810897,0.000000,0.000000,4.341973,...,3.91699,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000
3,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,4.756418,0.000000,0.000000,3.51121,0.00000,3.838612,0.000000,0.000000,0.000000
4,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,3.461978,0.000000,...,0.00000,0.000000,3.598082,0.000000,0.00000,2.65913,0.000000,0.000000,0.000000,0.000000
5,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.00000,0.000000,0.000000,0.000000,0.00000,4.571285,0.000000,0.000000,0.000000,4.341973,...,0.00000,0.000000,0.000000,3.616755,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000
1214,3.64357,0.000000,0.000000,3.930747,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000
1215,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,4.341973,...,0.00000,0.000000,0.000000,0.000000,0.00000,2.65913,0.000000,0.000000,0.000000,0.000000
1216,0.00000,0.000000,0.000000,0.000000,0.00000,4.571285,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000


In [8]:
# So first we'll be creating the nltk_masked_ratings_matrix which will be a modified version of nltk_ratings matrix where
# values corresponding to masked indices will be 0:

nltk_ratings_masked_matrix = nltk_ratings_matrix

for user,masked_indices in masked_indices_dict.items():
    nltk_ratings_masked_matrix.loc[user,masked_indices]=0
    
nltk_ratings_masked_matrix

movieId,5,25,58,64,79,110,141,147,223,260,...,1552,1968,2762,2959,3101,4226,4474,54503,58559,96821
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.00000,0.000000,0.000000,0.000000,0.00000,4.571285,0.0,0.0,0.0,0.000000,...,0.00000,4.756418,0.0,3.616755,0.00000,2.65913,0.000000,2.425867,0.0,3.870065
2,3.64357,2.335647,4.063033,3.930747,2.87473,0.000000,0.0,0.0,0.0,4.341973,...,3.91699,0.000000,0.0,0.000000,0.00000,0.00000,0.000000,0.000000,0.0,0.000000
3,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,3.51121,0.00000,3.838612,0.000000,0.0,0.000000
4,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,0.00000,0.00000,0.000000,0.000000,0.0,0.000000
5,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,0.00000,0.00000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.00000,0.000000,0.000000,0.000000,0.00000,4.571285,0.0,0.0,0.0,4.341973,...,0.00000,0.000000,0.0,0.000000,0.00000,0.00000,0.000000,0.000000,0.0,0.000000
1214,0.00000,0.000000,0.000000,3.930747,0.00000,0.000000,0.0,0.0,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,0.00000,0.00000,0.000000,0.000000,0.0,0.000000
1215,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,0.00000,2.65913,0.000000,0.000000,0.0,0.000000
1216,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.000000,...,0.00000,0.000000,0.0,0.000000,0.00000,0.00000,0.000000,0.000000,0.0,0.000000


In [9]:
# Now that we have got the nltk_masked_ratings_matrix and mask_matrix, we can use these 2 to calculate the leniency score
# for each user which will be finally normalized on a scale of 0-1:

# Calculate sum of each row in masked_matrix and nltk_ratings_masked_matrix
masked_matrix_row_sums = mask_matrix.sum(axis=1)
nltk_ratings_masked_matrix_row_sums = nltk_ratings_masked_matrix.sum(axis=1)

# Calculate difference between the row sums:
row_sums_diff = masked_matrix_row_sums - nltk_ratings_masked_matrix_row_sums


# Normalize the row sums difference to range between 0 and 1
row_sums_diff_normalized = (row_sums_diff - np.min(row_sums_diff)) / (np.max(row_sums_diff) - np.min(row_sums_diff))
row_sums_diff_normalized[row_sums_diff == 0] = 0

# Create a new matrix to store leniency score
user_ids = complete_df['userId'].unique()

# This new matrix will be used to store leniency score of each user in all columns(all other users+this user)
new_matrix = np.zeros(((len(user_ids),len(user_ids))))

# Populate leniency score for each user
for i in range(mask_matrix.shape[0]):
    row_label = mask_matrix.index[i]
    leniency_score = row_sums_diff_normalized[row_label]
    new_matrix[i] = leniency_score

user_leniency_matrix = pd.DataFrame(data=new_matrix, index=user_ids, columns=user_ids)
user_leniency_matrix

Unnamed: 0,1,2,3,4,5,7,8,9,11,12,...,1207,1208,1209,1211,1212,1213,1214,1215,1216,1219
1,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,...,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596,0.737596
2,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,...,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848,0.485848
3,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,...,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753,0.596753
4,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,...,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131,0.761131
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,...,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569,0.641569
1214,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,...,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597,0.916597
1215,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,...,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433,0.743433
1216,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## -> Now comes the part of prediction. First, we'll be predicting ratings of movie which were masked based on solely cosine similarity:

In [10]:
# Use KNN to find top 10 similar users:
from sklearn.neighbors import NearestNeighbors

predicted_ratings_cosine = mask_matrix.copy()

k = 10

for user in ratings_matrix.index:

    masked_indices = masked_indices_dict[user]
    
    # loop over each masked index for the user
    for j in masked_indices:
        
        # get the indices of non-zero ratings for the movie
        non_zero_indices = np.nonzero(ratings_matrix.loc[:,j].values)[0]
        user_ids = ratings_matrix.index[non_zero_indices]
        
        # get similar users corresponding to every user
        sim_users = cosine_sim_matrix_df.loc[user, user_ids]
        
        # use k-NN to select k most similar users
        k_similar_users = sim_users.astype(float).nlargest(k+1).iloc[1:]
        
        # get their corresponding ratings for the movie
        rated_movies = ratings_matrix.loc[k_similar_users.index, j]
        
        # compute the weighted average of their ratings using cosine similarity as weights
        predicted_rating = np.dot(k_similar_users, rated_movies) / k_similar_users.sum()
        predicted_ratings_cosine.loc[user, j] = predicted_rating

predicted_ratings_cosine = predicted_ratings_cosine.fillna(0)

  predicted_rating = np.dot(k_similar_users, rated_movies) / k_similar_users.sum()


In [11]:
# Let's see what ratings did we finally predict:
predicted_ratings_cosine

movieId,5,25,58,64,79,110,141,147,223,260,...,1552,1968,2762,2959,3101,4226,4474,54503,58559,96821
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.0,0.0,0.0,0.0,1.0,0.000000,3.19496,0.00000,0.000000,...,0.0,4.000000,4.027859,4.000000,0.0,4.000000,0.0,3.5,3.561133,5.0
2,3.000000,3.0,3.0,4.0,4.0,0.0,2.790759,0.00000,0.00000,4.000000,...,2.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.00000,0.000000,...,0.0,3.882928,0.000000,0.000000,3.0,0.000000,3.0,0.0,0.000000,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,4.16097,0.000000,...,0.0,0.000000,4.196452,0.000000,0.0,4.082927,0.0,0.0,0.000000,0.0
5,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.000000,0.0,0.0,0.0,0.0,4.5,0.000000,0.00000,0.00000,4.000000,...,0.0,0.000000,0.000000,3.939948,0.0,0.000000,0.0,0.0,0.000000,0.0
1214,2.941386,0.0,0.0,5.0,0.0,0.0,0.000000,0.00000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0
1215,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.00000,3.580822,...,0.0,0.000000,0.000000,0.000000,0.0,3.500000,0.0,0.0,0.000000,0.0
1216,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.00000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0


## -> Testing accuracy,f1 score,recall,precision and mse by comparing predicted ratings with actual ratings(cosine similarity):

In [12]:
from sklearn.metrics import accuracy_score, f1_score, recall_score,mean_squared_error,precision_score


# convert dataframes to numpy arrays for easier computation
predicted = predicted_ratings_cosine.to_numpy()
actual = ratings_matrix.to_numpy()

# threshold the predicted values to binary (1 if >= 3, 0 otherwise)
predicted_binary = np.where(predicted >= 3, 1, 0)
actual_binary = np.where(actual >= 3,1,0)

# calculate metrics
accuracy = accuracy_score(actual_binary.flatten(), predicted_binary.flatten())
f1 = f1_score(actual_binary.flatten(), predicted_binary.flatten())
recall = recall_score(actual_binary.flatten(), predicted_binary.flatten())
precision = precision_score(actual_binary.flatten(),predicted_binary.flatten())
mse=mean_squared_error(predicted,actual)

print("Solely Cosine: ")
print()
print("Accuracy:", accuracy)
print("F1-score:", f1)
print("Recall:", recall)
print("Precision: ", precision)
print("Mean squared error: ",mse)

Solely Cosine: 

Accuracy: 0.9733051487241431
F1-score: 0.8922739458932489
Recall: 0.8292185730464326
Precision:  0.9657082563967291
Mean squared error:  0.3897643296166452


# Test Method 2: Caculating precision,recall,accuracy,mean squared error and f1 score by randomly masking indices of rated movies and predicting those ratings based on user's cosine similarity with other users + the lenieny score of user and comparing the predicted ratings with the actual one's:

## ->Since we have evaluated our metrics based on our first method of recommendation, wherein we give 100% weightage to cosine similarity to determine similarity b/w users, now we'll be evaluating the metrics based on our second method of recommendation,wherein we give 55% weightage to cosine similarity and 45% to leniency score of user(present in user_leniency_matrix) and will try to observe the differences:

In [13]:
# create a new matrix that will take into account the combined score of both cosine similarity and user leniency
combined_score_matrix_cosine_and_leniency = 0.55 * cosine_sim_matrix + 0.45 * user_leniency_matrix

# creating a dataframe out of the matrix with values equal to values of the matrix
combined_score_matrix_cosine_and_leniency_df = pd.DataFrame(index=complete_df['userId'].unique(),columns=complete_df['userId'].unique())
combined_score_matrix_cosine_and_leniency_df.values[:,:] = combined_score_matrix_cosine_and_leniency
combined_score_matrix_cosine_and_leniency_df

Unnamed: 0,1,2,3,4,5,7,8,9,11,12,...,1207,1208,1209,1211,1212,1213,1214,1215,1216,1219
1,0.881918,0.331918,0.331918,0.331918,0.331918,0.331918,0.331918,0.331918,0.404884,0.460445,...,0.638919,0.379037,0.331918,0.331918,0.331918,0.359011,0.331918,0.520393,0.331918,0.453528
2,0.218632,0.768632,0.218632,0.315764,0.218632,0.218632,0.218632,0.218632,0.276526,0.310413,...,0.218632,0.218632,0.218632,0.218632,0.494958,0.369108,0.302751,0.218632,0.218632,0.23793
3,0.268539,0.268539,0.818539,0.268539,0.268539,0.268539,0.268539,0.268539,0.604431,0.443606,...,0.268539,0.268539,0.268539,0.268539,0.268539,0.360797,0.388877,0.268539,0.268539,0.452589
4,0.342509,0.439641,0.342509,0.892509,0.342509,0.342509,0.342509,0.342509,0.342509,0.342509,...,0.342509,0.342509,0.342509,0.342509,0.342509,0.342509,0.50128,0.342509,0.342509,0.342509
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.315799,0.439183,0.380964,0.288706,0.288706,0.288706,0.288706,0.288706,0.597049,0.442062,...,0.288706,0.60495,0.288706,0.288706,0.536788,0.838706,0.394121,0.288706,0.288706,0.502329
1214,0.412469,0.496588,0.532807,0.57124,0.412469,0.412469,0.412469,0.412469,0.522875,0.412469,...,0.412469,0.412469,0.412469,0.412469,0.412469,0.517883,0.962469,0.412469,0.412469,0.412469
1215,0.52302,0.334545,0.334545,0.334545,0.334545,0.334545,0.334545,0.334545,0.492268,0.334545,...,0.638231,0.334545,0.334545,0.334545,0.334545,0.334545,0.334545,0.884545,0.334545,0.334545
1216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## -> Now comes the part of prediction. We'll be predicting ratings of movie which were masked based on cosine similarity+leniency score:

In [14]:

from sklearn.neighbors import NearestNeighbors

predicted_ratings_cosine_and_leniency = mask_matrix.copy()

k = 10

for user in ratings_matrix.index:
    
    masked_indices = masked_indices_dict[user]
    
    for j in masked_indices:
        
        non_zero_indices = np.nonzero(ratings_matrix.loc[:,j].values)[0]
        user_ids = ratings_matrix.index[non_zero_indices]
        
        # get similar users corresponding to every user based on combined score:
        sim_users = combined_score_matrix_cosine_and_leniency_df.loc[user, user_ids]
        
        # use k-NN to select k most similar users
        k_similar_users = sim_users.astype(float).nlargest(k+1).iloc[1:]
        
        # get their corresponding ratings for the movie
        rated_movies = ratings_matrix.loc[k_similar_users.index, j]
        
        # compute the weighted average of their ratings using cosine similarity as weights
        predicted_rating = np.dot(k_similar_users, rated_movies) / k_similar_users.sum()
        predicted_ratings_cosine_and_leniency.loc[user, j] = predicted_rating
        
predicted_ratings_cosine_and_leniency = predicted_ratings_cosine_and_leniency.fillna(0)

  predicted_rating = np.dot(k_similar_users, rated_movies) / k_similar_users.sum()


## -> Testing accuracy,f1 score,recall,precision and mse by comparing predicted ratings with actual ratings(cosine similarity+leniency score):

In [15]:
from sklearn.metrics import accuracy_score, f1_score, recall_score,mean_squared_error

# convert dataframes to numpy arrays for easier computation
predicted = predicted_ratings_cosine_and_leniency.to_numpy()
actual = ratings_matrix.to_numpy()

# threshold the predicted values to binary (1 if >= 3, 0 otherwise)
predicted_binary = np.where(predicted >= 3, 1, 0)
actual_binary = np.where(actual >= 3,1,0)

# calculate metrics
accuracy2 = accuracy_score(actual_binary.flatten(), predicted_binary.flatten())
f12 = f1_score(actual_binary.flatten(), predicted_binary.flatten())
recall2 = recall_score(actual_binary.flatten(), predicted_binary.flatten())
precision2 = precision_score(actual_binary.flatten(),predicted_binary.flatten())
mse2=mean_squared_error(predicted,actual)

print("Cosine Similarity + User Leniency Score: ")
print()

print("Accuracy:", accuracy2)
print("F1-score:", f12)
print("Recall:", recall2)
print("Precision: ", precision2)
print("Mean Squared Error:", mse2)

Cosine Similarity + User Leniency Score: 

Accuracy: 0.9735165332930696
F1-score: 0.8932180689151347
Recall: 0.8308040770101925
Precision:  0.965771458662454
Mean Squared Error: 0.38862526323554347
