In [121]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np

In [122]:
# Read ratings data and convert  it to pandas dataframe
ratings = pd.read_csv('./ml-latest-small/ratings.csv')

# Read movies data and convert  it to pandas dataframe
movies = pd.read_csv('./ml-latest-small/movies.csv')


In [123]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [124]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [125]:
# Read links data and convert  it to pandas dataframe
links = pd.read_csv('./ml-latest-small/links.csv')

In [126]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [127]:
#Merge dataset for data exploration and cleaning
data = pd.merge(ratings, movies, on='movieId', how='inner')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [128]:
# Compute the mean rating and number of ratings for each movie
mean_rating = data.groupby('title')['rating'].mean()
num_ratings = data.groupby('title')['rating'].count()

# Combine the results into a single dataframe
temp_df = pd.DataFrame({'mean_rating': mean_rating, 'number_of_ratings': num_ratings})

# Remove movies with less than 50 user ratings 
data_50 = temp_df[temp_df['number_of_ratings'] > 50]
data_50 = data_50.reset_index()

In [129]:
data_50.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              437 non-null    object 
 1   mean_rating        437 non-null    float64
 2   number_of_ratings  437 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 10.4+ KB


In [130]:
data_df = pd.merge(data, data_50[['title']], on='title',how='inner')

In [131]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40712 entries, 0 to 40711
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     40712 non-null  int64  
 1   movieId    40712 non-null  int64  
 2   rating     40712 non-null  float64
 3   timestamp  40712 non-null  int64  
 4   title      40712 non-null  object 
 5   genres     40712 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 2.2+ MB


In [132]:
#create user_item matrix
user_item_df = data.pivot_table(index='userId', columns='movieId', values='rating')

In [133]:
#Fill null values with 0
user_item_df = user_item_df.fillna(0)

In [134]:
user_item_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(user_item_df, test_size=0.2, random_state=42)

In [163]:
train_data.iloc[[0]]

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
test_data

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
82,2.5,3.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219,3.5,2.5,0.0,0.0,0.0,3.5,0.0,0.0,0.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599,3.0,2.5,1.5,0.0,0.0,4.5,2.5,0.0,1.5,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
529,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
def user_similarity(user1, user2):
    # Get the ratings of the two users
    u1_ratings = train_data.iloc[user1]
    u2_ratings = train_data.iloc[user2] 
    u1_ratings = u1_ratings.values.reshape(1, -1)
    u2_ratings = u2_ratings.values.reshape(1, -1)
    # Calculate Cosine similarity
    similarity = cosine_similarity(u1_ratings, u2_ratings)[0][0]
    return similarity

In [173]:
def recommend_movies_cosine(user, num_recommendations=10):
    # Compute the similarity between the target user and all other users in the training set
    cosine_similarities = train_data.apply(lambda x: user_similarity(user, x), axis=1)
    
    # Compute the Pearson correlation between all pairs of users
    #user_correlations = train_data.T.corr()
    
    # Select the top N users with highest similarity
    top_similar_users = cosine_similarities.nlargest(num_recommendations)
    
    # Retrieve the ratings of the top similar users 
    top_user_ratings = train_data.iloc[top_similar_users.index]
    
    # Compute the average rating of each movie among the top similar users
    average_movie_ratings = top_user_ratings.mean()
    
    # Retrieve the movies that the target user has not rated
    target_user_ratings = train_data.iloc[user]
    unrated_movies = target_user_ratings[target_user_ratings == 0].index
    
    # Select the top N movies with highest average rating among the top similar users
    recommended_movie_ratings = average_movie_ratings.loc[unrated_movies].nlargest(num_recommendations)
    
    # Retrieve the movie titles corresponding to the recommended movie IDs
    recommended_movie_titles = movies[movies['movieId'].isin(recommended_movie_ratings.index)]['title']
    
    return recommended_movie_titles


In [179]:
pearson_correlations = train_data.T.corr()

In [180]:
pearson_correlations

userId,24,583,288,323,133,175,339,178,159,138,...,331,215,467,122,21,72,107,271,436,103
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,1.000000,0.081787,0.139820,0.114812,0.164071,-0.004766,0.224745,0.188064,0.126613,-0.004605,...,0.217330,0.207790,0.036712,0.187027,0.217715,0.196028,0.071771,0.016211,0.054829,0.147142
583,0.081787,1.000000,0.066862,0.046711,0.095760,-0.003330,0.056222,0.053431,0.127744,-0.003218,...,0.068762,0.058132,0.039068,0.084366,0.099712,0.058770,0.058788,-0.004803,0.086358,0.107113
288,0.139820,0.066862,1.000000,0.124200,0.138148,-0.006505,0.182787,0.167286,0.113981,0.028953,...,0.146442,0.175942,0.049767,0.212480,0.170115,0.169579,0.108232,0.104076,0.121932,0.304699
323,0.114812,0.046711,0.124200,1.000000,0.323072,-0.004407,0.086980,0.215930,0.103569,-0.004258,...,0.076237,0.119819,0.049567,0.107328,0.102086,0.242569,0.267362,0.023376,0.321503,0.142540
133,0.164071,0.095760,0.138148,0.323072,1.000000,-0.002641,0.104949,0.341571,0.129283,-0.002551,...,0.066280,0.129708,0.089849,0.138534,0.068855,0.421111,0.253152,0.037376,0.339611,0.121886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0.196028,0.058770,0.169579,0.242569,0.421111,-0.003046,0.142923,0.377618,0.096879,0.010386,...,0.109692,0.203660,0.100235,0.158315,0.125548,1.000000,0.186078,0.098965,0.161498,0.182411
107,0.071771,0.058788,0.108232,0.267362,0.253152,-0.002621,0.032438,0.183130,0.110907,-0.002533,...,0.028151,0.033122,-0.002562,0.076427,0.020098,0.186078,1.000000,-0.003780,0.240374,0.115263
271,0.016211,-0.004803,0.104076,0.023376,0.037376,-0.002925,0.092412,0.044012,0.007577,0.013353,...,0.022313,0.035108,-0.002860,0.053021,0.003591,0.098965,-0.003780,1.000000,-0.006634,0.051560
436,0.054829,0.086358,0.121932,0.321503,0.339611,0.017265,0.046080,0.167782,0.115696,-0.004445,...,0.022097,0.039600,0.047410,0.052033,0.071839,0.161498,0.240374,-0.006634,1.000000,0.103900


In [174]:
def recommend_movies_pearson(user, num_recommendations=10):   
    # Compute the Pearson correlation between all pairs of users
    pearson_correlations = train_data.T.corr()
    
    # Select the top N users with highest similarity
    top_similar_users = pearson_correlations.nlargest(num_recommendations, '')
    
    # Retrieve the ratings of the top similar users 
    top_user_ratings = train_data.iloc[top_similar_users.index]
    
    # Compute the average rating of each movie among the top similar users
    average_movie_ratings = top_user_ratings.mean()
    
    # Retrieve the movies that the target user has not rated
    target_user_ratings = train_data.iloc[user]
    unrated_movies = target_user_ratings[target_user_ratings == 0].index
    
    # Select the top N movies with highest average rating among the top similar users
    recommended_movie_ratings = average_movie_ratings.loc[unrated_movies].nlargest(num_recommendations)
    
    # Retrieve the movie titles corresponding to the recommended movie IDs
    recommended_movie_titles = movies[movies['movieId'].isin(recommended_movie_ratings.index)]['title']
    
    return recommended_movie_titles


In [177]:
def calculate_mae(num_recommendations=10):
    users = test_data.index
    mae_c = 0
    mae_p = 0
    for user in users:
        # Get the recommended movies for the user and get actual ratings
        # recommended_movies_c = recommend_movies_cosine(user, num_recommendations)
        recommended_movies_p = recommend_movies_pearson(user, num_recommendations)
        
        # actual_ratings_c = test_data.iloc[user][recommended_movies_c.index]
        actual_ratings_p = test_data.iloc[user][recommended_movies_p.index]
        
        # Calculate the average absolute error
        # absolute_error_c = np.abs(actual_ratings_c - recommended_movies_c)
        absolute_error_p = np.abs(actual_ratings_p - recommended_movies_p)
        
        
        # mae_c += absolute_error_c.mean()
        mae_p += absolute_error_p.mean()
    
    # Calculate the average MAE over all users
    # mae_c /= len(users)
    mae_p /= len(users)
    
    return mae_c,mae_p

In [191]:
mae_c,mae_p = calculate_mae()

In [94]:
print(f"MAE for User to User Collaborative filtering using cosine similarity is {mae_c:.2f} ")
print(f"MAE for User to User Collaborative filtering using pearson Pearson correlation is {mae_p:.2f}")

MAE for User to User Collaborative filtering using cosine similarity is 1.98 
MAE for User to User Collaborative filtering using pearson Pearson correlation is 2.05


In [84]:
def calculate_rmse(num_recommendations=10):
    users = test_data.index
    rmse_c = 0
    rmse_p = 0
    
    for user in users:
        # Get the recommended movies for the user and get actual ratings
        recommended_movies_c = recommend_movies_cosine(user, num_recommendations)
        recommended_movies_p = recommend_movies_pearson(user, num_recommendations)
        actual_ratings = test_data.loc[user][recommended_movies.index]
        
        # Calculate the root mean squared error
        squared_error_c = (actual_ratings - recommended_movies_c)**2
        squared_error_p = (actual_ratings - recommended_movies_p)**2
        
        
        rmse_c += np.sqrt(squared_error_c.mean())
        rmse_p += np.sqrt(squared_error_p.mean())
    
    rmse_c /= len(users)
    rmse_p /= len(users)
    
    return rmse_c,rmse_p


In [87]:
rmse_c,rmse_p = calculate_rmse()

In [89]:
print(f"RMSE for User to User Collaborative filtering using cosine similarity is {rmse_c:.2f}")
print(f"RMSE for User to User Collaborative filtering using Pearson correlation is {rmse_p:.2f}")

RMSE for User to User Collaborative filtering using cosine similarity is 2.12
RMSE for User to User Collaborative filtering using Pearson correlation is 2.27


In [100]:
tp = 0
fp = 0
fn = 0

# Loop through each user in the testing data set
for user in test_data.index:
    recommended_movies = recommend_movies_cosine(user)
    actual_ratings = test_data.loc[user][recommended_movies.index]
    
    # Calculate the number of true positives, false positives, and false negatives
    for movie, rating in actual_ratings.iteritems():
        if rating > 0 and movie in recommended_movies.index:
            tp += 1
        elif rating == 0 and movie in recommended_movies.index:
            fp += 1
        elif rating > 0 and movie not in recommended_movies.index:
            fn += 1

# Calculate precision and recall
precision = tp / (tp + fp)
recall = tp / (tp + fn)
F1 = 2*(precision*recall)/(precision+recall)

In [192]:
print("User to User Collaborative Filtering using cosine similarity")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {F1:.4f}")

User to User Collaborative Filtering using cosine similarity
Precision: 0.1127
Recall: 0.1010
F1: 0.1065


In [104]:
tp = 0
fp = 0
fn = 0

# Loop through each user in the testing data set
for user in test_data.index:
    recommended_movies = recommend_movies_pearson(user)
    actual_ratings = test_data.loc[user][recommended_movies.index]
    
    # Calculate the number of true positives, false positives, and false negatives
    for movie, rating in actual_ratings.iteritems():
        if rating > 0 and movie in recommended_movies.index:
            tp += 1
        elif rating == 0 and movie in recommended_movies.index:
            fp += 1
        elif rating > 0 and movie not in recommended_movies.index:
            fn += 1

# Calculate precision and recall
precision = tp / (tp + fp)
recall = tp / (tp + fn)
F1 = 2*(precision*recall)/(precision+recall)

In [105]:
print("User to User Collaborative Filtering using Pearson Correlation")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {F1:.4f}")

User to User Collaborative Filtering using Pearson Correlation
Precision: 0.1034
Recall: 0.0957
F1: 0.0995


In [106]:
def calculate_ndcg(recommended_items, true_items, k=10):
    
    # Calculate DCG
    dcg = 0
    for i in range(min(k, len(recommended_items))):
        item = recommended_items[i]
        if item in true_items:
            relevance = 1
        else:
            relevance = 0
        dcg += (2 ** relevance - 1) / np.log2(i + 2)
        
    # Calculate IDCG
    idcg = 0
    for i in range(min(k, len(true_items))):
        idcg += (2 ** 1 - 1) / np.log2(i + 2)
    
    # Calculate NDCG
    if idcg > 0:
        ndcg = dcg / idcg
    else:
        ndcg = 0
    
    return ndcg


In [108]:
ndcg_scores = []
for user in test_data.index:
    # Get the true and recommended movies for the user
    true_movies = test_data.loc[user][test_data.loc[user]].index
    recommended_movies_c = recommend_movies_cosine(user)
    recommended_movies_p = recommend_movies_pearson(user)
    
    # Calculate the NDCG score for cosine and pearson 
    score_c = calculate_ndcg(recommended_movies_c.index.values, true_movies) 
    score_p = calculate_ndcg(recommended_movies_p.index.values, true_movies) 
    ndcg_c.append(score_c)
    ndcg_p.append(score_p)
    
    
# Calculate the average NDCG score
avg_ndcg_c = np.mean(ndcg_c)
avg_ndcg_p = np.mean(ndcg_p)

In [110]:
print(f"NDCG for User to User Collaborative filtering using cosine similarity is {avg_ndcg_c:.4f}")
print(f"NDCG for User to User Collaborative filtering using Pearson correlation is {avg_ndcg_p:.4f}")

NDCG for User to User Collaborative filtering using cosine similarity is 0.1333
NDCG for User to User Collaborative filtering using Pearson correlation is 0.1178
