In [2]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Raw GitHub locations of the two MovieLens CSV files (movies first, ratings second).
movies_url, ratings_url = (
    "https://raw.githubusercontent.com/Sadiya-Akter-Mim/MovieLens-Recommender-Assignment/main/movies.csv",
    "https://raw.githubusercontent.com/Sadiya-Akter-Mim/MovieLens-Recommender-Assignment/main/ratings.csv",
)

In [4]:
# Read both CSV files from their URLs into DataFrames, movies first and then
# ratings (needs network access when the cell runs).
movies, ratings = (pd.read_csv(_source) for _source in (movies_url, ratings_url))

In [5]:
# Print the first five rows of each table, each under its own heading.
for _heading, _frame in (("Movies dataset:", movies), ("\nRatings dataset:", ratings)):
    print(_heading)
    print(_frame.head())

Movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [7]:
# Hold out 20% of the rating rows for evaluation and keep 80% for training;
# the fixed random_state makes the split repeatable.
train, test = train_test_split(ratings, random_state=42, test_size=0.2)

for _label, _subset in (("Train size:", train), ("Test size:", test)):
    print(_label, _subset.shape)


Train size: (80668, 4)
Test size: (20168, 4)


In [8]:
# Reshape the training ratings into a userId x movieId table.
# pivot_table leaves NaN wherever a user has no training rating for a movie;
# those gaps are filled with 0, and the recommender functions in this notebook
# use `== 0` on this matrix to mean "not rated".
_ratings_wide = train.pivot_table(values="rating", index="userId", columns="movieId")
train_matrix = _ratings_wide.fillna(0)

print("User-Movie matrix:")
print(train_matrix.head())


User-Movie matrix:
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  191005  193565  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0   

In [9]:
# Cosine similarity between every pair of rows (users) of the training matrix,
# wrapped in a DataFrame labelled by userId on both axes.
_user_ids = train_matrix.index
user_similarity = cosine_similarity(train_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=_user_ids, columns=_user_ids)

print("User similarity matrix sample:")
display(user_similarity_df.head())



User similarity matrix sample:


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.016314,0.049021,0.165799,0.123392,0.118556,0.112563,0.142135,0.056088,0.012906,...,0.070901,0.152097,0.187324,0.067264,0.151517,0.139042,0.198771,0.232811,0.112174,0.143902
2,0.016314,1.0,0.0,0.004627,0.0,0.013391,0.029067,0.032754,0.0,0.080739,...,0.170123,0.020395,0.014415,0.0,0.0,0.019846,0.016076,0.05561,0.032404,0.07581
3,0.049021,0.0,1.0,0.0,0.00577,0.004833,0.0,0.005911,0.0,0.0,...,0.006401,0.005889,0.015344,0.0,0.012783,0.008884,0.004642,0.009433,0.0,0.031309
4,0.165799,0.004627,0.0,1.0,0.133565,0.090914,0.094497,0.050417,0.0,0.021991,...,0.075828,0.090252,0.241155,0.054366,0.081585,0.162277,0.083074,0.107276,0.02672,0.068325
5,0.123392,0.0,0.00577,0.133565,1.0,0.238812,0.071386,0.393773,0.0,0.006245,...,0.050523,0.343953,0.101064,0.159651,0.111464,0.086797,0.073278,0.09704,0.205395,0.05309


In [10]:
def recommend_movies_userCF(user_id, N=5, n_neighbors=10):
    """Recommend Top-N movies using User-Based Collaborative Filtering.

    Each candidate movie is scored by the similarity-weighted *sum* of the
    ratings given by the user's most similar neighbours (unrated entries are 0
    in ``train_matrix`` and therefore contribute nothing). The sum is not
    normalised, so scores are only meaningful for ranking.

    Parameters
    ----------
    user_id : label
        A userId present in ``user_similarity_df`` / ``train_matrix``.
    N : int, default 5
        Number of movies to return.
    n_neighbors : int, default 10
        Number of most-similar users whose ratings are aggregated.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, ordered from the highest-scored
        recommendation to the lowest.

    Raises
    ------
    KeyError
        If ``user_id`` is not in the training data.
    """
    # Remove the user explicitly rather than slicing off the first sorted row:
    # a neighbour tied with the self-similarity could otherwise take its place.
    neighbor_sims = (
        user_similarity_df[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(n_neighbors)
    )

    # Similarity-weighted sum of the neighbours' ratings, one score per movie.
    neighbor_ratings = train_matrix.loc[neighbor_sims.index]
    scores = neighbor_ratings.mul(neighbor_sims, axis=0).sum(axis=0)

    # Keep only movies the user has not rated in the training data.
    unseen = train_matrix.loc[user_id] == 0
    top_recommendations = scores[unseen].sort_values(ascending=False).head(N)

    # `isin` alone would return rows in the order of the `movies` table, losing
    # the ranking; reorder the selected rows by their position in the top-N list.
    picked = movies.loc[movies['movieId'].isin(top_recommendations.index), ['movieId', 'title']]
    rank = pd.Series(range(len(top_recommendations)), index=top_recommendations.index)
    order = picked['movieId'].map(rank).to_numpy().argsort(kind='stable')
    return picked.iloc[order]



In [11]:
# Dense ndarray view of the user x movie training matrix.
matrix = train_matrix.to_numpy()

# Fit a 20-component truncated SVD, project the users into the latent space,
# then multiply back by the components to get a low-rank reconstruction.
svd = TruncatedSVD(n_components=20, random_state=42)
latent_matrix = svd.fit_transform(matrix)
reconstructed = latent_matrix @ svd.components_

# Re-attach the userId / movieId labels of the training matrix.
svd_train_df = pd.DataFrame(
    reconstructed,
    index=train_matrix.index,
    columns=train_matrix.columns,
)

print("SVD reconstructed rating matrix sample:")
display(svd_train_df.head())



SVD reconstructed rating matrix sample:


movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.276717,1.254279,1.111054,0.00903,0.154842,1.467038,0.274476,0.041337,0.163454,1.531384,...,-0.014538,-0.011308,-0.012923,-0.012923,-0.011308,-0.012923,-0.011308,-0.011308,-0.011308,-0.026245
2,0.159419,-0.0052,0.031014,0.006267,0.028691,-0.045036,-0.028494,0.026206,0.019432,-0.07679,...,0.011383,0.008854,0.010119,0.010119,0.008854,0.010119,0.008854,0.008854,0.008854,0.014487
3,0.046178,0.021501,0.032577,-0.003379,-0.012925,0.036691,-0.003069,0.006263,0.012388,0.016572,...,-0.000318,-0.000247,-0.000282,-0.000282,-0.000247,-0.000282,-0.000247,-0.000247,-0.000247,-0.001735
4,1.716662,0.323975,0.138665,0.029194,0.179403,0.507154,0.35836,-0.050802,0.007065,0.331316,...,-0.001685,-0.00131,-0.001498,-0.001498,-0.00131,-0.001498,-0.00131,-0.00131,-0.00131,-0.008362
5,1.108818,0.792571,0.270276,0.105957,0.383099,0.462174,0.462996,0.127123,0.047584,1.036694,...,-0.002248,-0.001749,-0.001998,-0.001998,-0.001749,-0.001998,-0.001749,-0.001749,-0.001749,-0.003188


In [12]:
def recommend_movies_SVD(user_id, N=5):
    """Recommend Top-N movies using Matrix Factorization (SVD).

    Ranks the movies the user has not rated in the training data by their
    value in the SVD-reconstructed matrix ``svd_train_df``.

    Parameters
    ----------
    user_id : label
        A userId present in ``svd_train_df`` / ``train_matrix``.
    N : int, default 5
        Number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, ordered from the highest predicted
        score to the lowest.

    Raises
    ------
    KeyError
        If ``user_id`` is not in the training data.
    """
    # Reconstructed (predicted) scores for this user, one per movie.
    predicted = svd_train_df.loc[user_id]

    # Exclude movies the user already rated (0 marks "not rated" in train_matrix).
    unseen = train_matrix.loc[user_id] == 0
    top_recommendations = predicted[unseen].sort_values(ascending=False).head(N)

    # `isin` alone would return rows in the order of the `movies` table, losing
    # the ranking; reorder the selected rows by their position in the top-N list.
    picked = movies.loc[movies['movieId'].isin(top_recommendations.index), ['movieId', 'title']]
    rank = pd.Series(range(len(top_recommendations)), index=top_recommendations.index)
    order = picked['movieId'].map(rank).to_numpy().argsort(kind='stable')
    return picked.iloc[order]



In [13]:
def recommend_movies(user_id, N=5, method="userCF"):
    """Unified Top-N recommendation entry point.

    Delegates to ``recommend_movies_userCF`` when ``method`` is "userCF" and to
    ``recommend_movies_SVD`` when it is "SVD", passing ``user_id`` and ``N``
    through unchanged.

    Raises
    ------
    ValueError
        If ``method`` is neither "userCF" nor "SVD".
    """
    # Reject unknown methods up front, then dispatch.
    if method != "userCF" and method != "SVD":
        raise ValueError("Method must be 'userCF' or 'SVD'")

    if method == "userCF":
        return recommend_movies_userCF(user_id, N)
    return recommend_movies_SVD(user_id, N)



In [14]:
def precision_at_k(recommended, relevant, k):
    """Fraction of the first ``k`` recommended items that are relevant.

    Parameters
    ----------
    recommended : sequence
        Recommended item ids, best first.
    relevant : iterable
        Ids of the relevant items.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        ``hits / k``; 0.0 when ``k`` is not positive, so the degenerate case
        neither divides by zero nor slices from the end of the list.
    """
    if k <= 0:
        return 0.0
    hits = set(recommended[:k]) & set(relevant)
    return len(hits) / k

def recall_at_k(recommended, relevant, k):
    """Fraction of the distinct relevant items found in the first ``k`` recommendations.

    Parameters
    ----------
    recommended : sequence
        Recommended item ids, best first.
    relevant : iterable
        Ids of the relevant items; duplicates are counted once.
    k : int
        Cut-off rank.

    Returns
    -------
    float or int
        ``hits / number of distinct relevant items``; 0 when there are no
        relevant items or ``k`` is not positive.
    """
    # De-duplicate once so the numerator and the denominator count the same thing.
    relevant_items = set(relevant)
    if k <= 0 or not relevant_items:
        return 0
    hits = set(recommended[:k]) & relevant_items
    return len(hits) / len(relevant_items)

def ndcg_at_k(recommended, relevant, k):
    """Binary-relevance NDCG of the first ``k`` recommendations.

    A relevant item at 0-based position ``i`` contributes ``1 / log2(i + 2)``
    to the DCG; the ideal DCG places ``min(#distinct relevant, k)`` hits at the
    top positions.

    Parameters
    ----------
    recommended : sequence
        Recommended item ids, best first.
    relevant : iterable
        Ids of the relevant items; duplicates are counted once.
    k : int
        Cut-off rank.

    Returns
    -------
    float or int
        ``DCG / IDCG`` in [0, 1]; 0 when there are no relevant items or ``k``
        is not positive.
    """
    # A set gives O(1) membership tests and ignores duplicate relevant ids.
    relevant_items = set(relevant)
    if k <= 0 or not relevant_items:
        return 0

    dcg = 0.0
    credited = set()
    for position, item in enumerate(recommended[:k]):
        # Credit each relevant item once so a repeated recommendation cannot
        # push the score above the ideal.
        if item in relevant_items and item not in credited:
            credited.add(item)
            dcg += 1 / np.log2(position + 2)

    ideal_hits = min(len(relevant_items), k)
    idcg = sum(1 / np.log2(i + 2) for i in range(ideal_hits))
    return dcg / idcg


In [22]:
# Example: evaluate the SVD recommender for a single user.
user_id = 1
N = 5

# Top-N recommended movie ids for this user.
recommended = recommend_movies(user_id, N, method="SVD")['movieId'].tolist()

# Relevant = every movie the user rated in the test set, whatever the rating value.
relevant = test.loc[test['userId'] == user_id, 'movieId'].tolist()

# Use N for both the cut-off and the labels so they stay in sync with the list length.
print(f"Precision@{N}:", precision_at_k(recommended, relevant, N))
print(f"Recall@{N}:", recall_at_k(recommended, relevant, N))
print(f"NDCG@{N}:", ndcg_at_k(recommended, relevant, N))


Precision@5: 0.4
Recall@5: 0.05128205128205128
NDCG@5: 0.30078518014914984
