# Task - 4 Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np

### Load Ratings Data

In [2]:
# Read the tab-separated ratings file; column names are supplied explicitly
# because none are read from the file itself.
ratings = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"],
)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Load Movies Data

In [3]:
# Read the pipe-separated movie file. All 24 columns are named so that
# `usecols` can select by name; only the id and the title are kept.
movies = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    names=[
        "movieId", "title", "release_date", "video_release_date", "IMDb_URL",
        "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
        "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
        "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ],
    usecols=["movieId", "title"],
)

In [4]:
# Preview the first rows of the (movieId, title) lookup table.
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


# User-User Similarity

In [5]:
# Build the user x movie rating table: one row per userId, one column per
# movieId. (user, movie) pairs with no rating come out as NaN.
user_item_matrix = ratings.pivot_table(
    values="rating",
    index="userId",
    columns="movieId",
)

In [6]:
# (number of users, number of movies) in the pivoted rating table.
user_item_matrix.shape

(943, 1682)

***Fill NaN values (movies a user has not rated) with 0***

In [7]:
# Replace missing ratings with 0 so the matrix is fully numeric.
# From here on a value of 0 means "not rated".
user_item_matrix = user_item_matrix.fillna(value=0)

## Calculating cosine similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Pairwise cosine similarity between the rows of user_item_matrix
# (one row per userId), i.e. a user-by-user similarity array.
similarity = cosine_similarity(user_item_matrix)

***Convert the similarity matrix to a DataFrame***

In [10]:
# Label both axes of the similarity array with the user ids so that
# similarities can be looked up by userId instead of by position.
similarity_data = pd.DataFrame(
    data=similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

In [11]:
# Preview the first rows of the user-user similarity table.
similarity_data.head()

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


# Recommendation Function

In [12]:
def recommendation(user_id, user_item_matrix, similarity_data, movies, top_n=5, n_neighbors=5):
    """Recommend unseen movies to a user from the ratings of their most similar users.

    Parameters
    ----------
    user_id
        Label of the target user; must be a column of ``similarity_data`` and
        a row of ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        Users (rows) x movies (columns) rating table. A value of 0 is treated
        as "not rated".
    similarity_data : pandas.DataFrame
        User x user similarity table, labelled by user id on both axes.
    movies : pandas.DataFrame
        Lookup table with at least the columns ``movieId`` and ``title``.
    top_n : int, default 5
        Maximum number of movies to return.
    n_neighbors : int, default 5
        Number of most similar users whose ratings are averaged.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, ordered from the highest to the
        lowest neighbour-average score. Ids missing from ``movies`` are
        silently skipped.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the similarity or rating tables.
    """
    # Drop the user explicitly instead of assuming they sort first: with ties
    # at similarity 1.0 a positional slice could keep the user as their own
    # neighbour and discard a real one.
    neighbour_ids = (
        similarity_data[user_id]
        .drop(labels=user_id, errors="ignore")
        .sort_values(ascending=False)
        .head(n_neighbors)
        .index
    )

    # Average rating per movie across the neighbours. Zeros (unrated) take
    # part in the mean, so movies rated by few neighbours are pulled down.
    neighbour_mean = user_item_matrix.loc[neighbour_ids].mean(axis=0)

    # Keep only the movies the target user has not rated.
    own_ratings = user_item_matrix.loc[user_id]
    unseen_scores = neighbour_mean[own_ratings == 0]
    top_scores = unseen_scores.sort_values(ascending=False).head(top_n)

    # `isin` alone would return rows in `movies` order; restore the ranking so
    # that taking the first k rows really yields the k best recommendations.
    rank_of = {movie_id: position for position, movie_id in enumerate(top_scores.index)}
    picked = movies[movies["movieId"].isin(top_scores.index)]
    picked = picked.sort_values(
        by="movieId", key=lambda ids: ids.map(rank_of), kind="stable"
    )
    return picked[["movieId", "title"]]

In [13]:
# Example: top-5 user-based recommendations for user 1.
print(
    recommendation(
        user_id=1,
        user_item_matrix=user_item_matrix,
        similarity_data=similarity_data,
        movies=movies,
        top_n=5,
    )
)


     movieId                                              title
272      273                                        Heat (1995)
381      382  Adventures of Priscilla, Queen of the Desert, ...
432      433                                    Heathers (1989)
473      474  Dr. Strangelove or: How I Learned to Stop Worr...
565      566                    Clear and Present Danger (1994)


#  Evaluate performance using precision at K

In [14]:
def precision_at_k(user_id, recommended_movies, ratings, k=5, threshold=3.0):
    """Fraction of the first ``k`` recommended movies that the user rated as relevant.

    Parameters
    ----------
    user_id
        User whose ratings define relevance.
    recommended_movies : pandas.DataFrame
        Recommendations with a ``movieId`` column; only the first ``k`` rows
        are considered.
    ratings : pandas.DataFrame
        Long-format ratings with ``userId``, ``movieId`` and ``rating`` columns.
    k : int, default 5
        Cut-off; also the denominator, even if fewer than ``k`` rows exist.
    threshold : float, default 3.0
        Minimum rating for a movie to count as relevant.

    Returns
    -------
    float
        ``(number of relevant ids among the first k) / k``.

    Notes
    -----
    Relevance comes only from movies the user has rated in ``ratings``. If the
    recommendations were built from those same ratings and exclude rated
    movies (as the recommenders in this notebook do), the overlap is empty and
    the score is 0. A held-out test split is needed for a meaningful value.
    """
    top_k_ids = set(recommended_movies["movieId"].head(k).values)

    belongs_to_user = ratings["userId"] == user_id
    is_relevant = ratings["rating"] >= threshold
    liked_ids = set(ratings.loc[belongs_to_user & is_relevant, "movieId"].values)

    hits = top_k_ids.intersection(liked_ids)
    return len(hits) / k

In [15]:
# Evaluate the user-based recommender for a single user.
user_id = 1
k = 10  # one cut-off shared by the recommender, the metric and the printed label
recs = recommendation(user_id, user_item_matrix, similarity_data, movies, top_n=k)

print(recs)

score = precision_at_k(user_id, recs, ratings, k=k)
# The label is derived from k; it previously said "Precision@5" while k was 10.
print(f"Precision@{k} for User", user_id, "=", score)


     movieId                                              title
272      273                                        Heat (1995)
381      382  Adventures of Priscilla, Queen of the Desert, ...
402      403                                      Batman (1989)
432      433                                    Heathers (1989)
473      474  Dr. Strangelove or: How I Learned to Stop Worr...
565      566                    Clear and Present Danger (1994)
567      568                                       Speed (1994)
654      655                                 Stand by Me (1986)
683      684                         In the Line of Fire (1993)
731      732                                        Dave (1993)
Precision@5 for User 1 = 0.0


In [16]:
# Evaluate the user-based recommender for a handful of users.
k = 10  # one cut-off shared by the recommender, the metric and the printed label
for u in [4, 7, 21, 61]:
    recs = recommendation(u, user_item_matrix, similarity_data, movies, top_n=k)
    print("\nUser", u)
    print(recs)
    score = precision_at_k(u, recs, ratings, k=k)
    # The label is derived from k; it previously said "Precision@5" while k was 10.
    print(f"Precision@{k} =", score)



User 4
     movieId                                        title
244      245                      Devil's Own, The (1997)
285      286                  English Patient, The (1996)
301      302                     L.A. Confidential (1997)
304      305                        Ice Storm, The (1997)
305      306  Mrs. Brown (Her Majesty, Mrs. Brown) (1997)
322      323                          Dante's Peak (1997)
325      326                             G.I. Jane (1997)
330      331                             Edge, The (1997)
689      690                  Seven Years in Tibet (1997)
878      879                       Peacemaker, The (1997)
Precision@5 = 0.0

User 7
      movieId                            title
0           1                 Toy Story (1995)
14         15        Mr. Holland's Opus (1995)
94         95                   Aladdin (1992)
116       117                 Rock, The (1996)
217       218                 Cape Fear (1991)
489       490          To Catch a Thief (1955)

In [17]:
def evaluate_model(user_ids, ratings, user_item_matrix, similarity_data, movies, k=10):
    """Average precision@k of the user-based recommender over ``user_ids``.

    Parameters
    ----------
    user_ids : iterable
        Users to evaluate.
    ratings : pandas.DataFrame
        Long-format ratings passed to ``precision_at_k``.
    user_item_matrix, similarity_data, movies
        Forwarded unchanged to ``recommendation``.
    k : int, default 10
        Number of recommendations requested and the precision cut-off.

    Returns
    -------
    float
        Mean of the per-user precision@k values, or NaN when ``user_ids`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    scores = []
    for user_id in user_ids:
        recs = recommendation(user_id, user_item_matrix, similarity_data, movies, top_n=k)
        scores.append(precision_at_k(user_id, recs, ratings, k=k))

    # An average over zero users is undefined; report NaN rather than crash.
    if not scores:
        return float("nan")
    return sum(scores) / len(scores)

# Score the user-based recommender on the first 100 distinct user ids that
# appear in `ratings`.
avg_precision = evaluate_model(
    ratings["userId"].unique()[:100],
    ratings,
    user_item_matrix,
    similarity_data,
    movies,
    k=5,
)
print("Average Precision@5 =", avg_precision)


Average Precision@5 = 0.0


# Item to Item similarity

In [18]:
# Transposing puts movies on the rows, so the cosine similarity is computed
# between movies; both axes are labelled with the movie ids.
item_similarity = pd.DataFrame(
    data=cosine_similarity(user_item_matrix.transpose()),
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
)

In [19]:
def item_based_recommendation(user_id, user_item_matrix, item_similarity_df, movies, top_n=5):
    """Recommend unseen movies using item-item similarity.

    Each unseen movie is scored with the similarity-weighted average of the
    user's own ratings:
    ``sum(sim(movie, j) * rating_j) / sum(sim(movie, j))`` over rated movies j.

    Parameters
    ----------
    user_id
        Row label of the target user in ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        Users x movies rating table; 0 means "not rated", values > 0 are ratings.
    item_similarity_df : pandas.DataFrame
        Movie x movie similarity table labelled by movie id on both axes.
    movies : pandas.DataFrame
        Lookup table with at least ``movieId`` and ``title`` columns.
    top_n : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, ordered from the highest to the
        lowest predicted rating. Only movies the user has not rated are
        returned (previously rated movies could slip in with a score of 0).
    """
    user_ratings = user_item_matrix.loc[user_id]
    rated_ids = user_ratings[user_ratings > 0].index
    unseen_ids = user_ratings[user_ratings == 0].index

    # Rows: movies the user rated; columns: candidate (unseen) movies.
    # One vectorised slice replaces the per-movie Python loop.
    sims = item_similarity_df.loc[rated_ids, unseen_ids]
    weight_totals = sims.sum(axis=0)
    weighted_sums = sims.mul(user_ratings.loc[rated_ids], axis=0).sum(axis=0)

    # Candidates without a positive total similarity keep a score of 0.0.
    pred_ratings = (weighted_sums / weight_totals).where(weight_totals > 0, 0.0)
    top_scores = pred_ratings.sort_values(ascending=False).head(top_n)

    # `isin` alone would return rows in `movies` order; restore the ranking.
    rank_of = {movie_id: position for position, movie_id in enumerate(top_scores.index)}
    picked = movies[movies["movieId"].isin(top_scores.index)]
    picked = picked.sort_values(
        by="movieId", key=lambda ids: ids.map(rank_of), kind="stable"
    )
    return picked[["movieId", "title"]]

In [20]:
def evaluate_item_based(user_ids, ratings, user_item_matrix, item_similarity_df, movies, top_n=5, k=5):
    """Average precision@k of the item-based recommender over ``user_ids``.

    Parameters
    ----------
    user_ids : iterable
        Users to evaluate.
    ratings : pandas.DataFrame
        Long-format ratings passed to ``precision_at_k``.
    user_item_matrix, item_similarity_df, movies
        Forwarded unchanged to ``item_based_recommendation``.
    top_n : int, default 5
        Number of recommendations requested per user.
    k : int, default 5
        Precision cut-off.

    Returns
    -------
    float
        Mean of the per-user precision@k values, or NaN when ``user_ids`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    scores = []
    for user_id in user_ids:
        recs = item_based_recommendation(
            user_id, user_item_matrix, item_similarity_df, movies, top_n=top_n
        )
        scores.append(precision_at_k(user_id, recs, ratings, k=k))

    # An average over zero users is undefined; report NaN rather than crash.
    if not scores:
        return float("nan")
    return sum(scores) / len(scores)


# SVD-based CF

In [21]:
# Number of singular values/vectors kept for the low-rank reconstruction.
# Named once here instead of repeating the literal 50 in three places.
N_LATENT_FACTORS = 50

R = user_item_matrix.values
# Thin SVD: R = U @ diag(sigma) @ Vt.
U, sigma, Vt = np.linalg.svd(R, full_matrices=False)
# Keep only the first N_LATENT_FACTORS components.
sigma_matrix = np.diag(sigma[:N_LATENT_FACTORS])
R_hat = U[:, :N_LATENT_FACTORS] @ sigma_matrix @ Vt[:N_LATENT_FACTORS, :]
# Re-attach the user/movie labels so predictions can be looked up by id.
pred_ratings_matrix = pd.DataFrame(R_hat, index=user_item_matrix.index, columns=user_item_matrix.columns)

In [22]:
def svd_numpy_recommendation(user_id, pred_ratings_matrix, user_item_matrix, movies, top_n=5):
    """Recommend the unseen movies with the highest predicted rating.

    Parameters
    ----------
    user_id
        Row label of the target user in both matrices.
    pred_ratings_matrix : pandas.DataFrame
        Users x movies table of predicted ratings.
    user_item_matrix : pandas.DataFrame
        Users x movies table of observed ratings; values > 0 mark rated movies.
    movies : pandas.DataFrame
        Lookup table with at least ``movieId`` and ``title`` columns.
    top_n : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``title``, ordered from the highest to the
        lowest predicted rating, restricted to movies the user has not rated.
    """
    predicted = pred_ratings_matrix.loc[user_id]
    already_rated = user_item_matrix.loc[user_id] > 0

    # Remove rated movies instead of zeroing them: predictions can be
    # negative, so a zeroed rated movie could outrank genuine candidates.
    candidates = predicted[~already_rated]
    top_scores = candidates.sort_values(ascending=False).head(top_n)

    # `isin` alone would return rows in `movies` order; restore the ranking.
    rank_of = {movie_id: position for position, movie_id in enumerate(top_scores.index)}
    picked = movies[movies["movieId"].isin(top_scores.index)]
    picked = picked.sort_values(
        by="movieId", key=lambda ids: ids.map(rank_of), kind="stable"
    )
    return picked[["movieId", "title"]]

In [23]:
def evaluate_svd(user_ids, ratings, user_item_matrix, pred_ratings_matrix, movies, top_n=5, k=5):
    """Average precision@k of the SVD-based recommender over ``user_ids``.

    Parameters
    ----------
    user_ids : iterable
        Users to evaluate.
    ratings : pandas.DataFrame
        Long-format ratings passed to ``precision_at_k``.
    user_item_matrix, pred_ratings_matrix, movies
        Forwarded to ``svd_numpy_recommendation``.
    top_n : int, default 5
        Number of recommendations requested per user.
    k : int, default 5
        Precision cut-off.

    Returns
    -------
    float
        Mean of the per-user precision@k values, or NaN when ``user_ids`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    scores = []
    for user_id in user_ids:
        recs = svd_numpy_recommendation(
            user_id, pred_ratings_matrix, user_item_matrix, movies, top_n=top_n
        )
        scores.append(precision_at_k(user_id, recs, ratings, k=k))

    # An average over zero users is undefined; report NaN rather than crash.
    if not scores:
        return float("nan")
    return sum(scores) / len(scores)

In [25]:
# Compare the item-based and SVD recommenders on the same 50 users
# (the first 50 distinct user ids that appear in `ratings`).
user_sample = ratings["userId"].unique()[:50]
print(
    "Average Precision@5 (Item-Item CF):",
    evaluate_item_based(
        user_sample, ratings, user_item_matrix, item_similarity, movies, top_n=5, k=5
    ),
)
print(
    "Average Precision@5 (SVD CF):",
    evaluate_svd(
        user_sample, ratings, user_item_matrix, pred_ratings_matrix, movies, top_n=5, k=5
    ),
)

Average Precision@5 (Item-Item CF): 0.0
Average Precision@5 (SVD CF): 0.0
