# ==========================================
# Movie Recommendation System Description - Elevvo Pathways Internship
# Author: Syed Huzaifa Bin Khamis
# Level 2 - Task 5
# ==========================================

In [1]:
# ---------------------------------------------------
# Importing Libraries
# ---------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler


In [5]:
# Ratings file: tab-separated; column names are supplied here, so every line is read as data
rating_columns = ["userId", "movieId", "rating", "timestamp"]
ratings = pd.read_csv("ml-100k/u.data", sep="\t", names=rating_columns)

# Movie file: pipe-separated, Latin-1 encoded; only the first two fields (id, title) are kept
movie_columns = ["movieId", "title"]
movies = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    usecols=[0, 1],
    names=movie_columns,
)

# Attach the movie title to every rating through the shared movieId key
data = ratings.merge(movies, on="movieId")
data.head()


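# ---------------------------------------------------
# Load Dataset (alternative CSV loader - currently disabled)
# ---------------------------------------------------
# Kept for reference only; the active loader above reads the ml-100k files.
# ratings.csv should have the columns: userId, movieId, rating, timestamp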
#ratings = pd.read_csv("ratings.csv")
#movies = pd.read_csv("movies.csv")

# Merge to get movie names
#data = pd.merge(ratings, movies, on="movieId")

#data.head()


Unnamed: 0,userId,movieId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [6]:
# Quick size summary of the merged ratings table
print("Number of users:", data["userId"].nunique())
print("Number of movies:", data["movieId"].nunique())

# User x title rating matrix; user/title pairs without a rating are stored as 0
user_movie_matrix = data.pivot_table(
    index="userId",
    columns="title",
    values="rating",
).fillna(0)

user_movie_matrix.head()


Number of users: 943
Number of movies: 1682


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [7]:
# ---------------------------------------------------
# User-User Collaborative Filtering
# ---------------------------------------------------
# Pairwise cosine similarity between the users' rating rows,
# wrapped in a DataFrame labelled by user id on both axes
user_similarity = cosine_similarity(user_movie_matrix)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index,
)

def recommend_movies_user_based(user_id, num_recommendations=5):
    """Recommend unrated titles to a user from similar users' ratings.

    Each title is scored as the similarity-weighted average of the other
    users' entries in ``user_movie_matrix`` (weights are taken from
    ``user_similarity_df``). Titles whose entry for ``user_id`` is not 0 are
    treated as already rated and excluded.

    Parameters
    ----------
    user_id : label
        A label present in ``user_similarity_df`` / ``user_movie_matrix``.
    num_recommendations : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Scores indexed by title, highest first. Empty when the user has no
        similarity to anyone else (nothing to base a recommendation on).

    Raises
    ------
    KeyError
        If ``user_id`` is not a known user.
    """
    if user_id not in user_similarity_df.columns:
        raise KeyError(f"Unknown user_id: {user_id!r}")

    # Exclude the target user by label. Dropping "the first entry after
    # sorting" is only correct when the user's self-similarity is the strict
    # maximum, which fails on ties and for users with an all-zero rating row.
    neighbour_weights = user_similarity_df[user_id].drop(user_id)

    # Without any positive weight the normalisation below would divide by
    # zero and produce NaN/inf scores, so report "no recommendations" instead.
    total_weight = neighbour_weights.sum()
    if total_weight == 0:
        return pd.Series(dtype="float64")

    # Similarity-weighted sum of the neighbours' ratings, normalised by the
    # total weight to give a weighted average per title.
    neighbour_ratings = user_movie_matrix.loc[neighbour_weights.index]
    weighted_ratings = neighbour_ratings.T.dot(neighbour_weights) / total_weight

    # Keep only titles the target user has not rated (stored as 0).
    user_rated_movies = user_movie_matrix.loc[user_id]
    recommendations = weighted_ratings[user_rated_movies == 0].sort_values(ascending=False)

    return recommendations.head(num_recommendations)

# Demo: top five user-based recommendations for user 10
recommend_movies_user_based(user_id=10, num_recommendations=5)


title
Return of the Jedi (1983)          2.625570
Empire Strikes Back, The (1980)    2.382551
Back to the Future (1985)          2.135831
Schindler's List (1993)            2.132099
Fugitive, The (1993)               2.104110
dtype: float64

In [8]:
# ---------------------------------------------------
# Item-Item Collaborative Filtering
# ---------------------------------------------------
# Cosine similarity between movies: transposing puts one title per row
movie_similarity = cosine_similarity(user_movie_matrix.transpose())
movie_similarity_df = pd.DataFrame(
    data=movie_similarity,
    index=user_movie_matrix.columns,
    columns=user_movie_matrix.columns,
)

def recommend_movies_item_based(movie_name, num_recommendations=5):
    """Return the titles most similar to ``movie_name``.

    Similarities are read from the ``movie_name`` column of
    ``movie_similarity_df``.

    Parameters
    ----------
    movie_name : label
        A title present in ``movie_similarity_df``.
    num_recommendations : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by title, highest first, never containing
        ``movie_name`` itself. The Series is named after ``movie_name``.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a known title.
    """
    if movie_name not in movie_similarity_df.columns:
        raise KeyError(f"Unknown movie title: {movie_name!r}")

    # Remove the query title by label rather than skipping the first sorted
    # entry: another title can tie with it at similarity 1.0, in which case
    # the query is not guaranteed to sort first.
    candidate_scores = movie_similarity_df[movie_name].drop(movie_name)
    return candidate_scores.sort_values(ascending=False).head(num_recommendations)

# Demo: the five titles most similar to Toy Story
recommend_movies_item_based(movie_name="Toy Story (1995)", num_recommendations=5)


title
Star Wars (1977)                 0.734572
Return of the Jedi (1983)        0.699925
Independence Day (ID4) (1996)    0.689786
Rock, The (1996)                 0.664555
Mission: Impossible (1996)       0.641322
Name: Toy Story (1995), dtype: float64

In [9]:
# ---------------------------------------------------
# Matrix Factorization using Truncated SVD
# ---------------------------------------------------
# Project every user's rating row onto 20 latent components (fixed seed)
svd = TruncatedSVD(
    n_components=20,
    random_state=42,
)
latent_matrix = svd.fit_transform(user_movie_matrix)

print("Shape of latent matrix:", latent_matrix.shape)


Shape of latent matrix: (943, 20)


In [10]:
# Multiply the user factors by the component loadings to get a dense score
# for every user/title pair, then restore the user and title labels
reconstructed_ratings = np.dot(a=latent_matrix, b=svd.components_)
predicted_ratings = pd.DataFrame(
    data=reconstructed_ratings,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.columns,
)

def recommend_movies_svd(user_id, num_recommendations=5):
    """Top predicted titles for ``user_id`` among those the user has not rated.

    Scores are read from the ``user_id`` row of ``predicted_ratings``; a title
    counts as unrated when the user's entry in ``user_movie_matrix`` is 0.
    Returns a Series of scores indexed by title, highest first, with at most
    ``num_recommendations`` entries.
    """
    not_yet_rated = user_movie_matrix.loc[user_id].eq(0)
    ranked_scores = predicted_ratings.loc[user_id].sort_values(ascending=False)
    # The boolean mask is matched to the ranked scores by title label,
    # so the descending order of the scores is preserved.
    return ranked_scores[not_yet_rated].head(num_recommendations)

# Demo: top five SVD-based recommendations for user 10
recommend_movies_svd(user_id=10, num_recommendations=5)


title
Godfather: Part II, The (1974)    4.120514
Annie Hall (1977)                 4.106397
To Kill a Mockingbird (1962)      3.868077
Schindler's List (1993)           3.782243
Babe (1995)                       3.471999
Name: 10, dtype: float64

In [11]:
def precision_at_k(actual, predicted, k=5):
    """Fraction of the first ``k`` predictions that appear in ``actual``.

    Parameters
    ----------
    actual : iterable
        The relevant items (hashable).
    predicted : iterable
        Predicted items in ranked order, best first. Any iterable is
        accepted; only its first ``k`` items are considered.
    k : int, default 5
        Cut-off rank; must be positive.

    Returns
    -------
    float
        Number of distinct items among the first ``k`` predictions that are
        in ``actual``, divided by ``k``. The denominator stays ``k`` even when
        fewer than ``k`` predictions are supplied.

    Raises
    ------
    ValueError
        If ``k`` is not positive (a zero ``k`` would divide by zero and a
        negative ``k`` would yield a negative score).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Materialise first so generators and other non-sliceable iterables work.
    top_k = set(list(predicted)[:k])
    relevant = set(actual)
    return len(top_k & relevant) / float(k)

# Example usage is not shown here: a meaningful precision@k evaluation requires a held-out test split.
