<a href="https://colab.research.google.com/github/Orth33/movie-recommendation/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [3]:
import kagglehub

# Fetch the latest version of the MovieLens 100k dataset;
# `path` is the local directory that dataset_download reports for the files
DATASET_HANDLE = "rajmehra03/movielens100k"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'movielens100k' dataset.
Path to dataset files: /kaggle/input/movielens100k


In [8]:
# 1. Load the datasets
# Read from the directory kagglehub returned in `path` instead of a hard-coded
# /kaggle/input/... location, so the cell also works outside Colab/Kaggle.
movies = pd.read_csv(f"{path}/movies.csv")
ratings = pd.read_csv(f"{path}/ratings.csv")

# Let's see what we're working with
display(movies.head())
display(ratings.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [10]:
# 2. Quick Inspection
print("--- Data ---")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line.
ratings.info()

--- Data ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


# Data Merging & Cleaning

In [11]:
# Join every rating row to its movie's metadata through the shared 'movieId'
# key, then discard the timestamp column (not needed for the basic model)
df = ratings.merge(movies, on='movieId').drop(columns='timestamp')

# Count of missing values per column
missing_per_column = df.isnull().sum()
print(missing_per_column)

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64


# Creating the User-Item Matrix

In [12]:
# Users as rows, movie titles as columns, ratings as cell values.
# pivot_table aggregates with its default (the mean) whenever a
# (userId, title) pair occurs more than once.
user_item_matrix = pd.pivot_table(df, values='rating', index='userId', columns='title')

# Pairs without a rating come out as NaN (most users haven't seen most movies).
# The filled copy encodes them as 0, meaning 'no rating/not seen'.
user_item_matrix_filled = user_item_matrix.fillna(value=0)

print(f"Matrix Shape: {user_item_matrix_filled.shape}") # (Users, Movies)

Matrix Shape: (671, 9064)


# Computing User Similarity

In [14]:
# Pairwise cosine similarity between the users' zero-filled rating rows
user_similarity = cosine_similarity(user_item_matrix_filled)

# Label both axes with the user ids so a pair can be looked up by id
user_ids = user_item_matrix.index
user_sim_df = pd.DataFrame(data=user_similarity, index=user_ids, columns=user_ids)

# Show the similarity rows of the first few users
user_sim_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.074482,0.016818,0.0,0.083884,0.0,0.012843,0.0,...,0.0,0.0,0.014481,0.043719,0.0,0.0,0.0,0.062917,0.0,0.017466
2,0.0,1.0,0.124295,0.118821,0.103646,0.0,0.212985,0.11319,0.113333,0.043213,...,0.477306,0.063202,0.077784,0.164162,0.466281,0.425462,0.084646,0.02414,0.170595,0.113175
3,0.0,0.124295,1.0,0.08164,0.151531,0.060691,0.154714,0.249781,0.134475,0.114672,...,0.161205,0.064198,0.176222,0.158357,0.177098,0.124562,0.124911,0.080984,0.136606,0.170193
4,0.074482,0.118821,0.08164,1.0,0.130649,0.079648,0.319745,0.191013,0.030417,0.137186,...,0.114319,0.047228,0.136647,0.25403,0.121905,0.088735,0.068483,0.104309,0.054512,0.211609
5,0.016818,0.103646,0.151531,0.130649,1.0,0.063796,0.095888,0.165712,0.086616,0.03237,...,0.191029,0.021142,0.146246,0.224245,0.139721,0.058252,0.042926,0.038358,0.062642,0.225086


# Building the Recommender Logic

In [15]:
def get_recommendations(user_id, num_recommendations=5, num_neighbors=5,
                        similarity_df=None, rating_matrix=None):
    """Recommend unseen movies to a user from the ratings of their nearest neighbours.

    Args:
        user_id: Label of the target user in the similarity and rating matrices.
        num_recommendations: Maximum number of movies to return.
        num_neighbors: How many most-similar users to aggregate over.
        similarity_df: Square user-by-user similarity DataFrame. Defaults to the
            notebook-level ``user_sim_df``.
        rating_matrix: User-by-movie rating DataFrame with NaN for unrated
            entries. Defaults to the notebook-level ``user_item_matrix``.

    Returns:
        A Series indexed by movie, sorted by descending score. The score is the
        neighbours' mean rating with unrated entries counted as 0, so it is a
        ranking score rather than a rating on the original scale.

    Raises:
        KeyError: If ``user_id`` is not present in the matrices.
    """
    # Fall back to the matrices built earlier in the notebook
    if similarity_df is None:
        similarity_df = user_sim_df
    if rating_matrix is None:
        rating_matrix = user_item_matrix

    # 1. Get the users most similar to our target user.
    # Drop the target explicitly: relying on it being first after sorting breaks
    # when another user ties with it (e.g. an identical rating vector).
    similar_users = (
        similarity_df[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(num_neighbors)
        .index
    )

    # 2. Mean rating given by the neighbours for every movie (unrated counts as 0)
    similar_users_ratings = rating_matrix.loc[similar_users].fillna(0).mean()

    # 3. Filter out movies the target user has already rated
    user_seen_movies = rating_matrix.loc[user_id].dropna().index
    recommendations = similar_users_ratings.drop(user_seen_movies)

    # 4. Return the top N movies with the highest score
    return recommendations.sort_values(ascending=False).head(num_recommendations)

# Demo: default-sized recommendation list for user 1
user_1_recommendations = get_recommendations(user_id=1)
print(f"Recommendations for User 1:\n{user_1_recommendations}")

Recommendations for User 1:
title
Beverly Hills Cop (1984)            2.0
Junior (1994)                       1.6
Shawshank Redemption, The (1994)    1.0
Fargo (1996)                        1.0
Proof (2005)                        1.0
dtype: float64


# Evaluation (Precision at K)

In [37]:
# 1. Split data into train and test
# (train_test_split is already imported in the notebook's import cell, so it
# is not re-imported here.) The split is over individual rating rows, so a
# user's ratings can land on either side.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# 2. Re-create the User-Item matrix using ONLY training data
# (This prevents 'data leakage'). Columns are movieIds; unrated entries become 0.
train_matrix = train_data.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
user_sim_train = cosine_similarity(train_matrix)
train_user_ids = train_matrix.index
user_sim_df_train = pd.DataFrame(user_sim_train, index=train_user_ids, columns=train_user_ids)

def evaluate_precision_at_k(user_id, k, threshold=4.0, num_neighbors=10,
                            train=None, test=None, similarity=None):
    """Compute Precision@k for one user's neighbour-based recommendations.

    Args:
        user_id: The user to evaluate.
        k: Number of recommendations to score; also the denominator.
        threshold: Minimum test-set rating for a movie to count as relevant.
        num_neighbors: How many most-similar training users to aggregate over.
        train: Zero-filled user-by-movieId training matrix. Defaults to the
            notebook-level ``train_matrix``.
        test: Held-out ratings with 'userId', 'movieId' and 'rating' columns.
            Defaults to the notebook-level ``test_data``.
        similarity: User-by-user similarity DataFrame computed from ``train``.
            Defaults to the notebook-level ``user_sim_df_train``.

    Returns:
        hits / k as a float, or None when the user cannot be evaluated: either
        they have no test rating >= threshold, or they have no training rows.
    """
    # Fall back to the split built earlier in the notebook
    if train is None:
        train = train_matrix
    if test is None:
        test = test_data
    if similarity is None:
        similarity = user_sim_df_train

    # Get movies the user actually liked in the TEST set (Relevant items)
    user_test_ratings = test[test['userId'] == user_id]
    relevant_items = set(
        user_test_ratings.loc[user_test_ratings['rating'] >= threshold, 'movieId']
    )

    if not relevant_items:
        return None # Skip users who have no highly rated movies in the test set

    # A random row-level split can leave a user with test rows only; such a
    # user has no neighbours to recommend from, so skip instead of raising KeyError.
    if user_id not in train.index:
        return None

    # Pick the nearest neighbours, dropping the user explicitly rather than
    # assuming they sort first (ties at similarity 1.0 would break that).
    similar_users = (
        similarity[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(num_neighbors)
        .index
    )
    sim_user_ratings = train.loc[similar_users].mean()

    # Exclude movies the user saw in the TRAIN set (non-zero entries)
    user_train_row = train.loc[user_id]
    user_train_seen = user_train_row[user_train_row > 0].index
    recommendations = sim_user_ratings.drop(user_train_seen).sort_values(ascending=False).head(k)

    # Calculate hits (intersection of recommended and relevant)
    hits = len(relevant_items.intersection(recommendations.index))

    return hits / k

# 4. Calculate average Precision@5 for a sample of users
N_EVAL_USERS = 75  # only the first 75 distinct test users are scored, for speed
precisions = []
for user in test_data['userId'].unique()[:N_EVAL_USERS]:
    p = evaluate_precision_at_k(user, k=5)
    if p is not None:  # None marks a user that could not be evaluated
        precisions.append(p)

# Guard the average: every sampled user may have been skipped
if precisions:
    print(f"Average Precision@5: {sum(precisions) / len(precisions):.4f}")
else:
    print("Average Precision@5: n/a (no sampled user could be evaluated)")

Average Precision@5: 0.3440
