In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import urllib.request
import zipfile
import os


In [3]:
# Step 2: Download and extract MovieLens 100K dataset if not already present
# The archive is fetched over HTTPS (not plain HTTP) because its contents are
# extracted straight into the working directory.
dataset_url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
dataset_path = "ml-100k.zip"

# The extracted "ml-100k" directory doubles as the "already downloaded" marker.
if not os.path.exists("ml-100k"):
    print("📦 Downloading MovieLens dataset...")
    urllib.request.urlretrieve(dataset_url, dataset_path)
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall(".")
    print("✅ Extraction completed!")
else:
    print("✅ Dataset already available.")


✅ Dataset already available.


In [4]:
# Step 3: Read the tab-separated ratings file; the timestamp column is
# discarded immediately because nothing downstream uses it.
df = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"],
).drop(columns="timestamp")

# Show the first 5 rows as the cell output
df.head()


Unnamed: 0,userId,movieId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
# Step 4: Map raw userId / movieId values to contiguous 0-based indices.
# The fitted encoders are kept so indices can be translated back to raw IDs.
user_enc, item_enc = LabelEncoder(), LabelEncoder()

# Store the encoded indices alongside the raw IDs
df["user"], df["item"] = (
    user_enc.fit_transform(df["userId"]),
    item_enc.fit_transform(df["movieId"]),
)


In [6]:
# Step 5: Create the dense user-item rating matrix
# Rows are encoded users, columns are encoded items; a 0 means "not rated".
n_users = df['user'].nunique()
n_items = df['item'].nunique()

# Initialize a matrix of zeros with shape (n_users, n_items)
rating_matrix = np.zeros((n_users, n_items))

# Fill in all known ratings with a single vectorized assignment instead of a
# Python-level loop over every row of df.
rating_matrix[df['user'].to_numpy(), df['item'].to_numpy()] = df['rating'].to_numpy()


In [7]:
# Step 6: Factorize the rating matrix with Truncated SVD
# Unrated cells are stored as 0 in rating_matrix and are fed to the SVD as-is.
svd = TruncatedSVD(random_state=42, n_components=20)  # 20 latent features (tunable)

# Learn the latent features and project every user row onto them
matrix_reduced = svd.fit_transform(rating_matrix)

# Multiply back by the learned components to get an approximate rating
# for every (user, item) pair
reconstructed_matrix = matrix_reduced @ svd.components_


In [8]:
# Step 7: Measure reconstruction error as RMSE (Root Mean Squared Error)
# Restricted to cells holding a real rating (> 0); these are the same ratings
# the SVD was fit on, so this is not a held-out score.
mask = rating_matrix > 0
mse = mean_squared_error(rating_matrix[mask], reconstructed_matrix[mask])
rmse = np.sqrt(mse)
print("🎯 RMSE (on known ratings):", round(rmse, 4))


🎯 RMSE (on known ratings): 2.1329


In [9]:
# Step 8: Define a function to recommend top N movies to a given user
def recommend_movies(user_id_original, top_n=5):
    """Return the original IDs of the best-scoring movies the user has not rated.

    Reads the notebook-level ``user_enc``, ``item_enc``, ``rating_matrix`` and
    ``reconstructed_matrix``; none of them is modified.

    Args:
        user_id_original: User ID as it appears in the raw ``userId`` column.
        top_n: Maximum number of movies to return.

    Returns:
        Original movie IDs ordered from highest to lowest predicted score.
        Fewer than ``top_n`` IDs are returned when the user has fewer unrated
        movies than that.

    Raises:
        Propagates the error from ``user_enc.transform`` when the ID was not
        seen during fitting (scikit-learn's LabelEncoder raises ValueError).
    """
    # Convert original user ID to encoded row index
    user_index = user_enc.transform([user_id_original])[0]

    # Copy the row: indexing a NumPy array returns a view, so masking it
    # directly would write -inf into reconstructed_matrix itself and corrupt
    # every later use of it (e.g. the RMSE computation).
    scores = reconstructed_matrix[user_index].copy()

    # Movies the user has already rated (non-zero cells) must never win
    already_rated = rating_matrix[user_index] != 0
    scores[already_rated] = -np.inf

    # Rank all movies from best to worst predicted score
    ranked = np.argsort(scores)[::-1]

    # Drop rated movies outright so they cannot leak into the result when
    # top_n is larger than the number of unrated movies.
    ranked = ranked[~already_rated[ranked]]
    top_items = ranked[:top_n]

    # Convert encoded movie indices back to original movie IDs
    return item_enc.inverse_transform(top_items)


In [10]:
# Step 9: Try the recommender on one sample user (raw userId 196)
user_id = 196
recommendations = recommend_movies(user_id_original=user_id, top_n=5)
print(f"🎬 Top 5 movie recommendations for User {user_id}:", recommendations)


🎬 Top 5 movie recommendations for User 196: [100 216 208  88 732]


In [11]:
# Step 10 (Optional): Translate the recommended movie IDs into titles
# u.item is pipe-separated with no header row; only its first two columns
# (ID and title) are read.
movie_titles = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],
    names=["movieId", "title"],
)

# Lookup table: movieId → title
title_map = movie_titles.set_index("movieId")["title"].to_dict()

# Resolve each recommended ID to its title, keeping the ranking order
movie_titles_list = list(map(title_map.__getitem__, recommendations))
print(f"🍿 Recommended Movie Titles for User {user_id}:\n", movie_titles_list)


🍿 Recommended Movie Titles for User 196:
 ['Fargo (1996)', 'When Harry Met Sally... (1989)', 'Young Frankenstein (1974)', 'Sleepless in Seattle (1993)', 'Dave (1993)']
