In [1]:
# =====================================================================
#### Task 4: Movie Rating Prediction
# =====================================================================
### Objective:
#   - Predict how a user will rate a movie they haven't seen.
#   - Use collaborative filtering based on user similarity.
#   - Preprocess data, build model, and evaluate performance with RMSE.
### Tools Used:
#   - Python, pandas, numpy, matplotlib
#   - Scikit-learn: cosine_similarity, train_test_split, mean_squared_error

In [2]:
# Data handling and manipulation
import pandas as pd
import numpy as np

# Data plotting and visualization
import matplotlib.pyplot as plt

# Compute similarity between users
from sklearn.metrics.pairwise import cosine_similarity

# Split data and evaluate performance
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
# Load ratings data
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df = pd.read_csv('https://files.grouplens.org/datasets/movielens/ml-100k/u.data', sep='\t', names=columns)

# Show first 5 rows
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
df.shape

(100000, 4)

In [6]:
df.isnull().sum()

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

In [7]:
# Pivot table: users as rows, movies as columns
rating_matrix = df.pivot_table(index='user_id', columns='movie_id', values='rating')

# Show matrix shape
rating_matrix.shape

(943, 1682)

In [8]:
# Replace NaN with 0 temporarily
rating_matrix_filled = rating_matrix.fillna(0)

# Show sample of filled matrix
rating_matrix_filled.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Calculate cosine similarity between all users
user_similarity = cosine_similarity(rating_matrix_filled)

# Convert to DataFrame for readability
user_similarity_df = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

# Show similarity matrix shape
user_similarity_df.shape

(943, 943)

In [10]:
# Predict rating using top-k similar users
def predict_rating(user_id, movie_id, rating_matrix, similarity_matrix, k=5):
    # Skip if movie not in matrix
    if movie_id not in rating_matrix.columns:
        return np.nan

    # Get users who rated this movie
    movie_ratings = rating_matrix[movie_id]
    rated_users = movie_ratings.dropna().index

    # Get similarity scores
    sim_scores = similarity_matrix.loc[user_id, rated_users]

    # Get top-k similar users (exclude self)
    top_k_users = sim_scores.nlargest(k + 1).index[1:]

    # Return mean if no similar users
    if len(top_k_users) == 0:
        return movie_ratings.mean()

    # Weighted average prediction
    sim_weights = sim_scores.loc[top_k_users]
    ratings_from_similar = movie_ratings.loc[top_k_users]

    predicted = np.dot(sim_weights, ratings_from_similar) / sim_weights.sum()
    return predicted

In [11]:
# Pick user 1 and movie 2
sample_user = 1
sample_movie = 2

# Get actual rating (if exists)
actual_rating = rating_matrix.loc[sample_user, sample_movie]
print("Actual rating:", actual_rating)

# Get predicted rating
predicted_rating = predict_rating(sample_user, sample_movie, rating_matrix, user_similarity_df, k=5)
print("Predicted rating:", round(predicted_rating, 2))

Actual rating: 3.0
Predicted rating: 3.2


In [12]:
# Create dataset of known ratings
known_ratings = rating_matrix.stack().reset_index()
known_ratings.columns = ['user_id', 'movie_id', 'rating']

# Split into train (80%) and test (20%)
train, test = train_test_split(known_ratings, test_size=0.2, random_state=42)

# Show sizes
print("Train size:", len(train))
print("Test size:", len(test))

Train size: 80000
Test size: 20000


In [13]:
# Apply prediction function to test set
test['predicted'] = test.apply(
    lambda row: predict_rating(
        row['user_id'], row['movie_id'], rating_matrix, user_similarity_df, k=5
    ), axis=1
)

# Keep only rows with valid predictions
test_clean = test.dropna(subset=['predicted'])

# Show first 10 predictions
test_clean.head(10)

Unnamed: 0,user_id,movie_id,rating,predicted
75721,693,382,4.0,3.189542
80184,747,111,4.0,3.200499
19864,201,212,4.0,3.000962
76699,705,427,2.0,4.422423
92991,880,849,3.0,2.809
76434,701,289,4.0,3.332387
84004,787,750,5.0,3.535459
80917,751,181,5.0,3.992739
60767,537,762,3.0,3.019494
50074,447,1142,5.0,4.195789


In [14]:
# Compute Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(test_clean['rating'], test_clean['predicted']))

# Show result
print("Model RMSE:", round(rmse, 4))

Model RMSE: 1.0414


In [15]:
# Display comparison
print("\nSample Predictions vs Actual:")
print(test_clean[['user_id', 'movie_id', 'rating', 'predicted']].head(10).round(2))


Sample Predictions vs Actual:
       user_id  movie_id  rating  predicted
75721      693       382     4.0       3.19
80184      747       111     4.0       3.20
19864      201       212     4.0       3.00
76699      705       427     2.0       4.42
92991      880       849     3.0       2.81
76434      701       289     4.0       3.33
84004      787       750     5.0       3.54
80917      751       181     5.0       3.99
60767      537       762     3.0       3.02
50074      447      1142     5.0       4.20
