## Collaborative filtering

**Key Steps:**
  1. Split the dataset into training (80%) and testing (20%) sets.
  2. Train the model using only the training data.
  3. Use the model to predict ratings for movies in the test set.
  4. Calculate errors between predicted and actual ratings.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
import os

script_dir = os.getcwd() 

print(f"Current working directory: {script_dir}")

# Load ratings data
ratings_file = os.path.join(script_dir, "Cleaned Datasets", "Audience_Ratings.csv")
df_ratings = pd.read_csv(ratings_file)

Current working directory: c:\Users\willi\Documents\GitHub\Movie-Recommendation-System


In [None]:
print(df_ratings.columns)

print("Dataset shape:", df_ratings.shape)

# Drop NA values if any
df_ratings.dropna(inplace=True)

# Check unique users and movies
print(f"Unique users: {df_ratings['userId'].nunique()}")
print(f"Unique movies: {df_ratings['imdbId'].nunique()}")

# Filter users or movies with very few interactions
min_user_ratings = 5
min_movie_ratings = 5

user_counts = df_ratings['userId'].value_counts()
movie_counts = df_ratings['imdbId'].value_counts()

df = df_ratings[df_ratings['userId'].isin(user_counts[user_counts >= min_user_ratings].index)]
df = df[df['imdbId'].isin(movie_counts[movie_counts >= min_movie_ratings].index)]


Index(['userId', 'imdbId', 'rating'], dtype='str')
Dataset shape: (33832162, 3)
Unique users: 330975
Unique movies: 83239


From above we can see that the total rating 100,836 with 610 unique users and 9,724 unique movies. 

In [3]:
# Mapping for userId and imdbId to index-based values
user_ids = df['userId'].unique()
movie_ids = df['imdbId'].unique()

user2idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie2idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

df['user_idx'] = df['userId'].map(user2idx)
df['movie_idx'] = df['imdbId'].map(movie2idx)


In [4]:
n_users = len(user2idx)
n_movies = len(movie2idx)
n_factors = 20  # Number of latent features

# Initialize user and movie matrices
np.random.seed(42)
P = np.random.normal(scale=0.1, size=(n_users, n_factors))  # User latent matrix
Q = np.random.normal(scale=0.1, size=(n_movies, n_factors))  # Movie latent matrix

# Bias terms
user_bias = np.zeros(n_users)
movie_bias = np.zeros(n_movies)
global_bias = df['rating'].mean()


In [5]:
def train_svd(df, P, Q, user_bias, movie_bias, global_bias, n_factors, epochs=20, lr=0.01, reg=0.1):
    for epoch in range(epochs):
        for row in df.itertuples():
            u = row.user_idx
            m = row.movie_idx
            rating = row.rating

            pred = global_bias + user_bias[u] + movie_bias[m] + np.dot(P[u], Q[m])
            error = rating - pred

            # Update biases
            user_bias[u] += lr * (error - reg * user_bias[u])
            movie_bias[m] += lr * (error - reg * movie_bias[m])

            # Update latent factors
            P[u] += lr * (error * Q[m] - reg * P[u])
            Q[m] += lr * (error * P[u] - reg * Q[m])
        
        # Optional: evaluate performance after each epoch
        preds = predict_all(df, P, Q, user_bias, movie_bias, global_bias)
        rmse = sqrt(mean_squared_error(df['rating'], preds))
        print(f"Epoch {epoch+1}/{epochs}, RMSE: {rmse:.4f}")
    
    return P, Q, user_bias, movie_bias


In [6]:
def predict_all(df, P, Q, user_bias, movie_bias, global_bias):
    preds = []
    for row in df.itertuples():
        u = row.user_idx
        m = row.movie_idx
        pred = global_bias + user_bias[u] + movie_bias[m] + np.dot(P[u], Q[m])
        preds.append(pred)
    return np.array(preds)


In [7]:
P, Q, user_bias, movie_bias = train_svd(df, P, Q, user_bias, movie_bias, global_bias, n_factors=20, epochs=20, lr=0.01, reg=0.1)


Epoch 1/20, RMSE: 0.1762
Epoch 2/20, RMSE: 0.1745
Epoch 3/20, RMSE: 0.1738
Epoch 4/20, RMSE: 0.1735
Epoch 5/20, RMSE: 0.1732
Epoch 6/20, RMSE: 0.1731
Epoch 7/20, RMSE: 0.1729
Epoch 8/20, RMSE: 0.1729
Epoch 9/20, RMSE: 0.1728
Epoch 10/20, RMSE: 0.1727
Epoch 11/20, RMSE: 0.1727
Epoch 12/20, RMSE: 0.1727
Epoch 13/20, RMSE: 0.1726
Epoch 14/20, RMSE: 0.1726
Epoch 15/20, RMSE: 0.1726
Epoch 16/20, RMSE: 0.1726
Epoch 17/20, RMSE: 0.1726
Epoch 18/20, RMSE: 0.1725
Epoch 19/20, RMSE: 0.1725
Epoch 20/20, RMSE: 0.1725


In [8]:
def predict_rating(user_id, movie_id):
    u = user2idx.get(user_id)
    m = movie2idx.get(movie_id)
    if u is None or m is None:
        return global_bias  # Fallback to global average
    pred = global_bias + user_bias[u] + movie_bias[m] + np.dot(P[u], Q[m])
    return pred

# Example:
predict_rating(1, 1)


np.float64(0.6574510486051969)