In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
import joblib
import time

sns.set_style("whitegrid")
np.random.seed(42)

In [2]:
data_path = "D:/Machine Learning Projects/10. Movie Recommender/"
models_path = "D:/Machine Learning Projects/10. Movie Recommender/"

ratings = pd.read_csv(data_path + "ratings_processed.csv")
movies = pd.read_csv(data_path + "movies_processed.csv")

print("Ratings:", ratings.shape)
print("Unique users:", ratings['user_id'].nunique())
print("Unique movies:", ratings['movie_id'].nunique())

Ratings: (1000209, 4)
Unique users: 6040
Unique movies: 3706


### Map IDs to consecutive indices

In [3]:
# Create dense user and item indices
user_ids = ratings['user_id'].unique()
movie_ids = ratings['movie_id'].unique()

user_map = {uid: i for i, uid in enumerate(user_ids)}
movie_map = {mid: i for i, mid in enumerate(movie_ids)}

ratings['user_idx'] = ratings['user_id'].map(user_map)
ratings['movie_idx'] = ratings['movie_id'].map(movie_map)

n_users = len(user_ids)
n_movies = len(movie_ids)

print(f"Dense shape: {n_users} users × {n_movies} movies")

Dense shape: 6040 users × 3706 movies


### Train / Test Split

In [4]:
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42, stratify=ratings['rating'])

print("Train size:", len(train_df))
print("Test size :", len(test_df))

Train size: 800167
Test size : 200042


### FunkSVD Implementation (Biased Matrix Factorization)

In [5]:
class FunkSVD:
    def __init__(self, n_factors=40, n_epochs=30, lr=0.005, reg=0.02, verbose=True):
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg = reg
        self.verbose = verbose
        
        self.user_factors = None
        self.item_factors = None
        self.user_bias = None
        self.item_bias = None
        self.global_mean = None

    def fit(self, df_train):
        start = time.time()
        
        # Global mean
        self.global_mean = df_train['rating'].mean()
        
        # Initialize
        self.user_factors = np.random.normal(0, 0.1, (n_users, self.n_factors))
        self.item_factors = np.random.normal(0, 0.1, (n_movies, self.n_factors))
        self.user_bias = np.zeros(n_users)
        self.item_bias = np.zeros(n_movies)
        
        # Training loop (SGD)
        for epoch in range(self.n_epochs):
            epoch_start = time.time()
            total_error = 0
            
            # Shuffle training data each epoch
            df_shuf = df_train.sample(frac=1, random_state=epoch).reset_index(drop=True)
            
            for _, row in df_shuf.iterrows():
                u = row['user_idx']
                i = row['movie_idx']
                r = row['rating']
                
                # Prediction
                pred = (self.global_mean + self.user_bias[u] + self.item_bias[i] +
                        np.dot(self.user_factors[u], self.item_factors[i]))
                
                # Error
                err = r - pred
                total_error += err ** 2
                
                # Updates
                self.user_bias[u] += self.lr * (err - self.reg * self.user_bias[u])
                self.item_bias[i] += self.lr * (err - self.reg * self.item_bias[i])
                
                user_grad = err * self.item_factors[i] - self.reg * self.user_factors[u]
                item_grad = err * self.user_factors[u] - self.reg * self.item_factors[i]
                
                self.user_factors[u] += self.lr * user_grad
                self.item_factors[i] += self.lr * item_grad
            
            rmse = np.sqrt(total_error / len(df_shuf))
            if self.verbose:
                print(f"Epoch {epoch+1:2d}/{self.n_epochs} | RMSE: {rmse:.4f} | Time: {time.time()-epoch_start:.1f}s")
        
        print(f"Total training time: {time.time()-start:.1f} seconds")
        return self

    def predict(self, user_idx, movie_idx):
        return (self.global_mean + self.user_bias[user_idx] + self.item_bias[movie_idx] +
                np.dot(self.user_factors[user_idx], self.item_factors[movie_idx]))

    def predict_df(self, df):
        preds = []
        for _, row in df.iterrows():
            u = row['user_idx']
            i = row['movie_idx']
            preds.append(self.predict(u, i))
        return np.array(preds)

    def save(self, path):
        artifacts = {
            'user_factors': self.user_factors,
            'item_factors': self.item_factors,
            'user_bias': self.user_bias,
            'item_bias': self.item_bias,
            'global_mean': self.global_mean,
            'n_factors': self.n_factors
        }
        np.savez(path, **artifacts)
        print(f"Model saved to {path}")

In [6]:
model = FunkSVD(n_factors=40, n_epochs=25, lr=0.007, reg=0.02, verbose=True)
model.fit(train_df)

Epoch  1/25 | RMSE: 0.9716 | Time: 61.0s
Epoch  2/25 | RMSE: 0.9168 | Time: 58.4s
Epoch  3/25 | RMSE: 0.9043 | Time: 61.3s
Epoch  4/25 | RMSE: 0.8965 | Time: 52.3s
Epoch  5/25 | RMSE: 0.8890 | Time: 52.1s
Epoch  6/25 | RMSE: 0.8800 | Time: 50.4s
Epoch  7/25 | RMSE: 0.8691 | Time: 53.2s
Epoch  8/25 | RMSE: 0.8571 | Time: 51.7s
Epoch  9/25 | RMSE: 0.8447 | Time: 52.4s
Epoch 10/25 | RMSE: 0.8321 | Time: 53.6s
Epoch 11/25 | RMSE: 0.8193 | Time: 54.0s
Epoch 12/25 | RMSE: 0.8064 | Time: 51.2s
Epoch 13/25 | RMSE: 0.7938 | Time: 52.1s
Epoch 14/25 | RMSE: 0.7818 | Time: 53.4s
Epoch 15/25 | RMSE: 0.7703 | Time: 53.1s
Epoch 16/25 | RMSE: 0.7594 | Time: 59.7s
Epoch 17/25 | RMSE: 0.7491 | Time: 50.3s
Epoch 18/25 | RMSE: 0.7397 | Time: 54.2s
Epoch 19/25 | RMSE: 0.7309 | Time: 51.1s
Epoch 20/25 | RMSE: 0.7227 | Time: 50.8s
Epoch 21/25 | RMSE: 0.7153 | Time: 49.8s
Epoch 22/25 | RMSE: 0.7084 | Time: 50.3s
Epoch 23/25 | RMSE: 0.7020 | Time: 50.2s
Epoch 24/25 | RMSE: 0.6961 | Time: 51.5s
Epoch 25/25 | RM

<__main__.FunkSVD at 0x2b4fdaf3b60>

### Evaluate on Test Set (RMSE & MAE)

In [7]:
test_preds = model.predict_df(test_df)
test_true = test_df['rating'].values

rmse = np.sqrt(np.mean((test_preds - test_true) ** 2))
mae = np.mean(np.abs(test_preds - test_true))

print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE : {mae:.4f}")

Test RMSE: 0.8773
Test MAE : 0.6866


### Top-N Recommendation Function

In [8]:
def get_top_n(user_id, n=10):
    if user_id not in user_map:
        # Cold start → popularity fallback
        print(f"Cold start for user {user_id} → returning popular movies")
        movie_stats = ratings.groupby('movie_id').agg({'rating': ['mean', 'count']})
        movie_stats.columns = ['avg', 'cnt']
        movie_stats = movie_stats.sort_values(['avg', 'cnt'], ascending=False)
        top_ids = movie_stats.head(n).index.tolist()
        return movies[movies['movie_id'].isin(top_ids)][['movie_id', 'title', 'genres']]
    
    u_idx = user_map[user_id]
    
    preds = []
    for m_id, m_idx in movie_map.items():
        pred = model.predict(u_idx, m_idx)
        preds.append((m_id, pred))
    
    preds.sort(key=lambda x: x[1], reverse=True)
    top_ids = [mid for mid, _ in preds[:n]]
    
    return movies[movies['movie_id'].isin(top_ids)][['movie_id', 'title', 'genres']].reset_index(drop=True)

# Example
print("Top 10 for user 1:")
get_top_n(1, 10)

Top 10 for user 1:


Unnamed: 0,movie_id,title,genres
0,318,"Shawshank Redemption, The (1994)",Drama
1,356,Forrest Gump (1994),Comedy|Romance|War
2,527,Schindler's List (1993),Drama|War
3,593,"Silence of the Lambs, The (1991)",Drama|Thriller
4,904,Rear Window (1954),Mystery|Thriller
5,919,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical
6,1226,"Quiet Man, The (1952)",Comedy|Romance
7,1262,"Great Escape, The (1963)",Adventure|War
8,2762,"Sixth Sense, The (1999)",Thriller
9,2905,Sanjuro (1962),Action|Adventure


In [9]:
model.save(models_path + "funksvd_model.npz")

# Save mappings
mappings = {
    'user_map': user_map,
    'movie_map': movie_map,
    'inverse_user_map': {v: k for k, v in user_map.items()},
    'inverse_movie_map': {v: k for k, v in movie_map.items()}
}
joblib.dump(mappings, models_path + "id_mappings.pkl")

movies.to_csv(models_path + "movies_metadata.csv", index=False)

print("Model, mappings, and metadata saved.")

Model saved to D:/Machine Learning Projects/10. Movie Recommender/funksvd_model.npz
Model, mappings, and metadata saved.
