# Movie Recommendation Model Training

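This notebook trains a TruncatedSVD-based movie recommendation model on the MovieLens "latest-small" dataset, selects the number of latent components by 5-fold cross-validated reconstruction error, and saves the fitted model together with a movie-to-movie cosine similarity matrix.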

In [6]:
import io
import zipfile
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import requests
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
def load_movielens_data(timeout=30):
    """Download the MovieLens "latest-small" archive and load its CSV tables.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP server before giving up. Without an
        explicit timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(movies_df, ratings_df)`` read from ``movies.csv`` and
        ``ratings.csv`` inside the archive.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to ZipFile.
    response.raise_for_status()

    # Context managers close the archive and member handles deterministically.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        with archive.open('ml-latest-small/movies.csv') as movies_file:
            movies_df = pd.read_csv(movies_file)
        with archive.open('ml-latest-small/ratings.csv') as ratings_file:
            ratings_df = pd.read_csv(ratings_file)
    return movies_df, ratings_df

movies_df, ratings_df = load_movielens_data()

# Reshape the long ratings table into a wide matrix: one row per user,
# one column per movie. Pairs without a rating are filled with 0.
user_movie_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

print(f"Matrix shape: {user_movie_matrix.shape}")

Matrix shape: (610, 9724)


In [8]:
def compute_reconstruction_error(pipeline, X):
    """Mean squared error between ``X`` and its round trip through ``pipeline``.

    Parameters
    ----------
    pipeline : object
        Fitted transformer exposing ``transform`` and ``inverse_transform``.
    X : array-like or pandas.DataFrame of shape (n_samples, n_features)
        Data to project and reconstruct.

    Returns
    -------
    float
        Mean of the squared element-wise differences over every cell of ``X``.
    """
    X_transformed = pipeline.transform(X)
    X_reconstructed = pipeline.inverse_transform(X_transformed)
    # Work on plain arrays: np.mean on a DataFrame delegates to DataFrame.mean,
    # which on older pandas releases (pre-2.0) reduces per column and returns
    # a Series instead of a single number.
    residual = np.asarray(X, dtype=float) - np.asarray(X_reconstructed, dtype=float)
    return float(np.mean(np.square(residual)))

# Candidate latent dimensionalities and the fold splitter shared by all of them
n_components_list = [50, 100, 150]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Running record of the best candidate seen so far
best_error = float('inf')
best_n_components = None

for n_components in n_components_list:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svd', TruncatedSVD(n_components=n_components, random_state=42)),
    ])

    # Refit on each training fold, then score the round-trip error on the
    # held-out users of that fold.
    errors = [
        compute_reconstruction_error(
            pipeline.fit(user_movie_matrix.iloc[train_idx]),
            user_movie_matrix.iloc[val_idx],
        )
        for train_idx, val_idx in kf.split(user_movie_matrix)
    ]

    mean_error = np.mean(errors)
    print(f"n_components: {n_components}, Mean Reconstruction Error: {mean_error}")

    # Strict comparison: on a tie the earlier candidate is kept
    if mean_error < best_error:
        best_error, best_n_components = mean_error, n_components

print(f"\nBest n_components: {best_n_components}")
print(f"Best reconstruction error: {best_error}")

n_components: 50, Mean Reconstruction Error: 0.16367975623311198
n_components: 100, Mean Reconstruction Error: 0.1564271300953075
n_components: 150, Mean Reconstruction Error: 0.1522642454458111

Best n_components: 150
Best reconstruction error: 0.1522642454458111


In [10]:
# Refit the selected configuration and compute the movie-movie similarity matrix.
# The model is rebuilt from `best_n_components` (chosen by the cross-validation
# loop) rather than read from a `grid_search` object, which no cell in this
# notebook defines and which therefore fails on a fresh kernel.
assert best_n_components is not None, "Cross-validation did not select n_components"

best_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svd', TruncatedSVD(n_components=best_n_components, random_state=42))
])

# Fit on the transposed matrix (movies as rows, users as columns) so that
# transform() yields one latent vector per movie. Note that the
# cross-validation loop scored candidates on user rows, not movie rows.
movie_user_matrix = user_movie_matrix.T
best_model.fit(movie_user_matrix)
movie_features = best_model.transform(movie_user_matrix)
movie_similarity = cosine_similarity(movie_features)

# Save model and similarity matrix
model_data = {
    'model': best_model,
    'similarity_matrix': pd.DataFrame(
        movie_similarity,
        index=user_movie_matrix.columns,
        columns=user_movie_matrix.columns
    ),
    # NOTE(review): key name assumes the pipeline-style "step__param"
    # convention; confirm against whatever consumes this file.
    'best_params': {'svd__n_components': best_n_components},
    # Mean cross-validated reconstruction MSE, lower is better.
    # NOTE(review): confirm consumers do not expect a higher-is-better score.
    'best_score': best_error
}

# Create the target directory so the save also works on a fresh checkout
model_path = Path('../models/best_model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model_data, model_path)
print("Model saved successfully!")

Model saved successfully!
