# SVD Model Training and Evaluation

This notebook trains a Singular Value Decomposition (SVD) model using the Surprise library on the preprocessed MovieLens 1M dataset.

**Dataset:** MovieLens 1M (preprocessed)
**Algorithm:** SVD (Singular Value Decomposition)
**Evaluation Metrics:** RMSE, MAE

---

## Setup Instructions for Google Colab

1. Upload your preprocessed files to Google Drive:
   - `train_set.csv`
   - `test_set.csv`
   - `preprocessing_metadata.json`

2. Mount your Google Drive when prompted
3. Update the `DATA_PATH` variable below to point to the folder that contains these files (keep the trailing `/`)
4. Run all cells sequentially

## 1. Install Required Libraries

In [None]:
# Install scikit-surprise library
!pip install scikit-surprise pandas numpy matplotlib seaborn

## 2. Import Libraries

In [None]:
import json
import os
import pickle
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Surprise library imports
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import accuracy

# Set random seed for reproducibility (seeds NumPy's global RNG; the SVD
# models trained directly in this notebook also pass random_state=42)
np.random.seed(42)

# Configure plotting: apply the matplotlib style sheet first, then the
# seaborn colour palette, then enable inline figure rendering
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Confirm the setup cell ran and record when it ran
print("Libraries imported successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 3. Mount Google Drive (For Colab)

In [None]:
# Resolve DATA_PATH for the environment this notebook is running in.
# On Google Colab the google.colab package is importable, so Drive is mounted
# and the data is read from there. Anywhere else the import fails and the
# local folder is used instead, so "Run All" works in both settings.
try:
    from google.colab import drive
except ImportError:
    # Running locally: update this path if your processed data lives elsewhere
    DATA_PATH = '../data/processed/'
else:
    drive.mount('/content/drive')
    # Update this path to where you uploaded your preprocessed data
    DATA_PATH = '/content/drive/MyDrive/xai-collaborative-filtering/data/processed/'

## 4. Load Preprocessed Data

In [None]:
# Load the train/test splits and the preprocessing metadata from DATA_PATH.
# os.path.join is used so DATA_PATH works with or without a trailing slash.
print("Loading preprocessed data...")

train_df = pd.read_csv(os.path.join(DATA_PATH, 'train_set.csv'))
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_set.csv'))

# Load metadata (a JSON object; its key/value pairs are listed below)
with open(os.path.join(DATA_PATH, 'preprocessing_metadata.json'), 'r') as f:
    metadata = json.load(f)

print(f"\nTrain set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"\nTrain ratings: {len(train_df):,}")
print(f"Test ratings: {len(test_df):,}")
print(f"\nMetadata:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

In [None]:
# Preview the training ratings: first ten rows, then per-column summary stats
train_preview = train_df.head(10)
train_summary = train_df.describe()

print("Train set sample:")
display(train_preview)

print("\nTrain set statistics:")
display(train_summary)

## 5. Prepare Data for Surprise Library

In [None]:
# Declare the rating scale Surprise should assume (1 to 5)
reader = Reader(rating_scale=(1, 5))

# Columns Surprise expects, in (user, item, rating) order
rating_columns = ['user_id', 'movie_id', 'rating']

print("Converting data to Surprise format...")

# Training data: wrap the DataFrame, then build a trainset from ALL of its rows
# (the train/test split was already done during preprocessing)
train_data = Dataset.load_from_df(train_df[rating_columns], reader)
trainset = train_data.build_full_trainset()

# Test data: a plain list of (user_id, movie_id, rating) tuples.
# itertuples on just the three needed columns is much faster than iterrows and
# does not coerce each row to a single common dtype, so the ids keep their
# original type instead of being upcast alongside the ratings.
testset = list(test_df[rating_columns].itertuples(index=False, name=None))

print(f"\nTrainset statistics:")
print(f"  Number of users: {trainset.n_users}")
print(f"  Number of items: {trainset.n_items}")
print(f"  Number of ratings: {trainset.n_ratings}")
print(f"  Rating scale: {trainset.rating_scale}")
print(f"\nTestset size: {len(testset):,} ratings")

## 6. Baseline Model - Global Mean

Let's establish a baseline by predicting the global mean rating for all test samples.

In [None]:
# Global-mean baseline: every test rating is predicted as the training mean
global_mean = train_df['rating'].mean()
baseline_predictions = [global_mean] * len(test_df)

# Compute the residuals once and derive both metrics from them
baseline_residuals = test_df['rating'] - baseline_predictions
baseline_rmse = np.sqrt((baseline_residuals ** 2).mean())
baseline_mae = baseline_residuals.abs().mean()

print(f"Baseline Model (Global Mean = {global_mean:.3f})")
print(f"  RMSE: {baseline_rmse:.4f}")
print(f"  MAE:  {baseline_mae:.4f}")
print("\nThis is our baseline to beat!")

## 7. Train SVD Model with Default Parameters

First, let's train an SVD model with default parameters to see how much it improves on the global-mean baseline before any tuning.

In [None]:
# Fit an SVD model with Surprise's default hyperparameters (seeded, verbose)
print("Training SVD model with default parameters...\n")
svd_default = SVD(random_state=42, verbose=True)

# Wall-clock timing around the fit only
start_time = time.time()
svd_default.fit(trainset)
training_time = time.time() - start_time

print(f"\nTraining completed in {training_time:.2f} seconds")

## 8. Evaluate Default SVD Model

In [None]:
# Score the default SVD model on the held-out test set
print("Evaluating SVD model on test set...\n")

predictions = svd_default.test(testset)

# With verbose=True each accuracy call prints its metric and returns the value
rmse = accuracy.rmse(predictions, verbose=True)
mae = accuracy.mae(predictions, verbose=True)

# Absolute and relative gains versus the global-mean baseline
rmse_gain = baseline_rmse - rmse
mae_gain = baseline_mae - mae
rmse_gain_pct = rmse_gain / baseline_rmse * 100
mae_gain_pct = mae_gain / baseline_mae * 100

banner = '=' * 50
print(f"\n{banner}")
print("SVD Model Performance (Default Parameters)")
print(f"{banner}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"Training Time: {training_time:.2f} seconds")
print(f"\nImprovement over baseline:")
print(f"  RMSE: {rmse_gain:.4f} ({rmse_gain_pct:.2f}% reduction)")
print(f"  MAE:  {mae_gain:.4f} ({mae_gain_pct:.2f}% reduction)")
print(f"{banner}")

## 9. Hyperparameter Tuning with Grid Search

Let's optimize the SVD model by tuning hyperparameters:
- `n_factors`: Number of latent factors
- `n_epochs`: Number of training iterations
- `lr_all`: Learning rate for all parameters
- `reg_all`: Regularization term for all parameters

In [None]:
# Define parameter grid.
# 'random_state' is a single-valued entry: it does not add combinations, but it
# seeds every candidate SVD the same way as the other models in this notebook
# (random_state=42). The fits run in joblib worker processes (n_jobs=-1), where
# the notebook-level np.random.seed call should not be relied on.
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1],
    'random_state': [42],
}

print("Starting Grid Search for optimal hyperparameters...")
print(f"Parameter grid: {param_grid}")
print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
print("\nThis may take several minutes...\n")

# Perform grid search with 3-fold cross-validation on the training data only;
# the test set is never seen during tuning
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,  # Use all available cores
    joblib_verbose=2
)

# Wall-clock timing of the whole search
start_time = time.time()
gs.fit(train_data)
grid_search_time = time.time() - start_time

print(f"\nGrid search completed in {grid_search_time:.2f} seconds ({grid_search_time/60:.2f} minutes)")

In [None]:
# Report the best cross-validated score and its parameters, per metric
rule = "=" * 50
print("\n" + rule)
print("Grid Search Results")
print(rule)

for metric in ('rmse', 'mae'):
    label = metric.upper()
    print(f"\nBest {label}: {gs.best_score[metric]:.4f}")
    print(f"Best parameters ({label}):")
    for param, value in gs.best_params[metric].items():
        print(f"  {param}: {value}")
print(rule)

## 10. Train Optimized SVD Model

In [None]:
# Refit on the full training set using the RMSE-optimal grid-search parameters
best_params = gs.best_params['rmse']

print("Training optimized SVD model...")
print(f"Parameters: {best_params}\n")

# Forward only the four tuned hyperparameters; seed and verbosity are fixed here
tuned_kwargs = {
    name: best_params[name]
    for name in ('n_factors', 'n_epochs', 'lr_all', 'reg_all')
}
svd_optimized = SVD(random_state=42, verbose=True, **tuned_kwargs)

# Wall-clock timing around the fit only
start_time = time.time()
svd_optimized.fit(trainset)
optimized_training_time = time.time() - start_time

print(f"\nTraining completed in {optimized_training_time:.2f} seconds")

## 11. Evaluate Optimized SVD Model

In [None]:
# Score the optimized SVD model on the held-out test set
print("Evaluating optimized SVD model on test set...\n")

predictions_optimized = svd_optimized.test(testset)

# With verbose=True each accuracy call prints its metric and returns the value
rmse_optimized = accuracy.rmse(predictions_optimized, verbose=True)
mae_optimized = accuracy.mae(predictions_optimized, verbose=True)

# Gains relative to the global-mean baseline
rmse_vs_baseline = baseline_rmse - rmse_optimized
mae_vs_baseline = baseline_mae - mae_optimized
rmse_vs_baseline_pct = rmse_vs_baseline / baseline_rmse * 100
mae_vs_baseline_pct = mae_vs_baseline / baseline_mae * 100

# Gains relative to the default-parameter SVD
rmse_vs_default = rmse - rmse_optimized
mae_vs_default = mae - mae_optimized
rmse_vs_default_pct = rmse_vs_default / rmse * 100
mae_vs_default_pct = mae_vs_default / mae * 100

banner = '=' * 50
print(f"\n{banner}")
print("SVD Model Performance (Optimized Parameters)")
print(f"{banner}")
print(f"RMSE: {rmse_optimized:.4f}")
print(f"MAE:  {mae_optimized:.4f}")
print(f"Training Time: {optimized_training_time:.2f} seconds")
print(f"\nImprovement over baseline:")
print(f"  RMSE: {rmse_vs_baseline:.4f} ({rmse_vs_baseline_pct:.2f}% reduction)")
print(f"  MAE:  {mae_vs_baseline:.4f} ({mae_vs_baseline_pct:.2f}% reduction)")
print(f"\nImprovement over default SVD:")
print(f"  RMSE: {rmse_vs_default:.4f} ({rmse_vs_default_pct:.2f}% reduction)")
print(f"  MAE:  {mae_vs_default:.4f} ({mae_vs_default_pct:.2f}% reduction)")
print(f"{banner}")

## 12. Analyze Prediction Errors

In [None]:
# Residuals of the optimized model on the test set (actual - predicted)
y_true = [p.r_ui for p in predictions_optimized]
y_pred = [p.est for p in predictions_optimized]
errors = np.array(y_true) - np.array(y_pred)
abs_errors = np.abs(errors)
mean_error = np.mean(errors)
std_error = np.std(errors)

# 2x2 figure: histogram, scatter, per-rating MAE, cumulative distribution
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_hist, ax_scatter), (ax_by_rating, ax_cdf) = axes

# Panel 1: distribution of signed errors
ax_hist.hist(errors, bins=50, edgecolor='black', alpha=0.7)
ax_hist.axvline(0, color='red', linestyle='--', linewidth=2, label='Zero Error')
ax_hist.set_xlabel('Prediction Error (Actual - Predicted)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title(f'Error Distribution (Mean: {mean_error:.4f}, Std: {std_error:.4f})')
ax_hist.legend()

# Panel 2: actual vs predicted, with the y = x reference line
ax_scatter.scatter(y_true, y_pred, alpha=0.3, s=10)
ax_scatter.plot([1, 5], [1, 5], 'r--', linewidth=2, label='Perfect Prediction')
ax_scatter.set_xlabel('Actual Rating')
ax_scatter.set_ylabel('Predicted Rating')
ax_scatter.set_title('Actual vs Predicted Ratings')
ax_scatter.legend()

# Panel 3: mean absolute error grouped by the true rating value
rating_groups = pd.DataFrame({'actual': y_true, 'error': abs_errors})
mae_by_rating = rating_groups.groupby('actual')['error'].mean()
mae_by_rating.plot(kind='bar', ax=ax_by_rating, color='steelblue')
ax_by_rating.set_xlabel('Actual Rating')
ax_by_rating.set_ylabel('Mean Absolute Error')
ax_by_rating.set_title('MAE by Actual Rating')

# Panel 4: share of predictions whose absolute error is at most x
sorted_abs_errors = np.sort(abs_errors)
n_errors = len(sorted_abs_errors)
cumulative = np.arange(1, n_errors + 1) / n_errors * 100
ax_cdf.plot(sorted_abs_errors, cumulative, linewidth=2)
ax_cdf.axvline(0.5, color='red', linestyle='--', alpha=0.5, label='0.5 error threshold')
ax_cdf.axvline(1.0, color='orange', linestyle='--', alpha=0.5, label='1.0 error threshold')
ax_cdf.set_xlabel('Absolute Error')
ax_cdf.set_ylabel('Cumulative Percentage (%)')
ax_cdf.set_title('Cumulative Error Distribution')
ax_cdf.legend()

# Same light grid on every panel
for panel in (ax_hist, ax_scatter, ax_by_rating, ax_cdf):
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary of the same residuals
print("\nError Analysis:")
print(f"  Mean Error: {mean_error:.4f}")
print(f"  Std Error: {std_error:.4f}")
print(f"  Min Error: {np.min(errors):.4f}")
print(f"  Max Error: {np.max(errors):.4f}")
print(f"  Median Absolute Error: {np.median(abs_errors):.4f}")
print(f"  % predictions within ±0.5: {(abs_errors <= 0.5).mean() * 100:.2f}%")
print(f"  % predictions within ±1.0: {(abs_errors <= 1.0).mean() * 100:.2f}%")

## 13. Model Comparison Summary

In [None]:
# One row per model: test-set metrics plus fit time (the baseline needs no fit)
comparison_df = pd.DataFrame({
    'Model': ['Baseline (Global Mean)', 'SVD (Default)', 'SVD (Optimized)'],
    'RMSE': [baseline_rmse, rmse, rmse_optimized],
    'MAE': [baseline_mae, mae, mae_optimized],
    'Training Time (s)': [0, training_time, optimized_training_time]
}).round(4)

rule = "=" * 70
print("\n" + rule)
print("MODEL COMPARISON SUMMARY")
print(rule)
display(comparison_df)

# One bar chart per metric, side by side, with identical styling
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_colors = ['gray', 'steelblue', 'darkgreen']

for ax, metric in zip(axes, ('RMSE', 'MAE')):
    ax.bar(comparison_df['Model'], comparison_df[metric], color=bar_colors)
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} Comparison')
    ax.tick_params(axis='x', rotation=15)
    ax.grid(alpha=0.3)

    # Value label just above each bar
    for i, v in enumerate(comparison_df[metric]):
        ax.text(i, v + 0.01, f'{v:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n" + rule)

## 14. Save Trained Model

In [None]:
# Persist the optimized model next to the processed data.
# os.path.join keeps this working whether or not DATA_PATH ends with a slash.
model_path = os.path.join(DATA_PATH, 'svd_optimized_model.pkl')

# NOTE: unpickling can execute arbitrary code, so only load model files that
# you created yourself.
with open(model_path, 'wb') as f:
    pickle.dump(svd_optimized, f)

print(f"Model saved to: {model_path}")

# Record hyperparameters, test-set metrics and dataset sizes with the model
model_metadata = {
    'model_type': 'SVD',
    'library': 'Surprise',
    'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'parameters': best_params,
    'performance': {
        # float(...) converts the metric values to plain floats for JSON
        'rmse': float(rmse_optimized),
        'mae': float(mae_optimized),
        'training_time': float(optimized_training_time)
    },
    'dataset': {
        'train_size': len(train_df),
        'test_size': len(test_df),
        'n_users': trainset.n_users,
        'n_items': trainset.n_items
    }
}

metadata_path = os.path.join(DATA_PATH, 'svd_model_metadata.json')
with open(metadata_path, 'w') as f:
    json.dump(model_metadata, f, indent=2)

print(f"Model metadata saved to: {metadata_path}")
print("\nModel training and evaluation complete!")

## 15. Example: Making Predictions

Let's see how to use the trained model to make predictions for specific users and movies.

In [None]:
# Pick up to 10 random test predictions to display.
# Indices are sampled rather than the list itself: each Prediction is a
# tuple-like record, so np.random.choice would turn the list into a 2-D array
# and fail with "a must be 1-dimensional".
n_samples = min(10, len(predictions_optimized))
sample_indices = np.random.choice(len(predictions_optimized), size=n_samples, replace=False)
sample_predictions = [predictions_optimized[i] for i in sample_indices]

print("Sample Predictions from Test Set:")
print("=" * 80)
print(f"{'User ID':<10} {'Movie ID':<10} {'Actual':<10} {'Predicted':<12} {'Error':<10}")
print("=" * 80)

for pred in sample_predictions:
    # Signed error, same convention as the error analysis: actual - predicted
    error = pred.r_ui - pred.est
    print(f"{pred.uid:<10} {pred.iid:<10} {pred.r_ui:<10.1f} {pred.est:<12.3f} {error:<10.3f}")

print("=" * 80)

In [None]:
# Rank candidate movies for one user by the model's predicted rating
def get_top_n_recommendations(model, user_id, n=10, items_to_predict=None):
    """
    Return the n candidate movies with the highest predicted rating for a user.

    Parameters:
    - model: Trained Surprise model (anything whose predict(user_id, item_id)
      returns an object with an `est` attribute)
    - user_id: User ID
    - n: Number of recommendations to return
    - items_to_predict: Movie IDs to score; if None, every distinct movie_id
      in the notebook-level train_df is scored

    Returns:
    - List of (movie_id, predicted_rating) tuples, highest rating first,
      truncated to at most n entries. Ties keep their candidate order.
    """
    if items_to_predict is None:
        candidates = train_df['movie_id'].unique()
    else:
        candidates = items_to_predict

    # Score every candidate, then order by estimate (stable, descending)
    scored = [
        (item_id, model.predict(user_id, item_id).est)
        for item_id in candidates
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return ranked[:n]

# Example: Get recommendations for a random user drawn from the training data
sample_user = train_df['user_id'].sample(1).values[0]
print(f"\nTop 10 movie recommendations for User {sample_user}:")
print("=" * 60)
print(f"{'Rank':<6} {'Movie ID':<12} {'Predicted Rating':<20}")
print("=" * 60)

# Get movies user hasn't rated yet.
# A set makes each membership test O(1); testing against the NumPy array
# rescans the user's whole rating list for every movie in the catalogue.
user_rated_movies = train_df[train_df['user_id'] == sample_user]['movie_id'].values
all_movies = train_df['movie_id'].unique()
rated_lookup = set(user_rated_movies)
movies_to_predict = [m for m in all_movies if m not in rated_lookup]

# NOTE: only the first 100 unrated movies are scored (kept for speed), so the
# list below is the top 10 of that subset, not of the whole catalogue.
recommendations = get_top_n_recommendations(
    svd_optimized,
    sample_user,
    n=10,
    items_to_predict=movies_to_predict[:100]  # Predict on subset for speed
)

for rank, (movie_id, rating) in enumerate(recommendations, 1):
    print(f"{rank:<6} {movie_id:<12} {rating:<20.3f}")

print("=" * 60)