In [None]:
print("üì¶ Installing required packages...\n")

import subprocess
import sys

# Argument tails for successive `python -m pip ...` invocations, run in order.
# NumPy is removed first and then re-installed at a pinned 1.x release;
# scikit-surprise is installed with --no-deps (presumably so that it cannot
# replace the pinned versions installed just before it -- confirm).
pip_invocations = [
    ["uninstall", "-y", "numpy"],
    ["install", "-q", "numpy==1.24.3"],
    ["install", "-q", "pandas==2.0.3"],
    ["install", "-q", "scikit-learn==1.3.2"],
    ["install", "-q", "scipy==1.11.4"],
    ["install", "-q", "--no-deps", "scikit-surprise==1.1.3"],
    ["install", "-q", "matplotlib", "seaborn"],
]

# check_call raises CalledProcessError on the first failing pip command,
# so later installs never run against a half-configured environment.
for pip_args in pip_invocations:
    subprocess.check_call([sys.executable, "-m", "pip", *pip_args])

print("‚úÖ Packages installed successfully!")

# Import only after installation so the version check sees the pinned build.
import numpy as np
print(f"\nüìä NumPy version: {np.__version__}")
assert np.__version__.startswith('1.'), f"NumPy version {np.__version__} may cause issues. Expected 1.x"
print("‚úÖ NumPy compatibility verified!")

In [None]:
from google.colab import files
import zipfile
import os

# Ask the user for the dataset archive. The result is used as a mapping whose
# keys are the uploaded file names; only the first uploaded file is processed.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; please upload the dataset zip archive.")

zip_filename = next(iter(uploaded))
print(f"\nüìÇ Extracting {zip_filename}...")

with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall('data')

print("‚úÖ Dataset extracted successfully!")

# Locate the directory that contains ratings.csv anywhere under 'data'.
# The walk variables are deliberately NOT named `files`, so the
# `google.colab.files` module imported above stays usable afterwards.
dataset_path = None
for root, _dirnames, filenames in os.walk('data'):
    if 'ratings.csv' in filenames:
        dataset_path = root
        print(f"‚úÖ Found dataset at: {dataset_path}")
        break

# Fail here with a clear message instead of a NameError in a later cell.
if dataset_path is None:
    raise FileNotFoundError(
        f"'ratings.csv' was not found anywhere inside the extracted archive {zip_filename!r}"
    )

In [None]:
import pandas as pd
from surprise import Dataset, Reader

print("üìä Loading MovieLens dataset...\n")

# ratings.csv lives in the directory discovered by the extraction cell.
ratings_csv = os.path.join(dataset_path, 'ratings.csv')
ratings_df = pd.read_csv(ratings_csv)

# Quick sanity summary of what was loaded.
print(f"‚úÖ Loaded {len(ratings_df):,} ratings")
print(f"   - Users: {ratings_df['userId'].nunique():,}")
print(f"   - Movies: {ratings_df['movieId'].nunique():,}")
print(f"   - Rating range: {ratings_df['rating'].min():.1f} - {ratings_df['rating'].max():.1f}")
print(f"   - Average rating: {ratings_df['rating'].mean():.2f}")

# Surprise is handed exactly three columns, in (user, item, rating) order,
# together with a Reader declaring the 0.5-5.0 rating scale.
surprise_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[surprise_columns], reader)

print("\n‚úÖ Dataset prepared for training!")

In [None]:
from surprise import SVD
from surprise.model_selection import cross_validate
import time

print("üîç Testing hyperparameter configurations...\n")

# Candidate SVD settings. 'name' is only a display label; every other key is
# passed to the SVD constructor.
configs = [
    {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.02, 'name': 'Balanced'},
    {'n_factors': 150, 'n_epochs': 25, 'lr_all': 0.007, 'reg_all': 0.015, 'name': 'High Factors'},
    {'n_factors': 200, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.01, 'name': 'Deep Learning'},
]

best_config = None
best_rmse = float('inf')

for config in configs:
    print(f"Testing {config['name']} configuration...")

    # Strip the display label so the remaining entries map 1:1 onto SVD kwargs.
    svd_kwargs = {key: value for key, value in config.items() if key != 'name'}
    model = SVD(random_state=42, **svd_kwargs)

    # 3-fold cross-validation; the fold scores are averaged below.
    cv_results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    avg_rmse = cv_results['test_rmse'].mean()
    avg_mae = cv_results['test_mae'].mean()

    print(f"  RMSE: {avg_rmse:.4f} | MAE: {avg_mae:.4f}")

    # Keep the configuration with the strictly lowest mean RMSE seen so far.
    if avg_rmse < best_rmse:
        best_rmse, best_config = avg_rmse, config

print(f"\nüèÜ Best configuration: {best_config['name']}")
print(f"   RMSE: {best_rmse:.4f}")
print(f"   Factors: {best_config['n_factors']}, Epochs: {best_config['n_epochs']}")

In [None]:
import time

from surprise import SVD
from surprise.model_selection import train_test_split

print("üöÄ Training final model with optimized hyperparameters...\n")

# Hold out 20% of the ratings for evaluation. The full trainset is not built
# here: this cell never uses it, and the production-retraining cell builds its
# own copy right before fitting.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# `time` is imported at the top of this cell so the timing below does not
# depend on an import made by an earlier cell.
start_time = time.time()

final_model = SVD(
    n_factors=best_config['n_factors'],
    n_epochs=best_config['n_epochs'],
    lr_all=best_config['lr_all'],
    reg_all=best_config['reg_all'],
    random_state=42,
    verbose=True
)

final_model.fit(trainset)

# Wall-clock duration of the fit; reused later in the summary and metadata.
training_time = time.time() - start_time
print(f"\n‚úÖ Training completed in {training_time:.2f} seconds")

In [None]:
from surprise import accuracy

print("üìà Evaluating model on test set...\n")

# Score every held-out (user, item) pair with the model fitted on the 80% split.
predictions = final_model.test(testset)

rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)

print(f"Test Set Performance:")
print(f"  RMSE: {rmse:.4f}")
print(f"  MAE:  {mae:.4f}")

# Absolute error per prediction, computed once and bucketed three ways.
abs_errors = [abs(pred.est - pred.r_ui) for pred in predictions]
exact_matches = sum(1 for err in abs_errors if err < 0.1)
within_half_star = sum(1 for err in abs_errors if err <= 0.5)
within_one_star = sum(1 for err in abs_errors if err <= 1.0)

total = len(predictions)
print(f"\nPrediction Accuracy:")
print(f"  Exact match (¬±0.1): {exact_matches/total*100:.2f}%")
print(f"  Within ¬±0.5 stars:  {within_half_star/total*100:.2f}%")
print(f"  Within ¬±1.0 stars:  {within_one_star/total*100:.2f}%")

In [None]:
from collections import defaultdict
import numpy as np

# Progress banner for the ranking-metric stage; the helper definitions and the
# Recall@K / Precision@K loop follow in the rest of this cell.
print("üéØ Calculating Recall@K metrics...\n")

def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    Args:
        predictions: iterable of 5-item records unpacked as
            (uid, iid, true_rating, estimate, details).
        n: maximum number of (iid, estimate) pairs kept per user.

    Returns:
        A defaultdict(list) mapping each uid to its (iid, estimate) pairs,
        ordered by estimate descending. Ties keep their input order.
    """
    ranked = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        ranked[uid] += [(iid, est)]

    # Re-assigning existing keys does not resize the dict, so it is safe while
    # iterating over it.
    for uid in ranked:
        by_estimate = sorted(ranked[uid], key=lambda pair: pair[1], reverse=True)
        ranked[uid] = by_estimate[:n]

    return ranked

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return (precision@k, recall@k) averaged over all users.

    For each user the predictions are ranked by estimated rating, highest
    first. An item is "relevant" when its true rating is >= threshold and
    "recommended" when it is among the user's top k AND its estimate is
    >= threshold.

    Args:
        predictions: iterable of 5-item records unpacked as
            (uid, iid, true_rating, estimate, details).
        k: number of top-ranked items considered per user.
        threshold: rating at or above which an item counts as
            relevant / recommended.

    Returns:
        Tuple (mean_precision, mean_recall). A user with no recommended items
        contributes precision 0, and a user with no relevant items contributes
        recall 0. With no predictions at all the result is (0.0, 0.0) rather
        than a ZeroDivisionError.
    """
    user_est_true = defaultdict(list)
    for uid, _iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # No users means there is nothing to average.
    if not user_est_true:
        return 0.0, 0.0

    precisions = []
    recalls = []

    for user_ratings in user_est_true.values():
        # Stable sort on the estimate only: ties keep their input order.
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items are counted over ALL of the user's predictions.
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        # Recommended items: in the top k and estimated at or above threshold.
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        # Hits: recommended items that are also relevant.
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold)
            for est, true_r in top_k
        )

        precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0)
        recalls.append(n_rel_and_rec_k / n_rel if n_rel != 0 else 0)

    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)

# Evaluate each cutoff once, then record and report it. Only recall is stored
# because later cells read recall_metrics['Recall@K'].
scores_by_k = {
    k: precision_recall_at_k(predictions, k=k, threshold=3.5)
    for k in [1, 3, 5]
}

recall_metrics = {}
for k, (precision, recall) in scores_by_k.items():
    recall_metrics[f'Recall@{k}'] = recall
    print(f"Recall@{k}: {recall:.4f} | Precision@{k}: {precision:.4f}")

# Headline recap, largest cutoff first.
print(f"\nüèÜ Competition Metrics:")
print(f"   Recall@5: {recall_metrics['Recall@5']:.4f}")
print(f"   Recall@3: {recall_metrics['Recall@3']:.4f}")
print(f"   Recall@1: {recall_metrics['Recall@1']:.4f}")

In [None]:
print("üîÑ Retraining on full dataset for final submission...\n")

# Every rating is used for training here; no hold-out split is made.
trainset_full = data.build_full_trainset()

# Same hyperparameters as the winning configuration from the search cell.
production_params = {
    key: best_config[key]
    for key in ('n_factors', 'n_epochs', 'lr_all', 'reg_all')
}
production_model = SVD(random_state=42, verbose=True, **production_params)

production_model.fit(trainset_full)

print("\n‚úÖ Production model trained on full dataset!")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print("üìä Creating performance visualizations...\n")

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (15, 10)

# 2x2 dashboard: error histogram | actual-vs-predicted scatter
#                recall@K bars   | text summary
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
error_ax, scatter_ax = axes[0, 0], axes[0, 1]
recall_ax, summary_ax = axes[1, 0], axes[1, 1]
title_style = {'fontsize': 14, 'fontweight': 'bold'}

# --- Panel 1: distribution of signed prediction errors --------------------
errors = [pred.est - pred.r_ui for pred in predictions]
error_ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
error_ax.set_title('Prediction Error Distribution', **title_style)
error_ax.set_xlabel('Error (Predicted - Actual)')
error_ax.set_ylabel('Frequency')
error_ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Perfect Prediction')
error_ax.legend()

# --- Panel 2: actual vs predicted for the first 1000 predictions ----------
scatter_sample = predictions[:1000]
actual = [pred.r_ui for pred in scatter_sample]
predicted = [pred.est for pred in scatter_sample]
scatter_ax.scatter(actual, predicted, alpha=0.3, s=10)
scatter_ax.plot([0, 5], [0, 5], 'r--', linewidth=2, label='Perfect Prediction')
scatter_ax.set_title('Actual vs Predicted Ratings', **title_style)
scatter_ax.set_xlabel('Actual Rating')
scatter_ax.set_ylabel('Predicted Rating')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# --- Panel 3: Recall@K bars, each annotated with its value ----------------
k_values = [1, 3, 5]
recall_values = [recall_metrics[f'Recall@{k}'] for k in k_values]
bars = recall_ax.bar([str(k) for k in k_values], recall_values, color=['#FF6B6B', '#4ECDC4', '#45B7D1'], edgecolor='black')
recall_ax.set_title('Recall@K Performance', **title_style)
recall_ax.set_xlabel('K')
recall_ax.set_ylabel('Recall Score')
recall_ax.set_ylim([0, 1])
for bar, value in zip(bars, recall_values):
    # Centre the label horizontally on the bar and sit it on the bar's top edge.
    label_x = bar.get_x() + bar.get_width()/2.
    recall_ax.text(label_x, bar.get_height(),
                   f'{value:.4f}', ha='center', va='bottom', fontweight='bold')

# --- Panel 4: plain-text summary (axes hidden) -----------------------------
summary_ax.axis('off')
summary_text = f"""
Model Performance Summary
{'='*40}

Configuration: {best_config['name']}
  ‚Ä¢ Factors: {best_config['n_factors']}
  ‚Ä¢ Epochs: {best_config['n_epochs']}
  ‚Ä¢ Learning Rate: {best_config['lr_all']}
  ‚Ä¢ Regularization: {best_config['reg_all']}

Error Metrics:
  ‚Ä¢ RMSE: {rmse:.4f}
  ‚Ä¢ MAE: {mae:.4f}

Competition Metrics:
  ‚Ä¢ Recall@5: {recall_metrics['Recall@5']:.4f}
  ‚Ä¢ Recall@3: {recall_metrics['Recall@3']:.4f}
  ‚Ä¢ Recall@1: {recall_metrics['Recall@1']:.4f}

Accuracy:
  ‚Ä¢ Within ¬±0.5 stars: {within_half_star/total*100:.1f}%
  ‚Ä¢ Within ¬±1.0 stars: {within_one_star/total*100:.1f}%

Training Time: {training_time:.2f}s
"""
summary_ax.text(0.1, 0.5, summary_text, fontsize=11, family='monospace',
                verticalalignment='center')

# Save before show() so the written file contains the fully laid-out figure.
plt.tight_layout()
plt.savefig('model_performance.png', dpi=300, bbox_inches='tight')
print("‚úÖ Visualization saved as 'model_performance.png'")
plt.show()

In [None]:
import pickle
import json
import os

print("üíæ Saving trained model...\n")

# Hyperparameters of the winning configuration (display label excluded).
hyperparameters = {
    key: best_config[key]
    for key in ('n_factors', 'n_epochs', 'lr_all', 'reg_all')
}

# Metrics are wrapped in float() before being written to JSON below.
# NOTE: these were measured with the model fitted on the 80% split, while the
# pickled estimator is the one refitted on all ratings.
performance = {
    'rmse': float(rmse),
    'mae': float(mae),
    'recall_at_5': float(recall_metrics['Recall@5']),
    'recall_at_3': float(recall_metrics['Recall@3']),
    'recall_at_1': float(recall_metrics['Recall@1']),
    'accuracy_within_0.5': float(within_half_star/total),
    'accuracy_within_1.0': float(within_one_star/total),
}

metadata = {
    'model_type': 'SVD Collaborative Filtering',
    'library': 'scikit-surprise 1.1.3',
    'training_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'dataset': 'MovieLens latest-small',
    'n_ratings': len(ratings_df),
    'n_users': ratings_df['userId'].nunique(),
    'n_movies': ratings_df['movieId'].nunique(),
    'hyperparameters': hyperparameters,
    'performance': performance,
    'training_time_seconds': float(training_time),
}

# The pickle bundles the estimator, the trainset it was fitted on, and the
# metadata; the metadata is also written on its own as JSON.
model_package = {
    'svd_model': production_model,
    'trainset': trainset_full,
    'metadata': metadata,
}

with open('recommendation_model.pkl', 'wb') as model_file:
    pickle.dump(model_package, model_file, protocol=pickle.HIGHEST_PROTOCOL)

with open('model_metadata.json', 'w') as metadata_file:
    json.dump(model_package['metadata'], metadata_file, indent=2)

print("‚úÖ Model saved successfully!")
print("\nSaved files:")
print("  - recommendation_model.pkl (trained model)")
print("  - model_metadata.json (performance metrics)")
print("  - model_performance.png (visualizations)")

# Report the on-disk size of the pickle in mebibytes.
bytes_per_mb = 1024 * 1024
model_size = os.path.getsize('recommendation_model.pkl') / bytes_per_mb
print(f"\nModel size: {model_size:.2f} MB")

In [None]:
from google.colab import files

print("üì• Downloading files to your computer...\n")

# (progress message, file name) for each artifact, in download order.
downloads = [
    ("Downloading recommendation_model.pkl...", 'recommendation_model.pkl'),
    ("Downloading model_metadata.json...", 'model_metadata.json'),
    ("Downloading model_performance.png...", 'model_performance.png'),
]
for message, artifact in downloads:
    print(message)
    files.download(artifact)

banner = "="*60

print("\n" + banner)
print("üéâ TRAINING COMPLETE! üéâ")
print(banner)

# Hand-off instructions for using the artifacts outside the notebook.
next_steps = [
    "\nNext Steps:",
    "1. Move 'recommendation_model.pkl' to your project's 'models/' folder",
    "2. Test inference: python inference.py --test_data_path sample_test_phase_1",
    "3. Verify output files are generated in 'output/' folder",
    "4. Include 'model_performance.png' in your technical report",
    "5. Use metrics from 'model_metadata.json' for your report",
]
for step in next_steps:
    print(step)

# Final metric recap, framed by banners.
print("\n" + banner)
print(f"Model Performance (for your report):")
print(banner)
print(f"Recall@5: {recall_metrics['Recall@5']:.4f}")
print(f"Recall@3: {recall_metrics['Recall@3']:.4f}")
print(f"Recall@1: {recall_metrics['Recall@1']:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(banner)