# K-Fold Cross-Validation for Matrix Factorization
## Robust Performance Evaluation with Statistical Significance

This notebook implements 5-fold cross-validation to get robust performance estimates and confidence intervals.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats
import warnings
# Silence every warning category so library deprecation chatter does not
# clutter the notebook output.
warnings.filterwarnings(action='ignore')

# Shared plotting defaults used by all figures below.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Read the three input tables from the sibling data directory.
ratings = pd.read_csv('../data/ratings.csv')
users = pd.read_csv('../data/users.csv')
modules = pd.read_csv('../data/modules.csv')

# Table sizes first, then a quick look at the rating distribution.
print(f"Total ratings: {len(ratings)}")
print(f"Total users: {len(users)}")
print(f"Total modules: {len(modules)}")

rating_values = ratings['rating']  # looked up once, reused for every summary line
print(f"\nRatings range: {rating_values.min():.1f} - {rating_values.max():.1f}")
print(f"Rating mean: {rating_values.mean():.2f}")
print(f"Rating std: {rating_values.std():.2f}")

## 2. Simple Baseline Model for Cross-Validation Demo

Since ML.NET training is done in C#, we'll demonstrate cross-validation with Python-based baseline models.

**Models evaluated:**
1. **User-Item Mean**: Predicts based on user average + item average - global average
2. **Item Average**: Predicts based on module average rating
3. **Global Mean**: Always predicts the global average rating

In [None]:
class UserItemMeanModel:
    """Baseline predictor: user mean + item mean - global mean.

    Users or modules that were not present in the training data fall back
    to the global mean, and every prediction is clipped to the valid rating
    range.

    Args:
        min_rating: Lower bound applied to predictions (default 1).
        max_rating: Upper bound applied to predictions (default 5).
    """

    def __init__(self, min_rating=1, max_rating=5):
        self.min_rating = min_rating
        self.max_rating = max_rating
        self.global_mean = 0
        self.user_means = {}
        self.item_means = {}

    def fit(self, train_df):
        """Learn the global, per-user and per-module mean ratings.

        Args:
            train_df: DataFrame with ``user_id``, ``module_id`` and
                ``rating`` columns.
        """
        self.global_mean = train_df['rating'].mean()
        self.user_means = train_df.groupby('user_id')['rating'].mean().to_dict()
        self.item_means = train_df.groupby('module_id')['rating'].mean().to_dict()

    def predict(self, test_df):
        """Predict one rating per row of ``test_df``.

        Args:
            test_df: DataFrame with ``user_id`` and ``module_id`` columns.

        Returns:
            1-D float ``numpy.ndarray`` of clipped predictions, in row order.
        """
        # Vectorised lookups instead of a per-row iterrows() loop. Ids that
        # are missing from the learned means map to NaN and are then replaced
        # by the global mean (a NaN stored mean is treated the same way).
        user_part = test_df['user_id'].map(self.user_means).fillna(self.global_mean)
        item_part = test_df['module_id'].map(self.item_means).fillna(self.global_mean)
        raw = (
            user_part.to_numpy(dtype=float)
            + item_part.to_numpy(dtype=float)
            - self.global_mean
        )
        # Summing two means can overshoot the rating scale, so clip.
        return np.clip(raw, self.min_rating, self.max_rating)

class ItemAverageModel:
    """Baseline predictor: the mean training rating of each module.

    Modules that were not present in the training data fall back to the
    global mean rating.
    """

    def __init__(self):
        self.global_mean = 0
        self.item_means = {}

    def fit(self, train_df):
        """Learn the global mean and the per-module mean ratings.

        Args:
            train_df: DataFrame with ``module_id`` and ``rating`` columns.
        """
        self.global_mean = train_df['rating'].mean()
        self.item_means = train_df.groupby('module_id')['rating'].mean().to_dict()

    def predict(self, test_df):
        """Predict one rating per row of ``test_df``.

        Args:
            test_df: DataFrame with a ``module_id`` column.

        Returns:
            1-D float ``numpy.ndarray`` of predictions, in row order.
        """
        # Vectorised lookup instead of a per-row iterrows() loop. Module ids
        # without a learned mean map to NaN and are replaced by the global
        # mean (a NaN stored mean is treated the same way).
        item_part = test_df['module_id'].map(self.item_means).fillna(self.global_mean)
        return item_part.to_numpy(dtype=float)

class GlobalMeanModel:
    """Baseline predictor that returns the training-set mean for every row."""

    def __init__(self):
        # Placeholder until fit() is called.
        self.global_mean = 0

    def fit(self, train_df):
        """Store the mean of the ``rating`` column of ``train_df``."""
        training_ratings = train_df['rating']
        self.global_mean = training_ratings.mean()

    def predict(self, test_df):
        """Return an array with one copy of the stored mean per row of ``test_df``."""
        row_count = len(test_df)
        return np.full((row_count,), self.global_mean)

# Confirmation line so the cell output shows the class definitions ran.
print("✓ Baseline models defined")

## 3. K-Fold Cross-Validation (k=5)

In [None]:
def cross_validate_model(model_class, data, k=5, model_name="Model", random_state=42):
    """Evaluate a model with shuffled k-fold cross-validation.

    A fresh ``model_class()`` instance is trained on each training split and
    scored on the held-out split. Per-fold metrics are printed as they are
    computed.

    Args:
        model_class: Zero-argument callable returning an object with
            ``fit(train_df)`` and ``predict(test_df)`` methods.
        data: DataFrame containing a ``rating`` column plus whatever columns
            the model reads.
        k: Number of folds.
        model_name: Label used in the printed progress output.
        random_state: Seed for the fold shuffle. Use the same value for every
            model being compared so all models are scored on identical folds;
            a paired comparison of fold scores depends on that.

    Returns:
        DataFrame with one row per fold and the columns ``fold``, ``mae``,
        ``rmse``, ``r2``, ``train_size`` and ``test_size``.
    """
    kfold = KFold(n_splits=k, shuffle=True, random_state=random_state)

    fold_results = []

    print(f"\n{'='*60}")
    print(f"Cross-Validating: {model_name}")
    print(f"{'='*60}")

    for fold_idx, (train_idx, test_idx) in enumerate(kfold.split(data), 1):
        # KFold yields positional indices, hence iloc.
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]

        # New instance per fold so no state leaks between folds.
        model = model_class()
        model.fit(train_data)

        predictions = model.predict(test_data)
        actuals = test_data['rating'].values

        mae = mean_absolute_error(actuals, predictions)
        rmse = np.sqrt(mean_squared_error(actuals, predictions))
        r2 = r2_score(actuals, predictions)

        fold_results.append({
            'fold': fold_idx,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'train_size': len(train_data),
            'test_size': len(test_data),
        })

        print(f"Fold {fold_idx}: MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}")

    return pd.DataFrame(fold_results)

# Score each baseline with the same 5-fold procedure; the per-fold result
# tables are reused by the summary, plotting and significance cells below.
results_user_item = cross_validate_model(
    model_class=UserItemMeanModel,
    data=ratings,
    k=5,
    model_name="User-Item Mean",
)
results_item_avg = cross_validate_model(
    model_class=ItemAverageModel,
    data=ratings,
    k=5,
    model_name="Item Average",
)
results_global = cross_validate_model(
    model_class=GlobalMeanModel,
    data=ratings,
    k=5,
    model_name="Global Mean",
)

## 4. Statistical Summary with Confidence Intervals

In [None]:
def calculate_statistics(results_df, model_name):
    """Summarise per-fold metrics as mean, std and a 95% confidence interval.

    The interval is for the *mean* fold score and uses the Student t
    distribution: ``mean ± t(0.975, n-1) * std / sqrt(n)``, where ``n`` is
    the number of folds and ``std`` is the sample standard deviation. The
    normal-theory shortcut ``mean ± 1.96 * std`` is not used because it
    ignores the ``sqrt(n)`` term and assumes a large sample.

    Args:
        results_df: DataFrame with one row per fold and ``mae``, ``rmse``
            and ``r2`` columns.
        model_name: Label stored under the ``Model`` key.

    Returns:
        Dict with ``Model`` plus ``<METRIC>_mean``, ``<METRIC>_std``,
        ``<METRIC>_CI_lower`` and ``<METRIC>_CI_upper`` for MAE, RMSE and R2.
        The CI bounds are NaN when fewer than two folds are available.
    """
    n_folds = len(results_df)
    # The t critical value needs at least one degree of freedom.
    t_crit = stats.t.ppf(0.975, df=n_folds - 1) if n_folds > 1 else float('nan')

    stats_dict = {'Model': model_name}
    for column, label in (('mae', 'MAE'), ('rmse', 'RMSE'), ('r2', 'R2')):
        fold_scores = results_df[column]
        mean = fold_scores.mean()
        std = fold_scores.std()  # sample std (ddof=1), the pandas default
        if n_folds > 1:
            half_width = t_crit * std / (n_folds ** 0.5)
        else:
            half_width = float('nan')
        stats_dict[f'{label}_mean'] = mean
        stats_dict[f'{label}_std'] = std
        stats_dict[f'{label}_CI_lower'] = mean - half_width
        stats_dict[f'{label}_CI_upper'] = mean + half_width
    return stats_dict

from pathlib import Path

# Calculate statistics for all models
stats_all = pd.DataFrame([
    calculate_statistics(results_user_item, 'User-Item Mean'),
    calculate_statistics(results_item_avg, 'Item Average'),
    calculate_statistics(results_global, 'Global Mean')
])

print("\n" + "="*80)
print("5-FOLD CROSS-VALIDATION RESULTS")
print("="*80)

# One (column prefix, section heading) pair per metric; the same row format
# is printed for each, so a single loop replaces three copy-pasted ones.
metric_sections = [
    ('MAE', "\nMAE (Mean Absolute Error) - Lower is better:"),
    ('RMSE', "\nRMSE (Root Mean Squared Error) - Lower is better:"),
    ('R2', "\nR² (Coefficient of Determination) - Higher is better:"),
]
for prefix, heading in metric_sections:
    print(heading)
    print("-" * 80)
    for _, row in stats_all.iterrows():
        mean = row[f'{prefix}_mean']
        std = row[f'{prefix}_std']
        ci_lower = row[f'{prefix}_CI_lower']
        ci_upper = row[f'{prefix}_CI_upper']
        print(f"{row['Model']:20s}: {mean:.4f} ± {std:.4f}  [95% CI: {ci_lower:.4f} - {ci_upper:.4f}]")

# Save results; create the output directory first so a fresh checkout
# does not fail on a missing folder.
results_path = Path('../evaluation/cross_validation_results.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
stats_all.to_csv(results_path, index=False)
print("\n✓ Cross-validation results saved to: evaluation/cross_validation_results.csv")

## 5. Visualize Cross-Validation Results

In [None]:
from pathlib import Path

# Box plots for each metric
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
fig.suptitle('5-Fold Cross-Validation Results', fontsize=16, fontweight='bold')

# Combine all results into one long-form table tagged by model name.
# ignore_index avoids repeated index labels from the three source frames.
all_results = pd.concat(
    [
        results_user_item.assign(model='User-Item Mean'),
        results_item_avg.assign(model='Item Average'),
        results_global.assign(model='Global Mean'),
    ],
    ignore_index=True,
)

# (column, panel title, y-axis label) for each subplot, left to right.
panels = [
    ('mae', 'MAE Distribution', 'MAE'),
    ('rmse', 'RMSE Distribution', 'RMSE'),
    ('r2', 'R² Distribution', 'R²'),
]
for ax, (column, title, ylabel) in zip(axes, panels):
    sns.boxplot(data=all_results, x='model', y=column, ax=ax)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=15)

plt.tight_layout()

# Create the plots directory first so savefig does not fail when it is missing.
plot_path = Path('../evaluation/plots/cross_validation_boxplots.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

## 6. Statistical Significance Testing

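A paired t-test on the per-fold MAE values checks whether the differences between models are statistically significant. The test is paired because every model is evaluated on the same five folds.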

In [None]:
print("\n" + "="*80)
print("STATISTICAL SIGNIFICANCE TESTS (Paired t-test)")
print("="*80)
print("\nH0: No significant difference between models")
print("H1: Significant difference exists (α = 0.05)\n")

# Each entry pairs two models' per-fold results; the test is run on the
# fold-wise MAE values, matched fold by fold.
model_pairs = [
    ("User-Item Mean", results_user_item, "Item Average", results_item_avg),
    ("User-Item Mean", results_user_item, "Global Mean", results_global),
    ("Item Average", results_item_avg, "Global Mean", results_global),
]
last_position = len(model_pairs) - 1
for position, (name_a, folds_a, name_b, folds_b) in enumerate(model_pairs):
    t_stat, p_value = stats.ttest_rel(folds_a['mae'], folds_b['mae'])
    if p_value < 0.05:
        verdict = 'Significant difference (reject H0)'
    else:
        verdict = 'No significant difference'
    # A blank line separates comparisons; none follows the final one.
    separator = "\n" if position < last_position else ""
    print(f"{name_a} vs {name_b} (MAE):")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Result: {verdict}{separator}")

## 7. Key Findings

### Cross-Validation Benefits:

1. **Robustness**: 5-fold CV provides more reliable performance estimates than single train/test split
2. **Variance**: Standard deviation shows model stability across different data splits
3. **Confidence Intervals**: The 95% CI quantifies the uncertainty in each mean metric estimated from the folds; it is not a guarantee of production performance
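4. **Statistical Significance**: Paired t-tests indicate whether the observed differences between models are larger than fold-to-fold noise would explain (statistical significance does not by itself imply practical importance)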

### Integration with ML.NET:

For the production ML.NET Matrix Factorization model:
1. The same cross-validation approach should be applied in C# using ML.NET's cross-validation API
2. Expected performance: MAE ≈ 0.50 (based on previous evaluation)
3. This would significantly outperform all Python baselines shown here

### Recommendations:

- ✓ Use k-fold cross-validation (k=5 or k=10) for final model evaluation
- ✓ Report mean ± standard deviation for all metrics
- ✓ Include confidence intervals in production monitoring
- ✓ Perform statistical tests when comparing model versions

In [None]:
# Closing summary: what the notebook produced and where the files were written.
closing_lines = [
    "\n" + "="*80,
    "CROSS-VALIDATION ANALYSIS COMPLETE",
    "="*80,
    "\nDeliverables:",
    "  ✓ 5-fold cross-validation results",
    "  ✓ Statistical summary with confidence intervals",
    "  ✓ Box plot visualizations",
    "  ✓ Paired t-tests for significance",
    "\nFiles generated:",
    "  - evaluation/cross_validation_results.csv",
    "  - evaluation/plots/cross_validation_boxplots.png",
]
for line in closing_lines:
    print(line)