# Student Grade Prediction - Model Training

This notebook demonstrates the complete machine learning pipeline for predicting
student academic performance using the UCI Student Performance dataset.

## Pipeline Steps
1. Data Loading
2. Feature Engineering
3. Data Preprocessing
4. Model Training and Comparison
5. Best Model Analysis
6. Saving the Model and Artifacts
7. Summary

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path for imports
sys.path.append('..')

# Import project modules
from src.config import DataConfig, ModelConfig
from src.data_loader import StudentDataLoader
from src.feature_engineer import StudentFeatureEngineer
from src.preprocessor import StudentPreprocessor
from src.model_trainer import GradeModelTrainer
from src.model_evaluator import GradeModelEvaluator

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load Data

In [None]:
# Load the student performance dataset (semicolon-delimited CSV)
data_path = Path('../data/student_mat.csv')

# Fail fast: every later cell needs `df`, so a missing file must stop the run
# here with a clear message instead of surfacing as a NameError further down.
if not data_path.exists():
    raise FileNotFoundError(
        f"Dataset not found at {data_path.resolve()}. Please download from: "
        "https://archive.ics.uci.edu/ml/datasets/Student+Performance"
    )

df = pd.read_csv(data_path, sep=';')
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nFeatures: {list(df.columns)}")

In [None]:
# Preview the first five rows of the raw dataset
df.head(n=5)

## 2. Feature Engineering

In [None]:
# Run the project's feature engineer on a copy so the raw frame stays untouched
engineer = StudentFeatureEngineer()
df_engineered = engineer.create_all_features(df.copy())

# Columns present after engineering that the raw frame did not have
new_features = [name for name in df_engineered.columns if name not in df.columns]

print(f"Original features: {df.shape[1]}")
print(f"After engineering: {df_engineered.shape[1]}")
print("\nNew features created:")
for feature_name in new_features:
    print(f"  - {feature_name}")

In [None]:
# Preview only the newly created columns
df_engineered.loc[:, new_features].head()

## 3. Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns with object dtype are the ones that need integer codes
categorical_cols = df_engineered.select_dtypes(include=['object']).columns
print(f"Categorical columns to encode: {list(categorical_cols)}")

# Encode on a copy; keep each fitted encoder keyed by its column name
df_processed = df_engineered.copy()
encoders = {}

for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df_processed[col] = encoders[col].fit_transform(df_processed[col])

print("\nCategorical encoding complete!")

In [None]:
# Split the encoded frame into the prediction target and the feature matrix
target_column = 'G3'
y = df_processed[target_column]
X = df_processed.drop(target_column, axis='columns')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nTarget statistics:")
print(y.describe())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Standardise the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling complete!")
print(f"Scaled training mean: {X_train_scaled.mean():.4f}")
print(f"Scaled training std: {X_train_scaled.std():.4f}")

## 4. Model Training

In [None]:
# Estimators and the cross-validation helper used by the training cells
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# The three tree ensembles share the same size and seed
ensemble_kwargs = dict(n_estimators=100, random_state=42)

# Candidate regressors, keyed by the display name used in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(**ensemble_kwargs),
    'Gradient Boosting': GradientBoostingRegressor(**ensemble_kwargs),
    'XGBoost': XGBRegressor(verbosity=0, **ensemble_kwargs),
}

print(f"Training {len(models)} models...")

In [None]:
# Train and evaluate all models
#
# Per model: 5-fold cross-validation on the training split, a refit on the
# full training split, then a single evaluation on the held-out test split.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Cross-validate on the unscaled training data with the scaler inside the
    # pipeline, so each fold fits its own scaler on that fold's training rows
    # only. X_train_scaled was scaled with statistics from the whole training
    # split, so using it here would leak validation-fold information.
    # cross_val_score clones the pipeline, so `model` itself stays unfitted.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train,
                                cv=5, scoring='neg_mean_squared_error')
    # Scores are negated MSE values: flip the sign of their mean, take the root.
    cv_rmse = np.sqrt(-cv_scores.mean())

    # Train on full training set
    model.fit(X_train_scaled, y_train)

    # Predict on test set
    y_pred = model.predict(X_test_scaled)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'Model': name,
        'CV RMSE': cv_rmse,
        'Test RMSE': rmse,
        'Test MAE': mae,
        'Test R²': r2
    })

    print(f"  CV RMSE: {cv_rmse:.4f}")
    print(f"  Test RMSE: {rmse:.4f}")
    print(f"  Test R²: {r2:.4f}")

# Rank by cross-validated RMSE, not test RMSE: later cells treat the first row
# as the best model, and choosing the winner by its test score would make the
# reported test metrics optimistically biased.
results_df = pd.DataFrame(results).sort_values('CV RMSE')
print("\n" + "="*60)
print("MODEL COMPARISON RESULTS")
print("="*60)
print(results_df.to_string(index=False))

In [None]:
# Side-by-side horizontal bar charts of the two test metrics
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (metric column, bar colour, panel title) for each panel, left to right;
# the metric column name doubles as the x-axis label
panels = [
    ('Test RMSE', 'steelblue', 'Model Comparison - RMSE (lower is better)'),
    ('Test R²', 'green', 'Model Comparison - R² Score (higher is better)'),
]

for ax, (metric, colour, title) in zip(axes, panels):
    ax.barh(results_df['Model'], results_df[metric], color=colour, alpha=0.7)
    ax.set_xlabel(metric)
    ax.set_title(title)
    # Put the first row of results_df at the top of the chart
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 5. Best Model Analysis

In [None]:
# results_df is already sorted, so its first row describes the winning model
best_row = results_df.iloc[0]
best_model_name = best_row['Model']
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Test RMSE: {best_row['Test RMSE']:.4f}")
print(f"Test R²: {best_row['Test R²']:.4f}")

In [None]:
# Score the held-out set with the selected model
y_pred_best = best_model.predict(X_test_scaled)
residuals = y_test - y_pred_best

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_scatter, ax_residual = axes

# Left panel: predictions against actual grades, with the y = x reference line
ax_scatter.scatter(y_test, y_pred_best, alpha=0.5)
ax_scatter.plot([0, 20], [0, 20], 'r--', label='Perfect Prediction')
ax_scatter.set(
    xlabel='Actual Grade',
    ylabel='Predicted Grade',
    title=f'{best_model_name}: Actual vs Predicted',
    xlim=(0, 20),
    ylim=(0, 20),
)
ax_scatter.legend()

# Right panel: residuals (actual minus predicted) against the prediction
ax_residual.scatter(y_pred_best, residuals, alpha=0.5)
ax_residual.axhline(y=0, color='r', linestyle='--')
ax_residual.set(
    xlabel='Predicted Grade',
    ylabel='Residuals',
    title='Residual Plot',
)

plt.tight_layout()
plt.show()

In [None]:
# Only estimators exposing `feature_importances_` get this chart; for any
# other model the cell does nothing
if hasattr(best_model, 'feature_importances_'):
    importance_df = pd.DataFrame(
        {'Feature': X.columns, 'Importance': best_model.feature_importances_}
    ).sort_values('Importance', ascending=False)

    # Chart the 15 highest-ranked features, largest at the top
    top_features = importance_df.head(15)
    plt.figure(figsize=(10, 8))
    plt.barh(top_features['Feature'], top_features['Importance'], color='steelblue')
    plt.xlabel('Feature Importance')
    plt.title(f'{best_model_name} - Top 15 Feature Importance')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10).to_string(index=False))

## 6. Save Model and Artifacts

In [None]:
import joblib
from datetime import datetime

# Create models directory (parents=True so a missing parent does not abort the save)
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save best model
model_path = models_dir / 'best_model.joblib'
joblib.dump(best_model, model_path)
print(f"Model saved to: {model_path}")

# Save scaler
scaler_path = models_dir / 'scaler.joblib'
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")

# Save the fitted label encoders too: the model was trained on label-encoded
# categorical columns, so new raw data cannot be prepared for prediction
# without the exact category-to-integer mappings fitted in this notebook.
encoders_path = models_dir / 'encoders.joblib'
joblib.dump(encoders, encoders_path)
print(f"Encoders saved to: {encoders_path}")

# Save metrics (taken from the first row of results_df, i.e. the selected model)
best_row = results_df.iloc[0]
metrics = {
    'model_name': best_model_name,
    'rmse': best_row['Test RMSE'],
    'mae': best_row['Test MAE'],
    'r2': best_row['Test R²'],
    'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'n_features': X.shape[1],
    'n_samples': len(df),  # rows in the full dataset, not only the training split
    'feature_names': list(X.columns)
}

# Importances exist only for estimators that expose them (e.g. tree ensembles)
if hasattr(best_model, 'feature_importances_'):
    metrics['feature_importance'] = best_model.feature_importances_.tolist()

metrics_path = models_dir / 'metrics.joblib'
joblib.dump(metrics, metrics_path)
print(f"Metrics saved to: {metrics_path}")

print("\nAll artifacts saved successfully!")

## 7. Summary

In [None]:
# Assemble the whole report first, then emit it in one go
best_row = results_df.iloc[0]
divider = "=" * 60

summary_lines = [
    divider,
    "TRAINING SUMMARY",
    divider,
    "\nDataset: UCI Student Performance (Mathematics)",
    f"Total Samples: {len(df)}",
    f"Features Used: {X.shape[1]}",
    f"\nBest Model: {best_model_name}",
    "\nPerformance Metrics:",
    f"  - RMSE: {best_row['Test RMSE']:.4f}",
    f"  - MAE: {best_row['Test MAE']:.4f}",
    f"  - R²: {best_row['Test R²']:.4f}",
    "\nSaved Artifacts:",
    "  - Model: models/best_model.joblib",
    "  - Scaler: models/scaler.joblib",
    "  - Metrics: models/metrics.joblib",
    "\nNext Steps:",
    "  - Run the API: uvicorn api:app --reload",
    "  - Run the dashboard: streamlit run dashboard.py",
]

print("\n".join(summary_lines))