In [2]:
# ============================================================
# STUDENT PERFORMANCE PREDICTION â€“ REAL WORLD PIPELINE
# ============================================================

# -----------------------------
# 1. IMPORTS
# -----------------------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings("ignore")


# -----------------------------
# 2. LOAD CLEAN DATA
# -----------------------------
df = pd.read_csv("../data/stud_clean.csv")

print("Data loaded:", df.shape)
display(df.head())


# -----------------------------
# 3. DEFINE REALISTIC FEATURES
# -----------------------------
# Based on EDA insights:
# - Gender
# - Race / Ethnicity
# - Parental education
# - Lunch type (proxy for SES)
# - Test preparation (behavioural factor)

TARGET = "average"

FEATURES = [
    "gender",
    "race_ethnicity",
    "parental_level_of_education",
    "lunch",
    "test_preparation_course"
]

X = df[FEATURES]
y = df[TARGET]

print("\nFeatures:", FEATURES)
print("X shape:", X.shape)
print("y shape:", y.shape)


# -----------------------------
# 4. TRAIN / TEST SPLIT (REALISTIC)
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


# -----------------------------
# 5. PREPROCESSING
# -----------------------------
categorical_features = FEATURES

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


# -----------------------------
# 6. MODELS (REAL-WORLD SET)
# -----------------------------
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "ElasticNet": ElasticNet(alpha=0.01, l1_ratio=0.5),
    
    "RandomForest": RandomForestRegressor(
        n_estimators=400,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    ),
    
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )
}


# -----------------------------
# 7. CROSS-VALIDATION SETUP
# -----------------------------
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


# -----------------------------
# 8. TRAIN, VALIDATE, TEST
# -----------------------------
results = []

print("\n================= MODEL COMPARISON =================\n")

for name, model in models.items():
    
    pipeline = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])
    
    # Cross-validation (R2)
    cv_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring="r2"
    )
    
    # Train final model
    pipeline.fit(X_train, y_train)
    
    # Test set evaluation
    y_pred = pipeline.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    results.append({
        "Model": name,
        "CV_R2_Mean": cv_scores.mean(),
        "CV_R2_Std": cv_scores.std(),
        "Test_R2": r2,
        "MAE": mae,
        "RMSE": rmse
    })
    
    print(name)
    print("  CV R2 Mean :", round(cv_scores.mean(), 4))
    print("  Test R2   :", round(r2, 4))
    print("  MAE       :", round(mae, 2))
    print("  RMSE      :", round(rmse, 2))
    print("-" * 45)


# -----------------------------
# 9. RESULTS SUMMARY
# -----------------------------
results_df = (
    pd.DataFrame(results)
    .sort_values(by="Test_R2", ascending=False)
)

print("\n================= FINAL RANKING =================\n")
display(results_df)


# -----------------------------
# 10. BEST MODEL SELECTION
# -----------------------------
best_model = results_df.iloc[0]

print("\n================= BEST MODEL =================")
print("Model       :", best_model["Model"])
print("Test R2     :", round(best_model["Test_R2"], 4))
print("MAE         :", round(best_model["MAE"], 2))
print("RMSE        :", round(best_model["RMSE"], 2))
print("=============================================")


# -----------------------------
# 11. REAL-WORLD INTERPRETATION
# -----------------------------
print("""
INTERPRETATION:
- This model predicts overall student performance using background factors only.
- Lunch type and test preparation act as strong proxies for socioeconomic and behavioural support.
- Model can be used for early-risk identification BEFORE exams.
- Suitable for policy intervention, not grading.
""")



Data loaded: (1000, 10)


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333



Features: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
X shape: (1000, 5)
y shape: (1000,)


LinearRegression
  CV R2 Mean : 0.228
  Test R2   : 0.1622
  MAE       : 10.49
  RMSE      : 13.4
---------------------------------------------
Ridge
  CV R2 Mean : 0.2282
  Test R2   : 0.1619
  MAE       : 10.49
  RMSE      : 13.4
---------------------------------------------
Lasso
  CV R2 Mean : 0.228
  Test R2   : 0.1609
  MAE       : 10.5
  RMSE      : 13.41
---------------------------------------------
ElasticNet
  CV R2 Mean : 0.2286
  Test R2   : 0.1608
  MAE       : 10.51
  RMSE      : 13.41
---------------------------------------------
RandomForest
  CV R2 Mean : 0.0323
  Test R2   : -0.0083
  MAE       : 11.39
  RMSE      : 14.7
---------------------------------------------
GradientBoosting
  CV R2 Mean : 0.1381
  Test R2   : 0.0798
  MAE       : 10.87
  RMSE      : 14.04
---------------------------------------------




Unnamed: 0,Model,CV_R2_Mean,CV_R2_Std,Test_R2,MAE,RMSE
0,LinearRegression,0.227977,0.037002,0.162172,10.490183,13.401581
1,Ridge,0.228199,0.036916,0.161946,10.492945,13.403383
2,Lasso,0.22803,0.037022,0.160895,10.502743,13.411788
3,ElasticNet,0.22857,0.036775,0.160757,10.50533,13.412892
5,GradientBoosting,0.138121,0.050623,0.07981,10.868091,14.044857
4,RandomForest,0.032341,0.076698,-0.008336,11.39382,14.702157



Model       : LinearRegression
Test R2     : 0.1622
MAE         : 10.49
RMSE        : 13.4

INTERPRETATION:
- This model predicts overall student performance using background factors only.
- Lunch type and test preparation act as strong proxies for socioeconomic and behavioural support.
- Model can be used for early-risk identification BEFORE exams.
- Suitable for policy intervention, not grading.

