In [1]:
"""
Student Exam Score Predictor - Advanced Model Training Pipeline
This script loads student data, preprocesses it, performs hyperparameter 
tuning across multiple advanced regression models (XGBoost & LightGBM),
and saves the highest-performing pipeline for deployment.
"""

import sys
import joblib
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

In [2]:
# ==========================================
# 1. DATA LOADING
# ==========================================
# Safely load the dataset (Ensure 'train.csv' is in the same directory)
try:
    # Dropping the 'id' column as it holds no predictive value
    train_df = pd.read_csv('train.csv').drop(columns=['id'])
except FileNotFoundError:
    print("Error: 'train.csv' file not found. Please ensure the file is in the same directory.")
    sys.exit(1)

# Define features (X) and target (y)
X = train_df.drop(columns=['exam_score'])
y = train_df['exam_score']

In [3]:
# ==========================================
# 2. FEATURE ENGINEERING & PREPROCESSING
# ==========================================
# Split the data into training and testing sets before applying transformations
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up dynamic preprocessing steps based on column data types
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), make_column_selector(dtype_include=['object']))
    ]
)

In [4]:
# ==========================================
# 3. MODEL CONFIGURATION & PIPELINES
# ==========================================
# Define pipelines for the models to be evaluated
pipelines = {
    'LightGBM': Pipeline(steps=[('preprocessor', preprocessor),('model', LGBMRegressor(random_state=42, verbose=-1))]),
    'XGBoost': Pipeline(steps=[('preprocessor', preprocessor),('model', XGBRegressor(random_state=42))])
}

# Define the hyperparameter grid (Shared across both XGBoost and LightGBM)
param_grid = {
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__n_estimators': [100, 300]
}

In [5]:
# ==========================================
# 4. HYPERPARAMETER TUNING & EVALUATION
# ==========================================
best_mae = float('inf')
best_model_name = None
best_model_pipeline = None

print("Starting Grid Search for the best model and hyperparameters...\n")

for name, pipeline in pipelines.items():
    
    # Initialize GridSearchCV for the current pipeline
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=3,
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )
    
    # Fit the grid search to the training data
    grid_search.fit(X_train, y_train)
    
    # Generate predictions using the best estimator found
    y_pred = grid_search.best_estimator_.predict(X_test)
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(y_test, y_pred)
    
    # Output the results for the current model
    print(f"{name} - Best Parameters: {grid_search.best_params_}")
    print(f"{name} - Test MAE: {mae:.4f}\n")

    # Keep track of the highest performing model
    if mae < best_mae:
        best_mae = mae
        best_model_name = name
        best_model_pipeline = grid_search.best_estimator_

print(f"üèÜ Winning Model: {best_model_name} (MAE: {best_mae:.4f})")

Starting Grid Search for the best model and hyperparameters...

LightGBM - Best Parameters: {'model__learning_rate': 0.2, 'model__max_depth': 5, 'model__n_estimators': 300}
LightGBM - Test MAE: 6.9853

XGBoost - Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 300}
XGBoost - Test MAE: 6.9879

üèÜ Winning Model: LightGBM (MAE: 6.9853)


In [6]:
# ==========================================
# 5. EXPORTING THE DEPLOYMENT PIPELINE
# ==========================================
print(f"\nRetraining the winning {best_model_name} model on the entire dataset to maximize accuracy...")
best_model_pipeline.fit(X, y)

# Serialize and save the final pipeline
model_filename = 'student_exam_predict_pipeline.joblib'
joblib.dump(best_model_pipeline, model_filename)

print(f"Complete! The pipeline has been saved as '{model_filename}'.")


Retraining the winning LightGBM model on the entire dataset to maximize accuracy...
Complete! The pipeline has been saved as 'student_exam_predict_pipeline.joblib'.
