# Model Experiments

In this notebook, we will experiment with different machine learning models and their configurations using the Australian Student Performance dataset.

In [35]:
# Import necessary libraries
import sys
import os

sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from src.data.loader import load_data
from src.data.cleaning import clean_data, one_hot_encode
from src.data.balancing import balance_dataset
from src.data.split import split_data
from src.models.model_factory import create_model
from src.training.trainer import Trainer
from src.evaluation.evaluate import evaluate_model
from scripts.run_tuning import get_param_grid
from src.training.hyperparameter_tuning import HyperparameterTuner
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline


In [36]:
# Load and clean the data
data = load_data('../data/raw/Australian_Student_PerformanceData (ASPD24).csv')
cleaned_data = clean_data(data)

# Fix column names with leading/trailing spaces
cleaned_data.columns = cleaned_data.columns.str.strip()


# Balance and split the data
#balanced_data = balance_dataset(cleaned_data, target_column='Performance') # Note -> we dont really need balancing here as the differences in target value counts are not that large

# split data
train_data, val_data, test_data = split_data(cleaned_data, test_size=0.2, val_size=0.1, random_state=42) 

# remove imperfections
for df in [train_data, val_data, test_data]:
    df.columns = df.columns.str.strip()

# encoding according to actual feature types
drop_columns = ["Student ID", "University ID"] # those are not needed (noise)

numeric_features = [
    "Age",
    "Year of Study",
    "GPA",
    "High School GPA",
    "Entrance Exam Score",
    "Attendance Rate",
    "Participation in Extracurricular Activities",
    "Part-time Job",
    "Hours of Study per Week",
    "Family Income",
    "Distance from Home to University",
    "Internet Access at Home",
    "Library Usage",
    "Access to Academic Resources",
    "Scholarship",
    "Financial Aid",
    "Tutor Support",
    "Counseling Services",
    "Hours of Sleep per Night",
    "Exercise Frequency",
    "Peer Support",
    "Use of Online Learning Platforms",
    "Project/Assignment Scores",
    "Midterm Exam Scores",
    "Final Exam Scores",
    "Attendance at Office Hours",
    "Group Work Participation",
    "Research Involvement",
    "Internship Experience",
    "Peer Reviews",
    "Academic Advising",
    "Core Course Average",
    "Extracurricular Participation",
    "Peer Evaluations"
]

ordinal_features = [
    "Health Condition",
    "Mental Health Status",
    "Diet Quality",
    "Social Integration",
    "Language Proficiency",
    "Study Environment",
    "Class Participation"
]

ordinal_categories = [
    ["Poor", "Fair", "Good", "Excellent"],  # Health Condition
    ["Poor", "Fair", "Good", "Excellent"],  # Mental Health Status
    ["Poor", "Fair", "Good", "Excellent"],  # Diet Quality
    ["Poor", "Fair", "Good", "Excellent"],  # Social Integration
    ["Poor", "Fair", "Good", "Excellent"],  # Language Proficiency
    ["Poor", "Fair", "Good", "Excellent"],  # Study Environment
    ["Poor", "Fair", "Good", "Excellent"]   # Class Participation
]

nominal_features = [
    "University Name",
    "Gender",
    "Major",
    "Parental Education Level",
    "Accommodation Type",
    "Transportation Mode",
    "Learning Style"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("ord", OrdinalEncoder(categories=ordinal_categories), ordinal_features),
        ("nom", OneHotEncoder(handle_unknown="ignore", drop="first"), nominal_features),
    ],
    remainder="drop"
)

In [37]:

X_train = train_data.drop(columns=["Performance"])
y_train = train_data["Performance"]
X_val = val_data.drop(columns=["Performance"])
y_val = val_data["Performance"]


In [None]:
# Experiment with different models

models = ["logistic_regression", "decision_tree", "random_forest", "svm"]
results = {}

X_train = train_data.drop(columns=["Performance"])
y_train = train_data["Performance"]
X_val = val_data.drop(columns=["Performance"])
y_val = val_data["Performance"]

for model_name in models:
    print(f"\n Tuning {model_name}...")

    config = get_param_grid(model_name)

    # Build full pipeline
    model_pipeline = Pipeline([
        ("preprocess", preprocessor),
        ("clf", config["model"])
    ])

    tuner = HyperparameterTuner(
        model=model_pipeline,
        param_grid={f"clf__{k}": v for k, v in config["params"].items()},
        scoring="accuracy",
        cv=3,
        n_iter=20,
        random_state=42
    )

    best_model, best_params, best_score = tuner.tune(X_train, y_train, method="random")

    print("Best Params:", best_params)
    print("Best CV Score:", best_score)

    val_preds = best_model.predict(X_val)
    val_acc = accuracy_score(y_val, val_preds)

    print(f"Validation Accuracy ({model_name}):", val_acc)

    results[model_name] = (best_model, val_acc)



 Tuning logistic_regression...




In [9]:
# Evaluate the best model on the test set
best_model_name = max(results, key=results.get)
best_model = create_model(best_model_name)
train_model(best_model, train_data)
test_predictions = best_model.predict(test_data.drop('target', axis=1))
test_report = classification_report(test_data['target'], test_predictions)
print(f'Best Model: {best_model_name}')
print(test_report)


NameError: name 'train_model' is not defined