# Model Experiments

In this notebook, we will experiment with different machine learning models and their configurations using the Australian Student Performance dataset.

In [1]:
# Import necessary libraries
import sys
import os

sys.path.append(os.path.abspath('..'))

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from src.data.loader import load_data
from src.data.cleaning import clean_data, one_hot_encode
from src.data.balancing import balance_dataset
from src.data.split import split_data
from src.models.model_factory import create_model
from src.training.trainer import Trainer
from src.evaluation.evaluate import evaluate_model


In [2]:
# Load and clean the data
data = load_data('../data/raw/Australian_Student_PerformanceData (ASPD24).csv')
cleaned_data = clean_data(data)
performance = cleaned_data['Performance']
cleaned_data = cleaned_data.drop('Performance', axis=1)
cleaned_data = one_hot_encode(cleaned_data)
cleaned_data['Performance'] = performance
print(cleaned_data.columns)

Index(['Student ID', 'University ID', 'Age', 'Year of Study', 'GPA',
       'High School GPA', 'Entrance Exam Score', 'Attendance Rate',
       'Participation in Extracurricular Activities', 'Part-time Job',
       'Hours of Study per Week', 'Family Income',
       'Distance from Home to University', 'Internet Access at Home',
       'Library Usage', 'Access to Academic Resources', 'Scholarship',
       'Financial Aid', 'Tutor Support', 'Counseling Services',
       'Hours of Sleep per Night', 'Exercise Frequency', 'Peer Support',
       'Use of Online Learning Platforms', 'Project/Assignment Scores',
       'Midterm Exam Scores', 'Final Exam Scores',
       'Attendance at Office Hours', 'Group Work Participation',
       'Research Involvement', 'Internship Experience', 'Peer Reviews',
       'Academic Advising', 'Core Course Average',
       'Extracurricular Participation', 'Peer Evaluations',
       'University Name_University B', 'University Name_University C',
       'Gender_M', 'M

In [3]:
# Balance and split the data
balanced_data = balance_dataset(cleaned_data, target_column='Performance')
train_data, val_data, test_data = split_data(balanced_data)


Original dataset shape: Counter({'Satisfactory': 23996, 'Needs Improvement': 20260, 'Good': 16050, 'Poor': 11967, 'Excellent': 8017})
Balanced dataset shape: Counter({'Satisfactory': 23996, 'Needs Improvement': 23996, 'Good': 23996, 'Excellent': 23996, 'Poor': 23996})


In [None]:
# Experiment with different models
models = ['logistic_regression', 'decision_tree', 'random_forest', 'svm']
results = {}

for model_name in models:
    model = create_model(model_name)
    X_train = train_data.drop('Performance', axis=1)
    y_train = train_data['Performance']
    model.fit(X_train, y_train)
    val_predictions = model.predict(val_data.drop('Performance', axis=1))
    accuracy = accuracy_score(val_data['Performance'], val_predictions)
    results[model_name] = accuracy
    print(f'{model_name} Accuracy: {accuracy}')


logistic_regression Accuracy: 0.20032387121356449
decision_tree Accuracy: 0.2803867403314917
random_forest Accuracy: 0.3836445037149933


In [None]:
# Evaluate the best model on the test set
best_model_name = max(results, key=results.get)
best_model = create_model(best_model_name)
train_model(best_model, train_data)
test_predictions = best_model.predict(test_data.drop('target', axis=1))
test_report = classification_report(test_data['target'], test_predictions)
print(f'Best Model: {best_model_name}')
print(test_report)
