In [7]:
# Import Libraries and display settings
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import joblib

# Show every column when a wide DataFrame is rendered (None = no column limit).
pd.options.display.max_columns = None

In [None]:


# Load datasets
# Read the two course CSVs and stack them row-wise into `students`, the frame
# the later cells in this notebook operate on.
mat_path = "../data/student_mat.csv"
por_path = "../data/student_por.csv"

student_mat = pd.read_csv(mat_path)
student_por = pd.read_csv(por_path)

display(f"Math dataset shape: {student_mat.shape}")
display(f"Portuguese dataset shape: {student_por.shape}")

# pd.concat aligns on column names and fills columns missing from one input
# with NaN, so a schema difference between the two files would otherwise pass
# silently and surface later as unexpected missing values. Fail loudly here.
mismatched_cols = set(student_mat.columns) ^ set(student_por.columns)
if mismatched_cols:
    raise ValueError(
        "Math and Portuguese datasets have different columns: "
        f"{sorted(map(str, mismatched_cols))}"
    )

# NOTE(review): rows are simply appended; if the same student can appear in
# both files they will be present twice in `students` -- confirm this is intended.
students = pd.concat([student_mat, student_por], ignore_index=True)
display(f"Combined dataset shape: {students.shape}")
students.head()


'Math dataset shape: (395, 33)'

'Portuguese dataset shape: (649, 33)'

'Combined dataset shape: (1044, 33)'

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


In [None]:
# Feature engineering
# NOTE(review): everything below reads G1/G2/G3 and absences as loaded; the
# encoding/scaling cell later overwrites those columns with standardized
# values, so this cell must run before it (rerun from the load cell if unsure).

# Derived grade features: mean of the three period grades, change from the
# first to the final grade, and a 0/1 flag for a final grade of at least 10.
period_grades = students[['G1', 'G2', 'G3']]
students['avg_grade'] = period_grades.mean(axis=1)
students['grade_trend'] = students['G3'].sub(students['G1'])
students['passed_final'] = students['G3'].ge(10).astype(int)

# Rule-based dropout label: 1 when ANY of the three conditions holds.
grade_thresh = 10      # final grade strictly below this
absences_thresh = 10   # absences at or above this
decline_thresh = -3    # grade_trend (G3 - G1) at or below this

low_final_grade = students['G3'].lt(grade_thresh)
many_absences = students['absences'].ge(absences_thresh)
steep_decline = students['grade_trend'].le(decline_thresh)
students['dropout_label_rule'] = (low_final_grade | many_absences | steep_decline).astype(int)

# Quantile-based risk label.
# risk_score is a weighted sum of three components:
#   0.5 * (20 - G3) / 20
#   0.4 * absences / max(absences)
#   0.1 * max(0, G1 - G3) / max of that decline
# Both denominators are floored at 1 so an all-zero column cannot divide by zero.
max_absences = max(students['absences'].max(), 1)
grade_component = students['G3'].rsub(20).div(20)
abs_component = students['absences'].div(max_absences)
decline_component = students['G1'].sub(students['G3']).clip(lower=0)
max_decline = max(decline_component.max(), 1)
decline_component = decline_component.div(max_decline)

weighted_grade = 0.5 * grade_component
weighted_absences = 0.4 * abs_component
weighted_decline = 0.1 * decline_component
students['risk_score'] = weighted_grade + weighted_absences + weighted_decline

# Rows whose score is at or above the chosen quantile of risk_score get label 1.
quantile = 0.8
threshold = students['risk_score'].quantile(quantile)
students['dropout_label_quantile'] = students['risk_score'].ge(threshold).astype(int)

display(f"Quantile threshold (q={quantile}): {threshold}")
students[['dropout_label_rule', 'dropout_label_quantile', 'risk_score']].head()



Quantile threshold (q=0.8): 0.30826666666666674


Unnamed: 0,dropout_label_rule,dropout_label_quantile,risk_score
0,1,1,0.382
1,1,1,0.371333
2,1,0,0.303333
3,0,0,0.135667
4,0,0,0.271333


In [None]:
# Label distributions
# Share of rows in each class (0/1) for both candidate labels.
# Headings are printed rather than passed to display(): display() on a str
# renders its repr, which wrapped the text in quotes and showed the leading
# "\n" as a literal backslash-n instead of a blank line.
print("Rule-based label distribution:")
display(students['dropout_label_rule'].value_counts(normalize=True))
print("\nQuantile-based label distribution:")
display(students['dropout_label_quantile'].value_counts(normalize=True))

'Rule-based label distribution:'

dropout_label_rule
0    0.676245
1    0.323755
Name: proportion, dtype: float64

'\nQuantile-based label distribution:'

dropout_label_quantile
0    0.799808
1    0.200192
Name: proportion, dtype: float64

In [None]:
# Encoding and scaling
# Standardize every numeric column except the label/score columns, persist the
# fitted scaler, then one-hot encode the categorical columns.
# NOTE(review): `students` is overwritten in place below, so running this cell
# a second time refits the scaler on already-standardized values and overwrites
# the saved scaler file with it; rerun from the load cell instead.
# NOTE(review): the scaler is fit on the full frame; if a train/test split
# happens downstream, confirm that fitting before the split is acceptable.
targets = ['dropout_label_rule','dropout_label_quantile','risk_score']
cat_cols = list(students.select_dtypes(include=['object','category']).columns)
num_cols = list(students.select_dtypes(include=[np.number]).columns)
num_cols_to_scale = [col for col in num_cols if col not in targets]

display(f"Categorical columns: {cat_cols}")
display(f"Numeric columns to scale: {num_cols_to_scale}")

# Fit on the selected columns, then replace them with their standardized values.
scaler = StandardScaler().fit(students[num_cols_to_scale])
students[num_cols_to_scale] = scaler.transform(students[num_cols_to_scale])

# Persist the fitted scaler, creating the model directory if it is missing.
model_dir = Path("../model")
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, "../model/scaler.joblib")
display("Scaler saved to model/scaler.joblib")

# One-hot encode categoricals; drop_first=True omits the first level of each column.
if len(cat_cols) > 0:
    students = pd.get_dummies(students, columns=cat_cols, drop_first=True)

display(f"After encoding, shape: {students.shape}")

"Categorical columns: ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']"

"Numeric columns to scale: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3', 'avg_grade', 'grade_trend', 'passed_final']"

'Scaler saved to model/scaler.joblib'

'After encoding, shape: (1044, 48)'

In [None]:
# Save processed dataset
# Write the current `students` frame to CSV without the index column,
# creating the output directory first if it does not exist yet.
output_dir = Path("../data")
output_dir.mkdir(parents=True, exist_ok=True)
students.to_csv("../data/processed_students_for_model.csv", index=False)
display("Processed dataset saved to data/processed_students_for_model.csv")


'Processed dataset saved to data/processed_students_for_model.csv'