In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import optuna

from sklearn.model_selection import train_test_split

# Comparison of Bayesian Optimization Frameworks

According to the [Kaggle description](https://www.kaggle.com/datasets/uciml/student-alcohol-consumption), the columns are defined as follows:

| Column     | Description                                                                                      |
|------------|--------------------------------------------------------------------------------------------------|
| school     | student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)                |
| sex        | student's sex (binary: 'F' - female or 'M' - male)                                            |
| age        | student's age (numeric: from 15 to 22)                                                        |
| address    | student's home address type (binary: 'U' - urban or 'R' - rural)                             |
| famsize    | family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)                   |
| Pstatus    | parent's cohabitation status (binary: 'T' - living together or 'A' - apart)                   |
| Medu       | mother's education (numeric: 0 - none, 1 - primary education (4th grade), ...)                |
| Fedu       | father's education (numeric: 0 - none, 1 - primary education (4th grade), ...)                |
| Mjob       | mother's job (nominal: 'teacher', 'health' care related, ...)                                  |
| Fjob       | father's job (nominal: 'teacher', 'health' care related, ...)                                  |
| reason     | reason to choose this school (nominal: close to 'home', school 'reputation', ...)               |
| guardian   | student's guardian (nominal: 'mother', 'father' or 'other')                                    |
| traveltime | home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., ...)                     |
| studytime  | weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, ...)                                |
| failures   | number of past class failures (numeric: n if 1<=n<3, else 4)                                   |
| schoolsup  | extra educational support (binary: yes or no)                                                   |
| famsup     | family educational support (binary: yes or no)                                                   |
| paid       | extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)           |
| activities | extra-curricular activities (binary: yes or no)                                                 |
| nursery    | attended nursery school (binary: yes or no)                                                     |
| higher     | wants to take higher education (binary: yes or no)                                              |
| internet   | Internet access at home (binary: yes or no)                                                     |
| romantic   | with a romantic relationship (binary: yes or no)                                                |
| famrel     | quality of family relationships (numeric: from 1 - very bad to 5 - excellent)                    |
| freetime   | free time after school (numeric: from 1 - very low to 5 - very high)                             |
| goout      | going out with friends (numeric: from 1 - very low to 5 - very high)                             |
| Dalc       | workday alcohol consumption (numeric: from 1 - very low to 5 - very high)                         |
| Walc       | weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)                         |
| health     | current health status (numeric: from 1 - very bad to 5 - very good)                               |
| absences   | number of school absences (numeric: from 0 to 93)                                               |
| G1         | first period grade (numeric: from 0 to 20)                                                       |
| G2         | second period grade (numeric: from 0 to 20)                                                      |
| G3         | final grade (numeric: from 0 to 20, output target)                                               |


In [3]:
# Single seed reused for the train/validation/test splits and for the random forest models.
RANDOM_SEED = 42

In [4]:
# Read the student dataset, using the first CSV column as the row index.
dataset_path = '../datasets/student-alcohol-consumption.csv'
df = pd.read_csv(dataset_path, index_col=[0])

# Report the (rows, columns) shape, then preview the first five rows.
print(df.shape)
df.head()

(395, 29)


Unnamed: 0,school,sex,age,famsize,Pstatus,Medu,Fedu,traveltime,failures,schoolsup,...,goout,Dalc,Walc,health,absences,G1,G2,G3,location,study_time
0,GP,F,18,GT3,A,4,4,2,0,yes,...,4,1,1,3,6,5,6,6,Urban,2 to 5 hours
1,GP,F,17,GT3,T,1,1,1,0,no,...,3,1,1,3,4,5,5,6,Urban,2 to 5 hours
2,GP,F,15,LE3,T,1,1,1,3,yes,...,2,2,3,3,10,7,8,10,Urban,2 to 5 hours
3,GP,F,15,GT3,T,4,2,1,0,no,...,2,1,1,5,2,15,14,15,Urban,5 to 10 hours
4,GP,F,16,GT3,T,3,3,1,0,no,...,2,1,2,5,4,6,10,10,Urban,2 to 5 hours


## Dataset Preparation

In [5]:
# Features: every column except the three grade columns, restricted to integer dtypes.
grade_columns = ['G1', 'G2', 'G3']
features = df.drop(columns=grade_columns, errors='ignore')
int_columns = features.select_dtypes(include=['int']).columns
X = features[int_columns]

# Target: the final grade.
y = df['G3']

# First split off 30% of the rows, then halve that hold-out into validation and test sets.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

print(f"""
Shapes
======
    train : {X_train.shape}
    val   : {X_val.shape}
    test  : {X_test.shape}
""")


Shapes
    train : (276, 12)
    val   : (59, 12)
    test  : (60, 12)



# Model Training & Hyperparameter Tuning 

Although the final grade `G3` is numeric (from 0 to 20), we treat its prediction as a multi-class classification problem in which every grade value is its own class. For comparison purposes we are going to use different algorithms to train our model. The following sections will use different optimization packages and/or libraries to tune the hyperparameters of the respective algorithms.


## Optuna 

In [6]:
import optuna
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [7]:
def objective(trial: "optuna.trial.Trial") -> float:
    """Optuna objective: validation accuracy of a scaled random forest.

    Samples one hyperparameter configuration from ``trial``, fits a
    ``StandardScaler`` + ``RandomForestClassifier`` pipeline on the training
    split and scores it on the validation split.

    Parameters
    ----------
    trial
        The Optuna trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on ``(X_val, y_val)``; the value the study optimizes.
    """
    # Hyperparameter search space. The two ``min_samples_*`` values are
    # sampled as floats, which scikit-learn interprets as fractions of the
    # number of training samples rather than absolute counts.
    forest_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
    }

    # Scale the features, then classify with the sampled configuration.
    forest = RandomForestClassifier(random_state=RANDOM_SEED, **forest_params)
    model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', forest),
    ])

    # Fit on the training split and score on the validation split.
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))


In [8]:
# Creating a study and running Optuna optimization.
# The sampler is seeded explicitly: the default sampler is unseeded, so
# without it every run would explore different trials and report different
# best hyperparameters and accuracies.
study = optuna.create_study(
    study_name='my_optuna_study',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(objective, n_trials=10)

# The searched optimal parameters.
best_params = study.best_params

# Create the pipeline with the searched optimal parameters.
best_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        random_state=RANDOM_SEED,
    ))
])

# Train the final model on the searched optimal parameters.
best_pipeline.fit(X_train, y_train)

# Train accuracy.
y_train_pred = best_pipeline.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)

# Test accuracy (the test split is not used during the hyperparameter search).
y_test_pred = best_pipeline.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"""
Optuna Results
==============
    Best hyperparameters: {best_params}

    Accuracy
    --------
        Train: {train_acc:<0.4f} 
        Test : {test_acc:<0.4f}
""")

[I 2023-08-07 11:32:52,097] A new study created in memory with name: my_optuna_study


[I 2023-08-07 11:32:52,296] Trial 0 finished with value: 0.15254237288135594 and parameters: {'n_estimators': 117, 'max_depth': 15, 'min_samples_split': 0.9872932015318743, 'min_samples_leaf': 0.12850314179489697}. Best is trial 0 with value: 0.15254237288135594.
[I 2023-08-07 11:32:52,590] Trial 1 finished with value: 0.0847457627118644 and parameters: {'n_estimators': 189, 'max_depth': 15, 'min_samples_split': 0.4880445134056033, 'min_samples_leaf': 0.30119199350593756}. Best is trial 0 with value: 0.15254237288135594.
[I 2023-08-07 11:32:52,774] Trial 2 finished with value: 0.0847457627118644 and parameters: {'n_estimators': 113, 'max_depth': 6, 'min_samples_split': 0.16337780557406967, 'min_samples_leaf': 0.3044302613012829}. Best is trial 0 with value: 0.15254237288135594.
[I 2023-08-07 11:32:53,162] Trial 3 finished with value: 0.15254237288135594 and parameters: {'n_estimators': 201, 'max_depth': 4, 'min_samples_split': 0.5285356759526376, 'min_samples_leaf': 0.1901823196918064}


Optuna Results
    Best hyperparameters: {'n_estimators': 117, 'max_depth': 15, 'min_samples_split': 0.9872932015318743, 'min_samples_leaf': 0.12850314179489697}

    Accuracy
    --------
        Train: 0.1413 
        Test : 0.1333

