default XGBClassifier model

In [9]:
import optuna
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [10]:
#import cleaned data set
df = pd.read_csv("clean_dataset.csv")

In [11]:
#hide warnings so they do not affect the functionality of the package
import warnings
warnings.filterwarnings('ignore')

In [12]:
features_list = ['Maritalstatus', 'Applicationmode', 'Applicationorder', 'Course',
       'Daytimeeveningattendance', 'Previousqualification', 'Nationality',
       'Mothersqualification', 'Fathersqualification', 'Mothersoccupation',
       'Fathersoccupation', 'Displaced', 'Educationalspecialneeds', 'Debtor',
       'Tuitionfeesuptodate', 'Gender', 'Scholarshipholder', 'Ageatenrollment',
       'International', 'Curricularunits1stsem(credited)',
       'Curricularunits1stsem(enrolled)', 'Curricularunits1stsem(evaluations)',
       'Curricularunits1stsem(approved)', 'Curricularunits1stsem(grade)',
       'Curricularunits1stsem(withoutevaluations)',
       'Curricularunits2ndsem(credited)', 'Curricularunits2ndsem(enrolled)',
       'Curricularunits2ndsem(evaluations)', 'Curricularunits2ndsem(approved)',
       'Curricularunits2ndsem(grade)',
       'Curricularunits2ndsem(withoutevaluations)', 'Unemploymentrate',
       'Inflationrate', 'GDP']

forecast_var = ["Target"]

In [13]:
from sklearn.preprocessing import LabelEncoder

X = df[features_list]
y = df[forecast_var] 

# Encode categorical features
y = y.astype("category")
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

#Normalise numerical features
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

def objective(trial):
    #define the hyperparameters to optimize
    param = {
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1.0, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000, 100),
        'eta': trial.suggest_discrete_uniform('eta', 0.01, 0.1, 0.01),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0)}

    #define model
    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    return acc

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20) #change number of trials later

[32m[I 2023-03-27 14:45:37,705][0m A new study created in memory with name: no-name-4b950129-453d-4ecc-8d75-2c272aef5d43[0m
[32m[I 2023-03-27 14:45:38,248][0m Trial 0 finished with value: 0.781038374717833 and parameters: {'max_depth': 7, 'subsample': 0.75, 'n_estimators': 2000, 'eta': 0.03, 'reg_alpha': 4, 'reg_lambda': 37, 'min_child_weight': 5, 'colsample_bytree': 0.7092788877133263}. Best is trial 0 with value: 0.781038374717833.[0m
[32m[I 2023-03-27 14:45:38,753][0m Trial 1 finished with value: 0.781038374717833 and parameters: {'max_depth': 2, 'subsample': 0.7, 'n_estimators': 2200, 'eta': 0.09, 'reg_alpha': 17, 'reg_lambda': 86, 'min_child_weight': 20, 'colsample_bytree': 0.1606148798116948}. Best is trial 0 with value: 0.781038374717833.[0m
[32m[I 2023-03-27 14:45:39,292][0m Trial 2 finished with value: 0.781038374717833 and parameters: {'max_depth': 8, 'subsample': 0.9, 'n_estimators': 6700, 'eta': 0.060000000000000005, 'reg_alpha': 12, 'reg_lambda': 19, 'min_child_

In [16]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

Number of finished trials: 20
Best trial:
  Value: 0.781038374717833
  Params: 
    max_depth: 7
    subsample: 0.75
    n_estimators: 2000
    eta: 0.03
    reg_alpha: 4
    reg_lambda: 37
    min_child_weight: 5
    colsample_bytree: 0.7092788877133263


In [17]:
#get the best parameters and train and evaluate the RFC model with the best parameters
best_params = study.best_params
model = XGBClassifier(**best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy after tuning: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, y_pred))

Test accuracy: 0.781038374717833
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       142
           1       0.64      0.46      0.53        85
           2       0.81      0.90      0.86       216

    accuracy                           0.78       443
   macro avg       0.75      0.72      0.73       443
weighted avg       0.77      0.78      0.77       443

