In [16]:
import optuna
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [17]:
#import cleaned data set
df = pd.read_csv("clean_dataset.csv")

In [18]:
df.head()

Unnamed: 0,Maritalstatus,Applicationmode,Applicationorder,Course,Daytimeeveningattendance,Previousqualification,Nationality,Mothersqualification,Fathersqualification,Mothersoccupation,...,Curricularunits2ndsem(credited),Curricularunits2ndsem(enrolled),Curricularunits2ndsem(evaluations),Curricularunits2ndsem(approved),Curricularunits2ndsem(grade),Curricularunits2ndsem(withoutevaluations),Unemploymentrate,Inflationrate,GDP,Target
0,1,8,5,2,1,7,1,15,11,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,7,1,13,17,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,7,1,15,3,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,7,1,15,3,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,7,1,15,4,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [19]:
#hide warnings so they do not affect the functionality of the package
import warnings
warnings.filterwarnings('ignore')

In [20]:
features_list = ['Maritalstatus', 'Applicationmode', 'Applicationorder', 'Course',
       'Daytimeeveningattendance', 'Previousqualification', 'Nationality',
       'Mothersqualification', 'Fathersqualification', 'Mothersoccupation',
       'Fathersoccupation', 'Displaced', 'Educationalspecialneeds', 'Debtor',
       'Tuitionfeesuptodate', 'Gender', 'Scholarshipholder', 'Ageatenrollment',
       'International', 'Curricularunits1stsem(credited)',
       'Curricularunits1stsem(enrolled)', 'Curricularunits1stsem(evaluations)',
       'Curricularunits1stsem(approved)', 'Curricularunits1stsem(grade)',
       'Curricularunits1stsem(withoutevaluations)',
       'Curricularunits2ndsem(credited)', 'Curricularunits2ndsem(enrolled)',
       'Curricularunits2ndsem(evaluations)', 'Curricularunits2ndsem(approved)',
       'Curricularunits2ndsem(grade)',
       'Curricularunits2ndsem(withoutevaluations)', 'Unemploymentrate',
       'Inflationrate', 'GDP']

forecast_var = ["Target"]

In [21]:
from sklearn.preprocessing import LabelEncoder

X = df[features_list]
y = df[forecast_var] 

# Encode categorical features
y = y.astype("category")
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

#Normalise numerical features
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

# Split validation set from initial train set to form 8:1:1 train:validation:test ratio
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.89,)

In [22]:
#unsure about this part, should it even come before we define the trial?
"""
model = XGBClassifier(use_label_encoder=False, 
                      eval_metric='mlogloss')
model.fit(X_train, y_train)

#model = xgb.XGBClassifier(n_estimators = 3900, early_stopping_rounds = 50, learning_rate = 0.001, max_depth = 12, tree_method = 'approx',booster = 'dart',enable_categorical=True)
#model.fit(X_train, y_train, eval_set = [(X_val,y_val)], verbose = 10)

"""

"\nmodel = XGBClassifier(use_label_encoder=False, \n                      eval_metric='mlogloss')\nmodel.fit(X_train, y_train)\n\n#model = xgb.XGBClassifier(n_estimators = 3900, early_stopping_rounds = 50, learning_rate = 0.001, max_depth = 12, tree_method = 'approx',booster = 'dart',enable_categorical=True)\n#model.fit(X_train, y_train, eval_set = [(X_val,y_val)], verbose = 10)\n\n"

## Using RandomForestClassifier

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

def objective(trial):
    #define the hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_uniform('max_features', 0.1, 1.0)

    #define model
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   max_features=max_features,
                                   random_state=42)

    # Train the model on the training data
    model.fit(X_train, y_train)
    
    # Evaluate the model on the testing data
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    score = mean_squared_error(y_test, y_pred)
    
    return accuracy 

In [24]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20) #change number of trials later

[32m[I 2023-03-27 14:28:52,508][0m A new study created in memory with name: no-name-36684e14-2c9f-459a-a6b8-4132a887db2d[0m
[32m[I 2023-03-27 14:28:54,340][0m Trial 0 finished with value: 0.7246049661399548 and parameters: {'n_estimators': 721, 'max_depth': 2, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 0.7179785112809058}. Best is trial 0 with value: 0.7246049661399548.[0m
[32m[I 2023-03-27 14:28:54,900][0m Trial 1 finished with value: 0.7133182844243793 and parameters: {'n_estimators': 318, 'max_depth': 2, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 0.40513204074131093}. Best is trial 0 with value: 0.7246049661399548.[0m
[32m[I 2023-03-27 14:28:55,680][0m Trial 2 finished with value: 0.7765237020316027 and parameters: {'n_estimators': 353, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 0.20104757590228617}. Best is trial 2 with value: 0.7765237020316027.[0m
[32m[I 2023-03-27 14:28:56,640][0m Trial 3 f

KeyboardInterrupt: 

In [None]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

Number of finished trials: 20
Best trial:
  Value: 0.781038374717833
  Params: 
    n_estimators: 826
    max_depth: 28
    min_samples_split: 2
    min_samples_leaf: 5
    max_features: 0.9963459199556591


In [None]:
#get the best parameters and train and evaluate the RFC model with the best parameters
best_params = study.best_params
model = RandomForestClassifier(**best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy}')

print(classification_report(y_test, y_pred))


Test accuracy: 0.781038374717833
              precision    recall  f1-score   support

           0       0.79      0.75      0.77       123
           1       0.58      0.40      0.48        84
           2       0.82      0.93      0.87       236

    accuracy                           0.78       443
   macro avg       0.73      0.69      0.71       443
weighted avg       0.77      0.78      0.77       443



In [None]:
XGBmodel = XGBClassifier(n_estimators=best_params['n_estimators'],
                               max_depth=best_params['max_depth'],
                               min_samples_split=best_params['min_samples_split'],
                               min_samples_leaf=best_params['min_samples_leaf'],
                               max_features=best_params['max_features'],
                               random_state=42)
XGBmodel.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



In [None]:
y_pred = XGBmodel.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy after tuning: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, y_pred))

Accuracy after tuning: 77.65%
              precision    recall  f1-score   support

           0       0.73      0.77      0.75       123
           1       0.58      0.40      0.48        84
           2       0.85      0.91      0.88       236

    accuracy                           0.78       443
   macro avg       0.72      0.70      0.70       443
weighted avg       0.76      0.78      0.77       443



In [None]:
"""
def objective(trial):
    #Define the objective function

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train, y_train)

    # Make predictions
    y_pred = optuna_model.predict(X_test)

    # Evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

"""

"\ndef objective(trial):\n    #Define the objective function\n\n    params = {\n        'max_depth': trial.suggest_int('max_depth', 1, 9),\n        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),\n        'n_estimators': trial.suggest_int('n_estimators', 50, 500),\n        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),\n        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),\n        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),\n        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),\n        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),\n        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),\n        'eval_metric': 'mlogloss',\n        'use_label_encoder': False\n    }\n\n    # Fit the model\n    optuna_model = XGBClassifier(**params)\n    optuna_model.fit(X_train, y_train)\n\n    # Make predictions\n    y_pred = optuna_model.predict(X_test)\n\n    # Evalu