In [2]:
import os
parent_path = os.path.dirname(os.getcwd())
os.chdir(parent_path)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import sys
from utils.metrics import quadratic_weighted_kappa
from utils.variables import search_space_xgb, search_space_lgbm, search_space_catboost, search_space_rf, search_space_logistic, search_knn, search_space_SVC
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

  backends.update(_get_backends("networkx.backends"))


In [4]:
search_space = search_space_xgb
train_path = 'data/train.csv'

In [5]:
# read the data
train = pd.read_csv(train_path)

# preprocessing

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 
        'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
pciat = train.columns[train.columns.str.startswith('PCIAT-PCIAT')].tolist() + ['sii', "PCIAT-Season"]

train_clean = pd.concat([train, pd.get_dummies(train[cat_c]).astype(int)], axis=1)
to_drop = ["id"] + cat_c
train_clean = train_clean.drop(to_drop, axis=1)
train_clean = train_clean.dropna(subset=pciat)

x_train, x_test, y_train, y_test = train_test_split(train_clean.drop(pciat, axis=1), train_clean['sii'], test_size=0.2, random_state=42)

In [6]:
from hyperopt.pyll import scope
search_space_xgb = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)), 
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 500, 10)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1), 
}

In [8]:
def objective(params):

    # params['max_depth'] = int(params['max_depth'])
    # params['n_estimators'] = int(params['n_estimators'])   
    model = xgb.XGBClassifier(**params)
    model.fit(x_train.to_numpy(), y_train.to_numpy())
    y_pred = model.predict(x_test.to_numpy())
    score = quadratic_weighted_kappa(y_test, y_pred)

    return {'loss': -score, 'status': STATUS_OK}

In [10]:
# Fine Tuning model
trials = Trials()

best_params = fmin(
fn=objective,  # Objective function
space=search_space_xgb,  # Hyperparameter search space
algo=tpe.suggest,  # Tree-structured Parzen Estimator
max_evals=50,  # Number of iterations
trials=trials,  # Store trial results
rstate=np.random.default_rng(42)  # For reproducibility
)

best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])

# Train the model
model = xgb.XGBClassifier(**best_params)
model.fit(x_train.to_numpy(), y_train.to_numpy())

# Predict on the test set
y_pred = model.predict(x_test.to_numpy())

100%|██████████| 50/50 [00:59<00:00,  1.19s/trial, best loss: -0.3862468102630475] 


In [7]:
print(f"QWK: {quadratic_weighted_kappa(y_test, y_pred)}\n")
print("------------------------------------------------------------------\n")
print(f"Accuracy: {model.score(x_test.to_numpy(), y_test.to_numpy())}\n")
print("------------------------------------------------------------------\n")
print(f"Model: {model.get_params()}\n")
print("------------------------------------------------------------------\n")
print(f"Data: {train_path}\n")
print("------------------------------------------------------------------\n")
print(f"Features: {train_clean.columns}\n")
print("------------------------------------------------------------------\n")

QWK: 0.3862468102630475

------------------------------------------------------------------

Accuracy: 0.6280373831775701

------------------------------------------------------------------

Model: {'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.6477205861351052, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.08838500339046752, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 8, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 280, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method'