In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import sys
from utils.metrics import quadratic_weighted_kappa
from utils.variables import search_space_xgb, search_space_lgbm, search_space_catboost, search_space_rf, search_space_logistic, search_knn, search_space_SVC
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

  backends.update(_get_backends("networkx.backends"))


In [3]:
train_path = 'training_sets/imputed_train_optimal_knn.csv'
name = "imputed_train_grid"

dict_models = {
    'xgb': {"model": xgb.XGBClassifier, "search_space": search_space_xgb},
    'lgbm': {"model": LGBMClassifier, "search_space": search_space_lgbm},
    'catboost': {"model": CatBoostClassifier, "search_space": search_space_catboost},
    'rf': {"model": RandomForestClassifier, "search_space": search_space_rf},
    'logistic': {"model": LogisticRegression, "search_space": search_space_logistic},
    'knn': {"model": KNeighborsClassifier, "search_space": search_knn},
}

In [20]:
train2 = pd.read_csv("data/train.csv")

In [None]:
to_impute=['FGC-FGC_CU_Zone', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD_Zone',
       'FGC-FGC_PU_Zone', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone',
       'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num']



In [12]:
# read the data
train = pd.read_csv(train_path)
# train = train.drop("PreInt_EduHx-Season.1", axis=1)

# preprocessing

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 
        'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
pciat = train.columns[train.columns.str.startswith('PCIAT-PCIAT')].tolist() + ['sii', "PCIAT-Season"]

train_clean = pd.concat([train, pd.get_dummies(train[cat_c]).astype(int)], axis=1)
to_drop = ["id"] + cat_c
train_clean = train_clean.drop(to_drop, axis=1)
train_clean = train_clean.dropna(subset=pciat)

x_train, x_test, y_train, y_test = train_test_split(train_clean.drop(pciat, axis=1), train_clean['sii'], test_size=0.2, random_state=42)

In [6]:
results = []

In [11]:
from warnings import filterwarnings
filterwarnings('ignore')

for clf in dict_models.keys():

    print("Training model: ", clf)

    clf_model = dict_models[clf]['model']
    search_space = dict_models[clf]['search_space']

    def objective(params):
        if clf=="lgbm":
            params["verbose"] = -1
        if clf=="catboost":
            params["verbose"] = 0   
        model = clf_model(**params)
        model.fit(x_train.to_numpy(), y_train.to_numpy())
        y_pred = model.predict(x_test.to_numpy())
        score = cohen_kappa_score(y_test, y_pred, weights='quadratic')
        return {'loss': -score, 'status': STATUS_OK}
    
    # Fine Tuning model
    trials = Trials()

    best_params = fmin(
    fn=objective,  # Objective function
    space=search_space,  # Hyperparameter search space
    algo=tpe.suggest,  # Tree-structured Parzen Estimator
    max_evals=50,  # Number of iterations
    trials=trials,  # Store trial results
    rstate=np.random.default_rng(42)  # For reproducibility
    )

    if clf == "rf":
        best_params["n_estimators"] = int(best_params["n_estimators"])
        best_params["max_depth"] = int(best_params["max_depth"])

    if clf == "catboost":
        best_params["verbose"] = 0

    if clf == "xgb":
        best_params["max_depth"] = int(best_params["max_depth"])
        best_params["n_estimators"] = int(best_params["n_estimators"])

    if clf == "lgbm":
        best_params["max_depth"] = int(best_params["max_depth"])
        best_params["n_estimators"] = int(best_params["n_estimators"])


    if clf == "knn":
        best_params["n_neighbors"] = int(best_params["n_neighbors"])

    # Train the model
    model = clf_model(**best_params)
    model.fit(x_train.to_numpy(), y_train.to_numpy())

    # Predict on the test set
    y_pred = model.predict(x_test.to_numpy())

    results.append(pd.DataFrame(index = [clf], data = {'QWK': [cohen_kappa_score(y_test, y_pred, weights='quadratic')], 'params': [best_params], "Accuracy": [model.score(x_test.to_numpy(), y_test.to_numpy())]}))

results = pd.concat(results)

Training model:  xgb
100%|██████████| 50/50 [00:36<00:00,  1.38trial/s, best loss: -0.3640079155578063]
Training model:  lgbm
100%|██████████| 50/50 [00:14<00:00,  3.47trial/s, best loss: -0.35678407812319257]
Training model:  catboost
100%|██████████| 50/50 [03:03<00:00,  3.67s/trial, best loss: -0.3267853451745909] 
Training model:  rf
100%|██████████| 50/50 [04:04<00:00,  4.88s/trial, best loss: -0.3180700080503295] 
Training model:  logistic
100%|██████████| 50/50 [00:03<00:00, 13.17trial/s, best loss: -0.0830636251127308]
Training model:  knn
100%|██████████| 50/50 [00:01<00:00, 40.72trial/s, best loss: -0.18055680691954223]


In [12]:
results.to_csv(f"results/{name}_fine_tuning_results.csv")

OSError: Cannot save file into a non-existent directory: 'results'