In [1]:
import os
os.chdir('./../')

In [2]:
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.svm import SVC
import xgboost as xgb

from scripts.categorical_transformer import CategoricalTransformer
from scripts.loss import get_bal_log_loss
from scripts.outlier_remover import OutlierRemover

  from pandas import MultiIndex, Int64Index


In [3]:
base_path = './data'

In [4]:
all_df = pd.read_csv(f'{base_path}/train.csv', index_col=0)
final_test_df = pd.read_csv(f'{base_path}/test.csv', index_col=0)

In [5]:
all_train = all_df.loc[:, all_df.columns != 'Class']
all_test = all_df.loc[:, 'Class']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(all_train, 
                                                    all_test, 
                                                    random_state=42, 
                                                    stratify=all_test)

In [7]:
index_of_ej = list(x_train.columns).index('EJ')

In [8]:
xgb_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                             OutlierRemover(),
                             IterativeImputer(initial_strategy='median'),
                             xgb.XGBClassifier(random_state=42, 
                                               use_label_encoder=False,
                                               eval_metric='logloss'))

In [9]:
cv_scores = cross_val_score(xgb_pipeline, 
                            x_train, 
                            y_train, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss())
cv_scores

array([-0.74439085, -0.5822911 , -0.8178164 , -0.99961527, -0.56996175])

In [10]:
rf_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                            OutlierRemover(),
                            IterativeImputer(initial_strategy='median'),
                            RandomForestClassifier(random_state=42, class_weight='balanced'))

In [11]:
cv_scores = cross_val_score(rf_pipeline, 
                            x_train, 
                            y_train, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss())
cv_scores

array([-0.73018173, -0.66375817, -0.82382843, -0.84717035, -0.72656212])

In [12]:
svc_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                             OutlierRemover(),
                             IterativeImputer(initial_strategy='median'),
                             PowerTransformer(),
                             SVC(probability=True, random_state=42, class_weight='balanced'))

In [13]:
cv_scores = cross_val_score(svc_pipeline, 
                            x_train, 
                            y_train, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss())
cv_scores

  loglike = -n_samples / 2 * np.log(x_trans.var())


array([-0.64154054, -0.48676298, -0.82831744, -0.74206446, -0.48861026])

In [14]:
gb_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                            OutlierRemover(),
                            IterativeImputer(initial_strategy='median'),
                            GradientBoostingClassifier(random_state=42))

In [15]:
cv_scores = cross_val_score(gb_pipeline, 
                            x_train, 
                            y_train, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss())
cv_scores

array([-0.98588675, -0.68526406, -0.98045097, -1.38011402, -0.70573669])

In [16]:
def objective(trial):
    x, y = x_train, y_train

    classifier_name = trial.suggest_categorical("classifier", ['SVC' ,'RF', 'XGB'])
    if classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        svc_algo = trial.suggest_categorical("svc_algo", ['poly', 'rbf', 'sigmoid'])
        svc_cls_wgt = trial.suggest_categorical("svc_cls_wgt", [None, 'balanced'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       PowerTransformer(),
                                       SVC(C=svc_c, 
                                           kernel=svc_algo, 
                                           random_state=42, 
                                           probability=True,
                                           class_weight=svc_cls_wgt))
    elif classifier_name == 'RF':
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        rf_estimators = trial.suggest_int("rf_estimators", 10, 1000, log=True)
        rf_criterion = trial.suggest_categorical("rf_criterion", ['gini', 'entropy'])
        rf_cls_wgt = trial.suggest_categorical("rf_cls_wgt", [None, 'balanced', 'balanced_subsample'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej),  
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       RandomForestClassifier(max_depth=rf_max_depth,  
                                                              criterion=rf_criterion,
                                                              n_estimators=rf_estimators,
                                                              random_state=42,
                                                             class_weight=rf_cls_wgt))
    else:
        xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 32, log=True)
        xgb_estimators = trial.suggest_int("xgb_estimators", 10, 1000, log=True)
        xgb_booster = trial.suggest_categorical("xgb_booster", ['gbtree', 'dart'])
        xgb_scale_pos_weight = trial.suggest_float('xgb_scale_pos_weight', 1, 10, log=True)
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       xgb.XGBClassifier(max_depth=xgb_max_depth,
                                                         n_estimators=xgb_estimators,
                                                         booster=xgb_booster,
                                                         scale_pos_weight = xgb_scale_pos_weight,
                                                         random_state=42,
                                                         use_label_encoder=False,
                                                         eval_metric='logloss'))
    score = cross_val_score(classifier_obj, 
                            x, 
                            y, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss(),
                            n_jobs=-1)
    accuracy = score.mean()
    return accuracy

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print(study.best_trial)

[I 2023-07-08 08:24:49,071] A new study created in memory with name: no-name-beca3247-e003-4545-847c-629771ea6d2d
[I 2023-07-08 08:24:52,480] Trial 0 finished with value: -0.6788845950756731 and parameters: {'classifier': 'XGB', 'xgb_max_depth': 7, 'xgb_estimators': 12, 'xgb_booster': 'dart', 'xgb_scale_pos_weight': 1.9838155356548532}. Best is trial 0 with value: -0.6788845950756731.
[I 2023-07-08 08:24:54,215] Trial 1 finished with value: -0.756378254510902 and parameters: {'classifier': 'RF', 'rf_max_depth': 11, 'rf_estimators': 114, 'rf_criterion': 'entropy', 'rf_cls_wgt': 'balanced_subsample'}. Best is trial 0 with value: -0.6788845950756731.
[I 2023-07-08 08:24:57,357] Trial 2 finished with value: -0.6070818261052898 and parameters: {'classifier': 'XGB', 'xgb_max_depth': 3, 'xgb_estimators': 694, 'xgb_booster': 'gbtree', 'xgb_scale_pos_weight': 7.606684136001854}. Best is trial 2 with value: -0.6070818261052898.
[I 2023-07-08 08:25:01,605] Trial 3 finished with value: -0.77839312

FrozenTrial(number=194, state=TrialState.COMPLETE, values=[-0.32495545478861454], datetime_start=datetime.datetime(2023, 7, 8, 8, 30, 23, 306331), datetime_complete=datetime.datetime(2023, 7, 8, 8, 30, 24, 801950), params={'classifier': 'XGB', 'xgb_max_depth': 2, 'xgb_estimators': 13, 'xgb_booster': 'gbtree', 'xgb_scale_pos_weight': 9.974904601424486}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('SVC', 'RF', 'XGB')), 'xgb_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'xgb_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'xgb_booster': CategoricalDistribution(choices=('gbtree', 'dart')), 'xgb_scale_pos_weight': FloatDistribution(high=10.0, log=True, low=1.0, step=None)}, trial_id=194, value=None)


In [18]:
study.best_params

{'classifier': 'XGB',
 'xgb_max_depth': 2,
 'xgb_estimators': 13,
 'xgb_booster': 'gbtree',
 'xgb_scale_pos_weight': 9.974904601424486}

In [19]:
best_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                              OutlierRemover(),
                              IterativeImputer(initial_strategy='median'),
                              xgb.XGBClassifier(max_depth=2,
                                                n_estimators=13,
                                                booster='gbtree',
                                                scale_pos_weight = 9.974904601424486,
                                                random_state=42,
                                                use_label_encoder=False,
                                                eval_metric='logloss'))

In [20]:
best_pipeline.fit(x_train, y_train)
log_loss = get_bal_log_loss()
log_loss(best_pipeline, x_test, y_test)

-0.41475448386352204

  from pandas import MultiIndex, Int64Index
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  from pandas import MultiIndex, Int64Index
  loglike = -n_samples / 2 * np.log(x_trans.var())
  from pandas import MultiIndex, Int64Index
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  from pandas import MultiIndex, Int64Index
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


In [21]:
def objective(trial):
    x, y = x_train, y_train

    xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 32, log=True)
    xgb_estimators = trial.suggest_int("xgb_estimators", 10, 1000, log=True)
    xgb_booster = trial.suggest_categorical("xgb_booster", ['gbtree', 'dart'])
    xgb_scale_pos_weight = trial.suggest_float('xgb_scale_pos_weight', 1, 100, log=True)
    classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                   OutlierRemover(),
                                   IterativeImputer(initial_strategy='median'),
                                   xgb.XGBClassifier(max_depth=xgb_max_depth,
                                                     n_estimators=xgb_estimators,
                                                     booster=xgb_booster,
                                                     scale_pos_weight = xgb_scale_pos_weight,
                                                     random_state=42,
                                                     use_label_encoder=False,
                                                     eval_metric='logloss'))
    score = cross_val_score(classifier_obj, 
                            x, 
                            y, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss(),
                            n_jobs=-1)
    accuracy = score.mean()
    return accuracy

In [22]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print(study.best_trial)

[I 2023-07-08 08:37:13,600] A new study created in memory with name: no-name-b9c1ad40-77ee-4396-8cf0-e43fd06de551
[I 2023-07-08 08:37:16,763] Trial 0 finished with value: -0.30415180946414483 and parameters: {'xgb_max_depth': 2, 'xgb_estimators': 36, 'xgb_booster': 'dart', 'xgb_scale_pos_weight': 22.048752372764103}. Best is trial 0 with value: -0.30415180946414483.
[I 2023-07-08 08:37:18,528] Trial 1 finished with value: -0.4325461474861214 and parameters: {'xgb_max_depth': 3, 'xgb_estimators': 42, 'xgb_booster': 'gbtree', 'xgb_scale_pos_weight': 21.752490714820986}. Best is trial 0 with value: -0.30415180946414483.
[I 2023-07-08 08:37:21,550] Trial 2 finished with value: -0.5927261629681405 and parameters: {'xgb_max_depth': 10, 'xgb_estimators': 507, 'xgb_booster': 'gbtree', 'xgb_scale_pos_weight': 24.02842720997775}. Best is trial 0 with value: -0.30415180946414483.
[I 2023-07-08 08:37:23,467] Trial 3 finished with value: -0.48477949980991786 and parameters: {'xgb_max_depth': 2, 'xg

FrozenTrial(number=24, state=TrialState.COMPLETE, values=[-0.23371325915400903], datetime_start=datetime.datetime(2023, 7, 8, 8, 37, 58, 391704), datetime_complete=datetime.datetime(2023, 7, 8, 8, 38, 0, 155860), params={'xgb_max_depth': 2, 'xgb_estimators': 25, 'xgb_booster': 'dart', 'xgb_scale_pos_weight': 35.92582512604342}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'xgb_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'xgb_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'xgb_booster': CategoricalDistribution(choices=('gbtree', 'dart')), 'xgb_scale_pos_weight': FloatDistribution(high=100.0, log=True, low=1.0, step=None)}, trial_id=24, value=None)


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


In [23]:
study.best_params

{'xgb_max_depth': 2,
 'xgb_estimators': 25,
 'xgb_booster': 'dart',
 'xgb_scale_pos_weight': 35.92582512604342}

In [24]:
best_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                              OutlierRemover(),
                              IterativeImputer(initial_strategy='median'),
                              xgb.XGBClassifier(max_depth=2,
                                                n_estimators=25,
                                                booster='dart',
                                                scale_pos_weight = 35.92582512604342,
                                                random_state=42,
                                                use_label_encoder=False,
                                                eval_metric='logloss'))

In [25]:
best_pipeline.fit(x_train, y_train)
log_loss = get_bal_log_loss()
log_loss(best_pipeline, x_test, y_test)

-0.30998755209192785

In [31]:
def objective(trial):
    x, y = x_train, y_train

    xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 16, log=True)
    xgb_estimators = trial.suggest_int("xgb_estimators", 10, 50, log=True)
    xgb_booster = trial.suggest_categorical("xgb_booster", ['gbtree', 'dart'])
    xgb_scale_pos_weight = trial.suggest_float('xgb_scale_pos_weight', 29, 40, log=True)
    classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                   OutlierRemover(),
                                   IterativeImputer(initial_strategy='median'),
                                   xgb.XGBClassifier(max_depth=xgb_max_depth,
                                                     n_estimators=xgb_estimators,
                                                     booster=xgb_booster,
                                                     scale_pos_weight = xgb_scale_pos_weight,
                                                     random_state=42,
                                                     use_label_encoder=False,
                                                     eval_metric='logloss'))
    score = cross_val_score(classifier_obj, 
                            x, 
                            y, 
                            cv=StratifiedKFold(shuffle=True, random_state=42), 
                            scoring=get_bal_log_loss(),
                            n_jobs=-1)
    accuracy = score.mean()
    return accuracy

In [32]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print(study.best_trial)

[I 2023-07-08 09:20:40,935] A new study created in memory with name: no-name-51b4eaea-f155-4dce-bcb4-6c2d24976fa4
[I 2023-07-08 09:20:44,325] Trial 0 finished with value: -0.5506846183937621 and parameters: {'xgb_max_depth': 8, 'xgb_estimators': 31, 'xgb_booster': 'dart', 'xgb_scale_pos_weight': 33.70393621885378}. Best is trial 0 with value: -0.5506846183937621.
[I 2023-07-08 09:20:45,940] Trial 1 finished with value: -0.34893396757601514 and parameters: {'xgb_max_depth': 3, 'xgb_estimators': 20, 'xgb_booster': 'dart', 'xgb_scale_pos_weight': 39.27758388120352}. Best is trial 1 with value: -0.34893396757601514.
[I 2023-07-08 09:20:47,507] Trial 2 finished with value: -0.39898827967907 and parameters: {'xgb_max_depth': 4, 'xgb_estimators': 14, 'xgb_booster': 'gbtree', 'xgb_scale_pos_weight': 30.40327715229492}. Best is trial 1 with value: -0.34893396757601514.
[I 2023-07-08 09:20:49,353] Trial 3 finished with value: -0.4581841958080711 and parameters: {'xgb_max_depth': 9, 'xgb_estimato

FrozenTrial(number=157, state=TrialState.COMPLETE, values=[-0.22732044207409854], datetime_start=datetime.datetime(2023, 7, 8, 9, 25, 3, 670745), datetime_complete=datetime.datetime(2023, 7, 8, 9, 25, 5, 286639), params={'xgb_max_depth': 2, 'xgb_estimators': 26, 'xgb_booster': 'dart', 'xgb_scale_pos_weight': 35.892864263920046}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'xgb_max_depth': IntDistribution(high=16, log=True, low=2, step=1), 'xgb_estimators': IntDistribution(high=50, log=True, low=10, step=1), 'xgb_booster': CategoricalDistribution(choices=('gbtree', 'dart')), 'xgb_scale_pos_weight': FloatDistribution(high=40.0, log=True, low=29.0, step=None)}, trial_id=157, value=None)


In [33]:
study.best_params

{'xgb_max_depth': 2,
 'xgb_estimators': 26,
 'xgb_booster': 'dart',
 'xgb_scale_pos_weight': 35.892864263920046}

In [34]:
best_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                              OutlierRemover(),
                              IterativeImputer(initial_strategy='median'),
                              xgb.XGBClassifier(max_depth=2,
                                                n_estimators=26,
                                                booster='dart',
                                                scale_pos_weight = 35.892864263920046,
                                                random_state=42,
                                                use_label_encoder=False,
                                                eval_metric='logloss'))

In [35]:
best_pipeline.fit(x_train, y_train)
log_loss = get_bal_log_loss()
log_loss(best_pipeline, x_test, y_test)

-0.32865939513469616

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
