In [1]:
import os
os.chdir('./../')

In [2]:
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer

from scripts.categorical_transformer import CategoricalTransformer
from scripts.loss import get_bal_log_loss
from scripts.outlier_remover import OutlierRemover



In [3]:
base_path = './data'

In [4]:
all_df = pd.read_csv(f'{base_path}/train.csv', index_col=0)
final_test_df = pd.read_csv(f'{base_path}/test.csv', index_col=0)

In [5]:
all_train = all_df.loc[:, all_df.columns != 'Class']
all_test = all_df.loc[:, 'Class']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(all_train, 
                                                    all_test, 
                                                    random_state=42, 
                                                    stratify=all_test)

In [7]:
index_of_ej = list(x_train.columns).index('EJ')

In [8]:
svc_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                             OutlierRemover(),
                             IterativeImputer(initial_strategy='median'),
                             PowerTransformer(),
                             SVC(probability=True, random_state=42, class_weight='balanced'))

In [9]:
cv_scores = cross_val_score(svc_pipeline, x_train, y_train, cv=5, scoring=get_bal_log_loss())
cv_scores

  loglike = -n_samples / 2 * np.log(x_trans.var())


array([-0.45674195, -0.50885743, -0.84616288, -0.66060942, -1.04965479])

In [10]:
rf_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                            OutlierRemover(),
                            IterativeImputer(initial_strategy='median'),
                            RandomForestClassifier(random_state=42, class_weight='balanced'))

In [11]:
cv_scores = cross_val_score(rf_pipeline, x_train, y_train, cv=5, scoring=get_bal_log_loss())
cv_scores

array([-0.71237826, -0.62150803, -0.75232311, -0.71401494, -1.00008393])

In [12]:
gb_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                            OutlierRemover(),
                            IterativeImputer(initial_strategy='median'),
                            GradientBoostingClassifier(random_state=42))

In [13]:
cv_scores = cross_val_score(gb_pipeline, x_train, y_train, cv=5, scoring=get_bal_log_loss())
cv_scores

array([-1.03757899, -0.41996786, -0.98374988, -0.55716114, -1.4627205 ])

In [14]:
def objective(trial):
    x, y = x_train, y_train

    classifier_name = trial.suggest_categorical("classifier", ['SVC' ,'RF', 'GB'])
    if classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        svc_algo = trial.suggest_categorical("svc_algo", ['poly', 'rbf', 'sigmoid'])
        svc_cls_wgt = trial.suggest_categorical("svc_cls_wgt", [None, 'balanced'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       PowerTransformer(),
                                       SVC(C=svc_c, 
                                           kernel=svc_algo, 
                                           random_state=42, 
                                           probability=True,
                                           class_weight=svc_cls_wgt))
    elif classifier_name == 'RF':
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        rf_estimators = trial.suggest_int("rf_estimators", 10, 1000, log=True)
        rf_criterion = trial.suggest_categorical("rf_criterion", ['gini', 'entropy'])
        rf_cls_wgt = trial.suggest_categorical("rf_cls_wgt", [None, 'balanced', 'balanced_subsample'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej),  
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       RandomForestClassifier(max_depth=rf_max_depth,  
                                                              criterion=rf_criterion,
                                                              n_estimators=rf_estimators,
                                                              random_state=42,
                                                             class_weight=rf_cls_wgt))
    else:
        gb_max_depth = trial.suggest_int("gb_max_depth", 2, 32, log=True)
        gb_estimators = trial.suggest_int("gb_estimators", 10, 1000, log=True)
        gb_criterion = trial.suggest_categorical("gb_criterion", ['friedman_mse', 'squared_error'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       GradientBoostingClassifier(max_depth=gb_max_depth,  
                                                                  criterion=gb_criterion,
                                                                  n_estimators=gb_estimators,
                                                                  random_state=42))

    score = cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=5, scoring=get_bal_log_loss())
    accuracy = score.mean()
    return accuracy

In [15]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print(study.best_trial)

[I 2023-07-06 05:20:33,802] A new study created in memory with name: no-name-3792c2d2-4efb-483c-9499-5da2ed1de59c
[I 2023-07-06 05:20:36,885] Trial 0 finished with value: -1.2788282906307846 and parameters: {'classifier': 'GB', 'gb_max_depth': 12, 'gb_estimators': 28, 'gb_criterion': 'friedman_mse'}. Best is trial 0 with value: -1.2788282906307846.
[I 2023-07-06 05:20:40,299] Trial 1 finished with value: -0.567826761981278 and parameters: {'classifier': 'RF', 'rf_max_depth': 4, 'rf_estimators': 631, 'rf_criterion': 'gini', 'rf_cls_wgt': 'balanced_subsample'}. Best is trial 1 with value: -0.567826761981278.
[I 2023-07-06 05:20:42,908] Trial 2 finished with value: -2.93111064277102 and parameters: {'classifier': 'GB', 'gb_max_depth': 8, 'gb_estimators': 92, 'gb_criterion': 'friedman_mse'}. Best is trial 1 with value: -0.567826761981278.
[I 2023-07-06 05:20:45,787] Trial 3 finished with value: -0.6981375727791386 and parameters: {'classifier': 'RF', 'rf_max_depth': 8, 'rf_estimators': 503

FrozenTrial(number=183, state=TrialState.COMPLETE, values=[-0.5288086315954714], datetime_start=datetime.datetime(2023, 7, 6, 5, 25, 9, 350167), datetime_complete=datetime.datetime(2023, 7, 6, 5, 25, 10, 689754), params={'classifier': 'RF', 'rf_max_depth': 3, 'rf_estimators': 56, 'rf_criterion': 'gini', 'rf_cls_wgt': 'balanced'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('SVC', 'RF', 'GB')), 'rf_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'rf_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'rf_criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'rf_cls_wgt': CategoricalDistribution(choices=(None, 'balanced', 'balanced_subsample'))}, trial_id=183, value=None)


In [16]:
study.best_params

{'classifier': 'RF',
 'rf_max_depth': 3,
 'rf_estimators': 56,
 'rf_criterion': 'gini',
 'rf_cls_wgt': 'balanced'}

In [17]:
best_rf_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                                 OutlierRemover(),
                                 IterativeImputer(initial_strategy='median'),
                                 RandomForestClassifier(random_state=42,
                                                        max_depth=3,
                                                        n_estimators=56,
                                                        criterion='gini',
                                                        class_weight='balanced'))

In [22]:
best_rf_pipeline.fit(x_train, y_train)
log_loss = get_bal_log_loss()
log_loss(best_rf_pipeline, x_test, y_test)

-0.5870834369135836

In [23]:
#From a previous run with log loss of: 0.3035
old_best_rf_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                                     OutlierRemover(),
                                     IterativeImputer(initial_strategy='median'),
                                     RandomForestClassifier(random_state=42,
                                                            max_depth=7,
                                                            n_estimators=264,
                                                            criterion='entropy'))

In [24]:
old_best_rf_pipeline.fit(x_train, y_train)
log_loss = get_bal_log_loss()
log_loss(old_best_rf_pipeline, x_test, y_test)

-0.6437672570288462