In [1]:
import os
# Step up one directory so the relative paths used later (e.g. './data')
# resolve from the parent folder.
# NOTE(review): this is not idempotent — re-running the cell without a kernel
# restart moves the working directory up one more level each time.
os.chdir('./../')

In [2]:
import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer



In [3]:
# Folder holding train.csv and test.csv, relative to the current working directory.
base_path = './data'

In [4]:
# Read train.csv and test.csv, using the first column of each file as the
# DataFrame index.
all_df = pd.read_csv(f'{base_path}/train.csv', index_col=0)
final_test_df = pd.read_csv(f'{base_path}/test.csv', index_col=0)

In [5]:
# Separate the predictors from the 'Class' target.
# Note: despite its name, `all_test` holds the target column, not a test split.
all_train = all_df.drop(columns='Class')
all_test = all_df['Class']

In [6]:
# Hold out part of the labelled data for evaluation. No split size is passed,
# so scikit-learn's default proportions apply; the seed keeps the split fixed.
x_train, x_test, y_train, y_test = train_test_split(
    all_train,
    all_test,
    random_state=42,
)

In [7]:
# Positional index of the 'EJ' column; CategoricalTransformer addresses the
# column by position because it operates on plain arrays.
index_of_ej = x_train.columns.tolist().index('EJ')

In [8]:
class CategoricalTransformer:
    """Target-encode a two-level ('A'/'B') categorical column.

    ``fit`` stores the mean of ``y`` over the rows whose column value is
    ``'A'`` and over the rows whose value is ``'B'``. ``transform`` replaces
    every ``'A'`` with the first mean and every other value with the second.

    Parameters
    ----------
    index_of_col : int
        Positional index of the categorical column inside ``x``.
    """

    def __init__(self, index_of_col):
        self._index_of_col = index_of_col
        self._a = None  # mean of y where the column equals 'A' (set by fit)
        self._b = None  # mean of y where the column equals 'B' (set by fit)

    @staticmethod
    def _as_array(x):
        """Unwrap a DataFrame to its underlying array; pass arrays through."""
        return x.values if isinstance(x, pd.DataFrame) else x

    def fit(self, x, y):
        """Learn the per-category target means.

        Parameters
        ----------
        x : pandas.DataFrame or numpy.ndarray
            Two-dimensional feature data containing the categorical column.
        y : array-like supporting boolean-mask indexing and ``.mean()``
            Target values aligned row-by-row with ``x``.

        Returns
        -------
        CategoricalTransformer
            ``self``, so calls can be chained (scikit-learn convention).
        """
        column = self._as_array(x)[:, self._index_of_col]
        self._a = y[column == 'A'].mean()
        self._b = y[column == 'B'].mean()
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the categorical column encoded.

        The input is never modified. Writing into the caller's array would
        corrupt it, and a second ``transform`` on the same data would then
        map every row to the 'B' mean because no ``'A'`` would be left.

        Parameters
        ----------
        x : pandas.DataFrame or numpy.ndarray
            Two-dimensional feature data containing the categorical column.

        Returns
        -------
        numpy.ndarray
            Array of the same shape with ``'A'`` replaced by the fitted 'A'
            mean and every other value replaced by the fitted 'B' mean.
        """
        encoded = np.array(self._as_array(x), copy=True)
        column = encoded[:, self._index_of_col]
        encoded[:, self._index_of_col] = np.where(column == 'A', self._a, self._b)
        return encoded

    def fit_transform(self, x, y):
        """Fit on ``(x, y)`` and return the encoded copy of ``x``."""
        return self.fit(x, y).transform(x)

In [9]:
# SVC baseline: encode EJ, impute missing values, power-transform the
# features, then fit an SVC with probability estimates enabled.
svc_pipeline = make_pipeline(
    CategoricalTransformer(index_of_ej),
    IterativeImputer(initial_strategy='median'),
    PowerTransformer(),
    SVC(probability=True, random_state=42),
)

In [10]:
# 5-fold cross-validated negative log loss of the SVC pipeline on the training split.
cv_scores = cross_val_score(svc_pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.19601827, -0.16018103, -0.37800674, -0.15333945, -0.30716651])

In [11]:
# Random-forest baseline: encode EJ, impute missing values, then fit the forest.
rf_pipeline = make_pipeline(
    CategoricalTransformer(index_of_ej),
    IterativeImputer(initial_strategy='median'),
    RandomForestClassifier(random_state=42),
)

In [12]:
# 5-fold cross-validated negative log loss of the random-forest pipeline on the training split.
cv_scores = cross_val_score(rf_pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.23797267, -0.24670715, -0.32359585, -0.22211718, -0.2658788 ])

In [13]:
# Gradient-boosting baseline: encode EJ, impute missing values, then fit the booster.
gb_pipeline = make_pipeline(
    CategoricalTransformer(index_of_ej),
    IterativeImputer(initial_strategy='median'),
    GradientBoostingClassifier(random_state=42),
)

In [14]:
# 5-fold cross-validated negative log loss of the gradient-boosting pipeline on the training split.
cv_scores = cross_val_score(gb_pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.17150358, -0.19813229, -0.44429288, -0.15512462, -0.25250791])

## Model and hyperparameter selection via optuna

In [15]:
def objective(trial):
    """Optuna objective: score one sampled classifier pipeline.

    The trial picks a model family ("SVC", "RF" or "GB") together with that
    family's hyperparameters. The resulting pipeline is evaluated with
    5-fold cross-validation on ``x_train``/``y_train`` using negative log
    loss.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space.

    Returns
    -------
    float
        The lowest (worst) fold score, so maximising it favours
        configurations whose weakest fold is still good.
    """
    family = trial.suggest_categorical("classifier", ["SVC", "RF", 'GB'])

    # Steps shared by every candidate; the estimator (and, for SVC only, a
    # PowerTransformer) is appended below.
    steps = [
        CategoricalTransformer(index_of_ej),
        IterativeImputer(initial_strategy='median'),
    ]

    if family == "SVC":
        c_value = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        kernel = trial.suggest_categorical("svc_algo", ['poly', 'rbf', 'sigmoid'])
        steps.append(PowerTransformer())
        steps.append(SVC(C=c_value, kernel=kernel, random_state=42, probability=True))
    elif family == 'RF':
        depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        n_trees = trial.suggest_int("rf_estimators", 10, 1000, log=True)
        split_criterion = trial.suggest_categorical("rf_criterion", ['gini', 'entropy'])
        steps.append(
            RandomForestClassifier(
                max_depth=depth,
                criterion=split_criterion,
                n_estimators=n_trees,
                random_state=42,
            )
        )
    else:
        depth = trial.suggest_int("gb_max_depth", 2, 32, log=True)
        n_trees = trial.suggest_int("gb_estimators", 10, 1000, log=True)
        split_criterion = trial.suggest_categorical("gb_criterion", ['friedman_mse', 'squared_error'])
        steps.append(
            GradientBoostingClassifier(
                max_depth=depth,
                criterion=split_criterion,
                n_estimators=n_trees,
                random_state=42,
            )
        )

    candidate = make_pipeline(*steps)
    fold_scores = cross_val_score(
        candidate, x_train, y_train, n_jobs=-1, cv=5, scoring='neg_log_loss'
    )
    return fold_scores.min()

In [16]:
# Seed the sampler so the hyperparameter search is repeatable on
# Restart & Run All, matching the fixed random_state=42 used for every model
# in this notebook. TPESampler is named explicitly so that it can be seeded.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)
print(study.best_trial)

[I 2023-07-04 16:46:49,836] A new study created in memory with name: no-name-d798d1c3-e985-4031-9b2f-2936d7b8b630
[I 2023-07-04 16:46:51,913] Trial 0 finished with value: -1.018051128629217 and parameters: {'classifier': 'RF', 'rf_max_depth': 10, 'rf_estimators': 13, 'rf_criterion': 'entropy'}. Best is trial 0 with value: -1.018051128629217.
[I 2023-07-04 16:46:52,589] Trial 1 finished with value: -0.4129989498073073 and parameters: {'classifier': 'SVC', 'svc_c': 1806409758.4186769, 'svc_algo': 'sigmoid'}. Best is trial 1 with value: -0.4129989498073073.
[I 2023-07-04 16:46:53,400] Trial 2 finished with value: -1.782119533250708 and parameters: {'classifier': 'SVC', 'svc_c': 5.092650332351243e-05, 'svc_algo': 'poly'}. Best is trial 1 with value: -0.4129989498073073.
[I 2023-07-04 16:46:54,339] Trial 3 finished with value: -0.3233883012464291 and parameters: {'classifier': 'RF', 'rf_max_depth': 10, 'rf_estimators': 131, 'rf_criterion': 'gini'}. Best is trial 3 with value: -0.32338830124

FrozenTrial(number=61, state=TrialState.COMPLETE, values=[-0.30960300887731634], datetime_start=datetime.datetime(2023, 7, 4, 16, 48, 59, 9627), datetime_complete=datetime.datetime(2023, 7, 4, 16, 49, 2, 457781), params={'classifier': 'RF', 'rf_max_depth': 29, 'rf_estimators': 745, 'rf_criterion': 'entropy'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('SVC', 'RF', 'GB')), 'rf_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'rf_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'rf_criterion': CategoricalDistribution(choices=('gini', 'entropy'))}, trial_id=61, value=None)
