In [1]:
import os
os.chdir('./../')

In [2]:
import joblib
import numpy as np
import optuna
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer



In [3]:
base_path = './data'

In [4]:
all_df = pd.read_csv(f'{base_path}/train.csv', index_col=0)
final_test_df = pd.read_csv(f'{base_path}/test.csv', index_col=0)

In [5]:
all_train = all_df.loc[:, all_df.columns != 'Class']
all_test = all_df.loc[:, 'Class']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(all_train, 
                                                    all_test, 
                                                    random_state=42)

In [7]:
index_of_ej = list(x_train.columns).index('EJ')

In [8]:
class CategoricalTransformer:
    
    def __init__(self, index_of_col):
        self._index_of_col = index_of_col
        self._a = None
        self._b = None
    
    def fit(self, x, y):
        if isinstance(x, pd.DataFrame):
            x = x.values
        self._a = y[x[:, self._index_of_col] == 'A'].mean()
        self._b = y[x[:, self._index_of_col] == 'B'].mean()
    
    def transform(self, x):
        if isinstance(x, pd.DataFrame):
            x = x.values
        x[:, self._index_of_col] = np.where(x[:, self._index_of_col] == 'A', self._a, self._b)
        return x
        
    def fit_transform(self, x, y):
        self.fit(x, y)
        return self.transform(x)

In [9]:
class OutlierRemover:
    
    def __init__(self):
        self._lower_lims = {}
        self._upper_lims = {}
    
    def fit(self, x, y):
        for i in range(x.shape[1]):
            data = x[:, i].copy()
            mean = data.mean()
            std = data.std()
            self._lower_lims[i] = mean - 6*std
            self._upper_lims[i] = mean + 6*std
    
    def transform(self, x):
        for i in range(x.shape[1]):
            x[:, i] = np.where(((x[:, i] > self._upper_lims[i]) | (x[:, i] < self._lower_lims[i])), np.nan, x[:, i])
        return x
        
    def fit_transform(self, x, y):
        self.fit(x, y)
        return self.transform(x)

In [10]:
svc_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                             OutlierRemover(),
                             IterativeImputer(initial_strategy='median'),
                             PowerTransformer(),
                             SVC(probability=True, random_state=42))

In [11]:
cv_scores = cross_val_score(svc_pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.20314225, -0.16314338, -0.3751    , -0.1582064 , -0.28970652])

In [12]:
rf_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                            OutlierRemover(),
                            IterativeImputer(initial_strategy='median'),
                            RandomForestClassifier(random_state=42))

In [13]:
cv_scores = cross_val_score(rf_pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.23712174, -0.24068814, -0.31638874, -0.22337477, -0.26578255])

In [14]:
gb_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                            OutlierRemover(),
                            IterativeImputer(initial_strategy='median'),
                            GradientBoostingClassifier(random_state=42))

In [15]:
cv_scores = cross_val_score(gb_pipeline, x_train, y_train, cv=5, scoring='neg_log_loss')
cv_scores

array([-0.1612475 , -0.19537146, -0.3971608 , -0.15972658, -0.2568481 ])

In [16]:
def objective(trial):
    x, y = x_train, y_train

    classifier_name = trial.suggest_categorical("classifier", ["RF"])
    if classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        svc_algo = trial.suggest_categorical("svc_algo", ['poly', 'rbf', 'sigmoid'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       PowerTransformer(),
                                       SVC(C=svc_c, kernel=svc_algo, random_state=42, probability=True))
    elif classifier_name == 'RF':
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        rf_estimators = trial.suggest_int("rf_estimators", 10, 1000, log=True)
        rf_criterion = trial.suggest_categorical("rf_criterion", ['gini', 'entropy'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej),  
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       RandomForestClassifier(max_depth=rf_max_depth,  
                                                              criterion=rf_criterion,
                                                              n_estimators=rf_estimators,
                                                              random_state=42))
    else:
        gb_max_depth = trial.suggest_int("gb_max_depth", 2, 32, log=True)
        gb_estimators = trial.suggest_int("gb_estimators", 10, 1000, log=True)
        gb_criterion = trial.suggest_categorical("gb_criterion", ['friedman_mse', 'squared_error'])
        classifier_obj = make_pipeline(CategoricalTransformer(index_of_ej), 
                                       OutlierRemover(),
                                       IterativeImputer(initial_strategy='median'),
                                       GradientBoostingClassifier(max_depth=gb_max_depth,  
                                                                  criterion=gb_criterion,
                                                                  n_estimators=gb_estimators,
                                                                  random_state=42))

    score = cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=5, scoring='neg_log_loss')
    accuracy = score.min()
    return accuracy

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(study.best_trial)

[I 2023-07-05 17:34:01,267] A new study created in memory with name: no-name-834efac1-fff6-48fd-9ad1-52bfb4fe70ee
[I 2023-07-05 17:34:04,343] Trial 0 finished with value: -0.31727899963711215 and parameters: {'classifier': 'RF', 'rf_max_depth': 22, 'rf_estimators': 102, 'rf_criterion': 'entropy'}. Best is trial 0 with value: -0.31727899963711215.
[I 2023-07-05 17:34:05,670] Trial 1 finished with value: -0.3514347133065879 and parameters: {'classifier': 'RF', 'rf_max_depth': 3, 'rf_estimators': 88, 'rf_criterion': 'gini'}. Best is trial 0 with value: -0.31727899963711215.
[I 2023-07-05 17:34:07,336] Trial 2 finished with value: -0.3228992630528764 and parameters: {'classifier': 'RF', 'rf_max_depth': 6, 'rf_estimators': 174, 'rf_criterion': 'gini'}. Best is trial 0 with value: -0.31727899963711215.
[I 2023-07-05 17:34:08,783] Trial 3 finished with value: -0.31691897638487 and parameters: {'classifier': 'RF', 'rf_max_depth': 29, 'rf_estimators': 101, 'rf_criterion': 'gini'}. Best is trial

FrozenTrial(number=53, state=TrialState.COMPLETE, values=[-0.30428902018516873], datetime_start=datetime.datetime(2023, 7, 5, 17, 36, 5, 387550), datetime_complete=datetime.datetime(2023, 7, 5, 17, 36, 7, 491132), params={'classifier': 'RF', 'rf_max_depth': 19, 'rf_estimators': 263, 'rf_criterion': 'entropy'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RF',)), 'rf_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'rf_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'rf_criterion': CategoricalDistribution(choices=('gini', 'entropy'))}, trial_id=53, value=None)


In [18]:
study.best_params

{'classifier': 'RF',
 'rf_max_depth': 19,
 'rf_estimators': 263,
 'rf_criterion': 'entropy'}

In [20]:
#From a previous run with log loss of: 0.3035
best_rf_pipeline = make_pipeline(CategoricalTransformer(index_of_ej), 
                                 OutlierRemover(),
                                 IterativeImputer(initial_strategy='median'),
                                 RandomForestClassifier(random_state=42,
                                                        max_depth=7,
                                                        n_estimators=264,
                                                        criterion='entropy'))

In [21]:
best_rf_pipeline.fit(x_train, y_train)
log_loss(y_test, best_rf_pipeline.predict_proba(x_test))

0.22734551340825515

