In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import precision_score, accuracy_score, recall_score,f1_score

# Data Parser

In [2]:
Train = pd.read_csv('../Dataset/Train.csv')
Test = pd.read_csv('../Dataset/Test.csv')

x_train = np.array(Train.iloc[:,:-1])
x_test = np.array(Test.iloc[:,:-1])

y_train = Train.iloc[:,-1]
y_test = Test.iloc[:,-1]

print(x_train.shape, x_test.shape)

(31167, 47) (988, 47)


# Random Forest Classifier

- [x] Calibration
- [x] Oversampling
- [x] Parameter Tuning

In [3]:
class RFClassifier:
    def __init__(self,model, parameters=None, calibrate=False):
        self.model=model
        self.parameters = parameters
        self.calibrate = calibrate
        self.best_param = None
        self.scorer_function = None
        self.y_pred = None
        
#         if self.parameters is not None:
#             self.RF_model = RandomForestClassifier(**self.parameters)
#         else:
#             print('using default parameters')
#             self.RF_model = RandomForestClassifier(n_jobs=-1)
        
        
    def fit(self, x_train, y_train,
            scorer_function=accuracy_score, 
            oversampling=False, 
            tune_param=True,
           n_iter=100):
        self.scorer_function = scorer_function
        if oversampling:
            print('Previous dataset shape %s' % Counter(y_train))
            sm = BorderlineSMOTE(random_state=42)
            x_train, y_train = sm.fit_resample(x_train, y_train)
            print('Resampled dataset shape %s' % Counter(y_train))
            
        if tune_param:
            self.parameter_tune(n_iter=n_iter)
            self.model = self.model(**self.best_param)
            
        else:
            if self.parameters is not None:
                self.model = self.model(self.parameters)
            else:
                self.model = self.model(n_jobs=-1)
        
        self.model.fit(x_train, y_train)
#         if self.calibrate:
#             self.RF_model = CalibratedClassifierCV(self.RF_model)
#             self.RF_model.fit(x_train, y_train)

    def predict(self, x_test):
        self.y_pred = self.model.predict(x_test)
        return self.y_pred

    def predict_proba(self, x_test):
        proba = self.model.predict_proba(x_test)
        return proba
    
    def evaluate(self, x_test, y_test, metric=[accuracy_score, recall_score, f1_score]):
#         y_pred = self.predict(x_test)
        for met in metric:
            print(str(met.__name__),' : ',met(y_test, self.y_pred))

        
    def parameter_tune(self, n_iter=100):
        
#         self.default_params = {'max_depth' : [2, 5],
#                                 'min_samples_split': [2, 3, 5],
#                                 'min_samples_leaf': [1, 2, 3],
#                                 'class_weight':['balanced', None],
#                                 'random_state':[135],
#                                 'n_estimators': [75, 100],
#                                 'n_jobs':-1}
#         self.default_params.update(self.parameters)
        self.default_params = self.parameters
        self.study = optuna.create_study(direction='maximize')
        self.study.optimize(self.objective, n_trials=n_iter)
        self.trial = self.study.best_trial
        self.best_param = self.trial.params
        
        
    def objective(self, trial):
        max_depth = trial.suggest_categorical('max_depth', self.default_params['max_depth'])
        min_samples_split = trial.suggest_categorical('min_samples_split', self.default_params['min_samples_split'])
        min_samples_leaf = trial.suggest_categorical('min_samples_leaf', self.default_params['min_samples_leaf'])
        class_weight = trial.suggest_categorical('class_weight', self.default_params['class_weight'])
        n_estimators = trial.suggest_categorical('n_estimators', self.default_params['n_estimators'])
        n_jobs = self.default_params['n_jobs']

        clf = self.model(n_estimators=n_estimators, max_depth=max_depth,
                                     min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,class_weight=class_weight,
                                    n_jobs=n_jobs)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        return self.scorer_function(y_test, y_pred)



# Optuna Test

In [None]:
params = {'max_depth' : [2, 5, 8, 10, None],
'min_samples_split': [2, 3, 5, 10],
'min_samples_leaf': [1, 2, 3, 5, 10],
'class_weight':['balanced', None],
'random_state':[135],
'n_estimators': [50, 75, 100],
'n_jobs':-1}

clf = RFClassifier(RandomForestClassifier, parameters=params, calibrate=True)
clf.fit(x_train, y_train, oversampling=False, tune_param=True, n_iter=10)
clf.predict(x_test)
clf.evaluate(x_test, y_test)

# xgboost

In [4]:
params = {'n_estimators': [50, 75, 100], 'learning_rate': [0.1,0.5],
          'max_depth': [3,5,10],
          'random_state': 0,
         'n_jobs':-1}


clf = RFClassifier(XGBClassifier, parameters=params, calibrate=True)
clf.fit(x_train, y_train, oversampling=False, tune_param=True, n_iter=10)
clf.predict(x_test)
clf.evaluate(x_test, y_test)

[32m[I 2022-06-21 10:41:10,614][0m A new study created in memory with name: no-name-0901cf72-7baa-4ddb-8d00-222a260aae2a[0m
[33m[W 2022-06-21 10:41:10,616][0m Trial 0 failed because of the following error: KeyError('min_samples_split')[0m
Traceback (most recent call last):
  File "/home/aci/anaconda3/envs/test_env/lib/python3.8/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_57663/3850936899.py", line 77, in objective
    min_samples_split = trial.suggest_categorical('min_samples_split', self.default_params['min_samples_split'])
KeyError: 'min_samples_split'


KeyError: 'min_samples_split'

In [None]:
parameters = {'bootstrap': True,
              'n_estimators': 50,
              'max_features': 'sqrt',
              'min_samples_split': 2,
              'max_depth': None,
              'min_samples_leaf': 50,
              'max_leaf_nodes': None,
              'n_jobs':-1,
              'random_state':135}

clf = RFClassifier(RandomForestClassifier, parameters=params, calibrate=True)
clf.fit(x_train, y_train, oversampling=False, tune_param=False, n_iter=10)
clf.evaluate(x_test, y_test, metric=[accuracy_score, recall_score, f1_score])

# with oversampling

In [None]:
clf.fit(x_train, y_train, oversampling=True)
clf.evaluate(x_test, y_test, metric=[accuracy_score, recall_score, f1_score])

# without oversampling

In [None]:
clf.fit(x_train, y_train, oversampling=False)
clf.evaluate(x_test, y_test, metric=[accuracy_score, recall_score, f1_score])