In [1]:
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import precision_score, accuracy_score, recall_score,f1_score

# Data Parser

In [2]:
Train = pd.read_csv('../Dataset/Train.csv')
Test = pd.read_csv('../Dataset/Test.csv')

x_train = np.array(Train.iloc[:,:-1])
x_test = np.array(Test.iloc[:,:-1])

y_train = Train.iloc[:,-1]
y_test = Test.iloc[:,-1]

print(x_train.shape, x_test.shape)

(31167, 47) (988, 47)


# Random Forest Classifier

- [x] Calibration
- [x] Oversampling
- [x] Parameter Tuning

In [3]:
class ClassifierModel:
    def __init__(self,model, parameters=None):
        self.model=model
        print('Model Selected : ',self.model.__name__)
        self.parameters = parameters
        self.best_param = None
        self.scorer_function = None
        self.y_pred = None
        
        
    def fit(self, x_train, y_train,
            scorer_function=accuracy_score, 
            oversampling=False, 
            tune_param=True,
           n_iter=100):
        self.scorer_function = scorer_function
        if oversampling:
            print('Previous dataset shape %s' % Counter(y_train))
            sm = BorderlineSMOTE(random_state=42)
            x_train, y_train = sm.fit_resample(x_train, y_train)
            print('Resampled dataset shape %s' % Counter(y_train))
            
        if tune_param:
            self.parameter_tune(n_iter=n_iter)
            self.best_param['n_jobs']=-1
            self.model = self.model(**self.best_param)
            
        else:
            if self.parameters is not None:
                self.model = self.model(**self.parameters)
            else:
                self.model = self.model(n_jobs=-1)
        
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        self.y_pred = self.model.predict(x_test)
        return self.y_pred

    def predict_proba(self, x_test):
        proba = self.model.predict_proba(x_test)
        return proba
    
    def evaluate(self, x_test, y_test, metric=[accuracy_score, recall_score, f1_score]):
        self.y_pred = self.predict(x_test)
        for met in metric:
            print(str(met.__name__),' : ',met(y_test, self.y_pred))

        
    def parameter_tune(self, n_iter=100):
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        self.default_params = self.parameters
        self.study = optuna.create_study(direction='maximize')
        self.study.optimize(self.objective, n_trials=n_iter,show_progress_bar=True)
        self.trial = self.study.best_trial
        self.best_param = self.trial.params
        
        
    def objective(self, trial):
        parameter_dict = {}
        for key, value in self.default_params.items():
            parameter_dict[key] = trial.suggest_categorical(key, value)

        parameter_dict['n_jobs']=-1
        clf = self.model(**parameter_dict)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        return self.scorer_function(y_test, y_pred)



# Parameter tuning Random Forest

In [4]:
params_list_RF = {'max_depth' : [2, 5, 8, 10, None],
                    'min_samples_split': [2, 3, 5, 10],
                    'min_samples_leaf': [1, 2, 3, 5, 10],
                    'class_weight':['balanced', None],
                    'random_state':[135],
                    'n_estimators': [50, 75, 100]}

clf = ClassifierModel(RandomForestClassifier, parameters=params_list_RF)
clf.fit(x_train, y_train, oversampling=False, tune_param=True, n_iter=10)
clf.evaluate(x_test, y_test)

Model Selected :  RandomForestClassifier


  0%|          | 0/10 [00:00<?, ?it/s]

accuracy_score  :  0.881578947368421
recall_score  :  0.75
f1_score  :  0.7664670658682634


# xgboost

In [5]:
params_list_xgb = {'n_estimators': [50, 75, 100], 
          'learning_rate': [0.1, 0.5],
          'max_depth': [3, 5, 10]}


clf = ClassifierModel(XGBClassifier, parameters=params_list_xgb)
clf.fit(x_train, y_train, oversampling=False, tune_param=True, n_iter=10)
clf.evaluate(x_test, y_test)

Model Selected :  XGBClassifier


  0%|          | 0/10 [00:00<?, ?it/s]

accuracy_score  :  0.8896761133603239
recall_score  :  0.58984375
f1_score  :  0.734793187347932


# without oversampling and parameter tuning

In [6]:
parameters = {'bootstrap': True,
              'n_estimators': 50,
              'max_features': 'sqrt',
              'min_samples_split': 2,
              'max_depth': None,
              'min_samples_leaf': 50,
              'max_leaf_nodes': None,
              'n_jobs':-1,
              'random_state':135}

clf = ClassifierModel(RandomForestClassifier, parameters=parameters)
clf.fit(x_train, y_train, oversampling=False, tune_param=False, n_iter=10)
clf.evaluate(x_test, y_test, metric=[accuracy_score, recall_score, f1_score])

Model Selected :  RandomForestClassifier
accuracy_score  :  0.8765182186234818
recall_score  :  0.5390625
f1_score  :  0.6934673366834172


# with oversampling

In [7]:
clf = ClassifierModel(RandomForestClassifier, parameters=parameters)
clf.fit(x_train, y_train, oversampling=True, tune_param=False, n_iter=10)
clf.evaluate(x_test, y_test, metric=[accuracy_score, recall_score, f1_score])

Model Selected :  RandomForestClassifier
Previous dataset shape Counter({0: 24254, 1: 6913})
Resampled dataset shape Counter({1: 24254, 0: 24254})
accuracy_score  :  0.8502024291497976
recall_score  :  0.7890625
f1_score  :  0.7318840579710145


# oversampling + parameter tuning

In [8]:
clf = ClassifierModel(RandomForestClassifier, parameters=params_list_RF)
clf.fit(x_train, y_train, oversampling=True, tune_param=True, n_iter=10)
clf.evaluate(x_test, y_test, metric=[accuracy_score, recall_score, f1_score])

Model Selected :  RandomForestClassifier
Previous dataset shape Counter({0: 24254, 1: 6913})
Resampled dataset shape Counter({1: 24254, 0: 24254})


  0%|          | 0/10 [00:00<?, ?it/s]

accuracy_score  :  0.8441295546558705
recall_score  :  0.77734375
f1_score  :  0.7210144927536232
