In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

# Data Parser

In [2]:
# Read the pre-split train/test tables. In both, the last column is used as
# the target and every column before it as a feature.
Train, Test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Dataset/Train.csv', '../Dataset/Test.csv')
)

# Targets stay as pandas Series; features are converted to NumPy arrays.
y_train, y_test = Train.iloc[:, -1], Test.iloc[:, -1]
x_train, x_test = (np.array(frame.iloc[:, :-1]) for frame in (Train, Test))

print(x_train.shape, x_test.shape)

(31167, 47) (988, 47)


# Random Forest Classifier

- [x] Calibration
- [x] Oversampling

In [3]:
class RFClassifier:
    """Random-forest classifier with optional oversampling and calibration.

    Wraps ``RandomForestClassifier``. ``fit`` can balance the training set
    with ``BorderlineSMOTE`` first, and when ``calibrate`` is enabled it also
    trains a ``CalibratedClassifierCV`` around the forest; prediction methods
    then use the calibrated model instead of the raw forest.
    """

    def __init__(self, parameters=None, calibrate=False, random_state=42):
        """Create the (unfitted) underlying forest.

        Args:
            parameters: Optional mapping of keyword arguments forwarded to
                ``RandomForestClassifier`` (e.g. ``{'n_estimators': 200}``).
                ``None`` or an empty mapping keeps scikit-learn's defaults.
            calibrate: When true, ``fit`` additionally trains a
                ``CalibratedClassifierCV`` and ``predict`` /
                ``predict_proba`` use it instead of the raw forest.
            random_state: Seed for the forest (unless ``parameters`` supplies
                its own ``random_state``) and for the oversampler, so that
                repeated runs produce the same model.
        """
        # Explicit entries in `parameters` win over the default seed.
        rf_kwargs = {'random_state': random_state}
        rf_kwargs.update(parameters or {})

        self.random_state = random_state
        self.RF_model = RandomForestClassifier(**rf_kwargs)
        self.calibrate = calibrate
        self.sig_RF = None  # set by fit() when calibration is enabled

    def fit(self, x_train, y_train, oversampling=False):
        """Train the forest, optionally on a SMOTE-balanced training set.

        Args:
            x_train: Training feature matrix.
            y_train: Training labels aligned with ``x_train``.
            oversampling: When true, resample the training data with
                ``BorderlineSMOTE`` before fitting and print the class
                counts before and after resampling.
        """
        if oversampling:
            print('Previous dataset shape %s' % Counter(y_train))
            sampler = BorderlineSMOTE(random_state=self.random_state)
            x_train, y_train = sampler.fit_resample(x_train, y_train)
            print('Resampled dataset shape %s' % Counter(y_train))

        self.RF_model.fit(x_train, y_train)

        # Discard a calibrator left over from an earlier fit() so a stale
        # model can never be used after refitting.
        self.sig_RF = None
        if self.calibrate:
            # NOTE(review): the calibrator is trained on the same (possibly
            # oversampled) data as the forest; oversampling changes the class
            # balance, so calibrated probabilities may not match the original
            # distribution - confirm this is intended.
            self.sig_RF = CalibratedClassifierCV(self.RF_model)
            self.sig_RF.fit(x_train, y_train)

    def _active_model(self):
        """Return the estimator that answers prediction requests."""
        if not self.calibrate:
            return self.RF_model
        calibrated = getattr(self, 'sig_RF', None)
        if calibrated is None:
            raise AttributeError(
                'calibrated model is not available; call fit() before predicting'
            )
        return calibrated

    def predict(self, x_test):
        """Return predicted class labels for ``x_test``."""
        return self._active_model().predict(x_test)

    def predict_proba(self, x_test):
        """Return predicted class probabilities for ``x_test``."""
        return self._active_model().predict_proba(x_test)

    def evaluate(self, x_test, y_test):
        """Print accuracy, recall and F1 of the predictions on a labelled set.

        Recall and F1 are computed with scikit-learn's default settings.

        Args:
            x_test: Feature matrix to predict on.
            y_test: True labels aligned with ``x_test``.
        """
        y_pred = self.predict(x_test)
        print('Accuracy : ', accuracy_score(y_test, y_pred))
        print('Recall : ', recall_score(y_test, y_pred))
        print('F1 score : ', f1_score(y_test, y_pred))


In [4]:
# Uncalibrated random forest with default hyper-parameters.
clf = RFClassifier(parameters=None, calibrate=False)

In [6]:
# Fit on a BorderlineSMOTE-balanced training set, then score on the test split.
clf.fit(x_train=x_train, y_train=y_train, oversampling=True)
clf.evaluate(x_test=x_test, y_test=y_test)

Previous dataset shape Counter({0: 24254, 1: 6913})
Resampled dataset shape Counter({1: 24254, 0: 24254})
Accuracy :  0.8694331983805668
Recall :  0.73828125
F1 score :  0.7455621301775148


In [7]:
# Baseline: refit on the training data as-is (no oversampling) and score on
# the same test split for comparison.
clf.fit(x_train=x_train, y_train=y_train, oversampling=False)
clf.evaluate(x_test=x_test, y_test=y_test)

Accuracy :  0.8836032388663968
Recall :  0.58984375
F1 score :  0.724220623501199
