In [1]:
from collections import Counter
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import precision_score, accuracy_score, recall_score,f1_score

# Data parser

In [2]:
Train = pd.read_csv('../Dataset/Train.csv')
Test = pd.read_csv('../Dataset/Test.csv')

x_train = np.array(Train.iloc[:,:-1])
x_test = np.array(Test.iloc[:,:-1])

y_train = Train.iloc[:,-1]
y_test = Test.iloc[:,-1]

print(x_train.shape, x_test.shape)

(31167, 47) (988, 47)


# XGBoost Classifier

In [3]:
class XGBoostClassifier:
    def __init__(self, params):
        self.xgb_clf = XGBClassifier(**params)

    def fit(self, x_train, y_train, oversampling=False):
        if oversampling:
            print('Previous dataset shape %s' % Counter(y_train))
            sm = BorderlineSMOTE(random_state=42)
            x_train, y_train = sm.fit_resample(x_train, y_train)
            print('Resampled dataset shape %s' % Counter(y_train))
        self.xgb_clf.fit(x_train, y_train)

    def predict(self, x_test):
        predictions = self.xgb_clf.predict(x_test)
        return predictions

    def predict_proba(self, x_test):
        proba = self.xgb_clf.predict_proba(x_test)
        return proba
    
    def evaluate(self, x_test, y_test):
        y_pred = self.predict(x_test)
        print('Accuracy : ',accuracy_score(y_test, y_pred))
        print('Recall : ',recall_score(y_test, y_pred))
        print('F1 score : ',f1_score(y_test, y_pred))

In [4]:
params = {'n_estimators': 80, 'learning_rate': 0.5,
          'max_depth': 3, 'random_state': 0,
         'n_jobs':-1}

XGB = XGBoostClassifier(params)

# with oversampling

In [5]:
XGB.fit(x_train, y_train, oversampling=True)
XGB.evaluate(x_test, y_test)

Previous dataset shape Counter({0: 24254, 1: 6913})
Resampled dataset shape Counter({1: 24254, 0: 24254})
Accuracy :  0.8228744939271255
Recall :  0.828125
F1 score :  0.7078464106844741


# without oversampling

In [6]:
XGB.fit(x_train, y_train, oversampling=False)
XGB.evaluate(x_test, y_test)

Accuracy :  0.8846153846153846
Recall :  0.58984375
F1 score :  0.7259615384615384
