# Dealing With Imbalanced Classes: SMOTE

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [22]:
df = pd.read_excel('../data/default of credit card clients.xls') 
df = df.iloc[1:, 1:]
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [23]:
X = df.loc[:,'X1':'X23']
y = df.loc[:, 'Y'].astype('category')

In [24]:
from collections import Counter
Counter(y).items()

dict_items([(1, 6636), (0, 23364)])

In [25]:
from imblearn.over_sampling import SMOTE

X_smote, y_smote = SMOTE().fit_sample(X, y)
Counter(y_smote).items()

dict_items([(1, 23364), (0, 23364)])

# Split the data into training and test sets

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_smote,
                                                 y_smote,
                                                 test_size=.25,
                                                 random_state=123)

# Decision Tree

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

#params = {'max_depth':range(1,30)}
#gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=10)
#gscv.fit(X_train,y_train)
#print(gscv.best_params_)

{'max_depth': 10}


In [8]:
#best param output:{'max_depth': 10}

In [16]:
dtc = DecisionTreeClassifier(max_depth=10)
dtc.fit(X_train,y_train)

print(f'training accuracy: {dtc.score(X_train,y_train):0.4f}')
print(f'test accuracy: {dtc.score(X_test,y_test):0.4f}')
print(f'AUC: {roc_auc_score(y_test, dtc.predict(X_test)):0.4f}')

training accuracy: 0.8693
test accuracy: 0.8384
AUC: 0.8393


# GradientBoosting

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

#params = {'learning_rate':[0.1,0.05,0.01],
#          'n_estimators':[10,50,100],
#          'max_depth':[1,2,5,7],
#          'min_samples_split': [2,3,4],
#          'min_samples_leaf':[1,3,5]}
#gscv = GridSearchCV(GradientBoostingClassifier(), params, n_jobs=-1)
#gscv.fit(X_train,y_train)
#print(gscv.best_params_)

{'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}


In [None]:
# best params output{'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}

In [17]:
gbc = GradientBoostingClassifier(learning_rate=0.1,
                                 max_depth=7,
                                 min_samples_leaf=1,
                                 min_samples_split=4,
                                 n_estimators=100,
                                 )
gbc.fit(X_train,y_train)

print(f'training accuracy: {gbc.score(X_train,y_train):0.4f}')
print(f'test accuracy: {gbc.score(X_test,y_test):0.4f}')
print(f'AUC: {roc_auc_score(y_test, gbc.predict(X_test)):0.4f}')

training accuracy: 0.9044
test accuracy: 0.8718
AUC: 0.8726
