# Importing Libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



# models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# model tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# ensembles
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier


# class imbalance
from sklearn.dummy import DummyClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# evaluating models
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# import xgboost 


# reading in the dataframe
raw_data = pd.read_csv('dataset.csv',index_col=0)

# Cleaning the data

In [26]:
def clean_data(raw_data):
    
    raw_data['MARRIAGE'].replace(0,3,inplace=True)
    raw_data['EDUCATION'].replace([0,5,6],4,inplace=True)
    
    raw_data.drop('PAY_6',axis = 1,inplace=True)
    raw_data['PAY_5'] = np.where(raw_data['PAY_AMT5'] + raw_data['PAY_AMT6'] >= raw_data['BILL_AMT6'],0,1)
    raw_data['PAY_4'] = np.where(raw_data['PAY_AMT4'] >= raw_data['BILL_AMT5'],0,raw_data['PAY_5']+1)
    raw_data['PAY_3'] = np.where(raw_data['PAY_AMT3'] >= raw_data['BILL_AMT4'],0,raw_data['PAY_4']+1)
    raw_data['PAY_2'] = np.where(raw_data['PAY_AMT2'] >= raw_data['BILL_AMT3'],0,raw_data['PAY_3']+1)
    raw_data['PAY_0'] = np.where(raw_data['PAY_AMT1'] >= raw_data['BILL_AMT2'],0,raw_data['PAY_2']+1)
    
    
    return raw_data
df = clean_data(raw_data)

In [27]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,200000,2,3,1,30,5,4,3,2,1,...,147273,149244,151973,6600,6000,5860,6000,5000,0,0
1,200000,2,4,2,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20000,2,2,2,28,4,3,2,1,0,...,8168,6894,11424,3353,3,5101,6,6530,8103,1
3,50000,1,2,2,23,2,1,0,2,1,...,48437,18712,19129,4175,41000,51705,700,718,700,0
4,20000,1,2,1,47,0,0,0,0,0,...,0,0,0,780,0,0,0,0,0,0


# Modeling begins

### Splitting target and features & splitting for training

In [28]:
X = df.drop('default payment next month',axis=1)
y = df['default payment next month']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25,random_state=10)

Base Logistic Regressor

In [29]:
# Using "balanced" to handle class imbalance
logit_base = LogisticRegression(class_weight='balanced',solver='liblinear')

logit_base.fit(X_train,y_train)

logit_base_preds = logit_base.predict(X_test)

print('The f1 score for the base Logistic regressor is:',f1_score(y_test,logit_base_preds))

The f1 score for the base Logistic regressor is: 0.4120370370370371


Base KNN

In [30]:
knn_base = KNeighborsClassifier()

knn_base.fit(X_train,y_train)

knn_base_preds = knn_base.predict(X_test)

print('The f1 score for the base KNN regressor is:',f1_score(y_test,knn_base_preds))

The f1 score for the base KNN regressor is: 0.24973544973544975


Base Decision tree


In [32]:
dt_base = DecisionTreeClassifier()

dt_base.fit(X_train,y_train)

dt_base_preds = knn_base.predict(X_test)

print('The f1 score for the base Decision tree regressor is:',f1_score(y_test,dt_base_preds))
print('The decision tree base precision is:',precision_score(y_test,dt_base_preds))
print('The decision tree base recall is:',recall_score(y_test,dt_base_preds))
print('The decision tree base accuracy is:',accuracy_score(y_test,dt_base_preds))

The f1 score for the base Decision tree regressor is: 0.24973544973544975
The decision tree base precision is: 0.36875
The decision tree base recall is: 0.1888
The decision tree base accuracy is: 0.7479111111111111


In [60]:
param_grid_log = {
                   'tol':[0.0001,.001],
                   'C':[1.0,10.], 
                  'class_weight':['balanced'],
                  'random_state':[1,10],
                  'solver':['lbfgs', 'liblinear', 'sag', 'saga'], 
                  'max_iter':[100,200]}

In [61]:
grid_logit = GridSearchCV(LogisticRegression(),param_grid_log,cv=5,n_jobs=-1)
grid_logit.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [1.0, 10.0], 'class_weight': ['balanced'],
                         'max_iter': [100, 200], 'random_state': [1, 10],
                         'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
                         'tol': [0.0001, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False

In [62]:
grid_log_pred = grid_logit.best_estimator_.predict(X_test)

In [63]:
f1_score(y_test,grid_log_pred)

0.3328467153284671

Random forest time

In [65]:
rf = RandomForestClassifier(random_state = 1, n_estimators=100, max_depth=1, max_features=4)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=1, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [71]:
rf_preds = rf.predict(X_test)

In [76]:
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [1,2,3,4],
    'max_features': [2,3,4,5],
    'criterion': ['gini','entropy']
}

In [80]:
gs = GridSearchCV(RandomForestClassifier(class_weight='balanced'),param_grid,cv=5,verbose=1,scoring='f1',n_jobs=-1)
gs.fit(X_train,y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True,
                                              class_weight='balanced',
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                             

In [81]:
f1_score(y_test, gs.best_estimator_.predict(X_test))

0.4459124690338563

In [82]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=4, max_features=3,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

## Validation set

In [99]:
val_set = pd.read_csv('validation_set.csv',index_col=0)
val_set = clean_data(val_set)

In [100]:
gs_preds_val = gs.best_estimator_.predict(val_set)

In [101]:
pd.DataFrame(gs_preds_val).to_csv('gs_rf_preds.csv')

In [109]:
pd.read_csv('gs_rf_preds.csv',index_col=0).describe()

Unnamed: 0,0
count,7500.0
mean,0.407067
std,0.49132
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0
