In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the credit-card default dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
df = pd.read_csv(DATA_PATH)

In [None]:

# Preview the first five rows of the raw frame.
df.head(n=5)

### Dataset Information

This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005. 

#### Content

There are 25 variables:

ID: ID of each client

LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)

SEX: Gender (1=male, 2=female)

EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)

MARRIAGE: Marital status (1=married, 2=single, 3=others)

AGE: Age in years

PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 
8=payment delay for eight months, 9=payment delay for nine months and above)

PAY_2: Repayment status in August, 2005 (scale same as above)

PAY_3: Repayment status in July, 2005 (scale same as above)

PAY_4: Repayment status in June, 2005 (scale same as above)

PAY_5: Repayment status in May, 2005 (scale same as above)

PAY_6: Repayment status in April, 2005 (scale same as above)

BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)

BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)

BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)

BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)

BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)

BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)

PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)

PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)

PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)

PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)

PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)

PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)

default.payment.next.month: Default payment (1=yes, 0=no)

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [None]:
# Let pandas display up to 30 columns so the summary table below is not truncated.
pd.options.display.max_columns = 30

# Descriptive statistics for every column of the raw frame.
df.describe()

In [None]:
# Working copy of the data without the client identifier column.
df1 = df.drop(columns=['ID'])

In [None]:
# Class balance of the target: number of rows per default flag.
target_counts = df1['default.payment.next.month'].value_counts()
target_counts

In [None]:
# Pairwise Pearson correlation between all columns of the working frame.
df1.corr(method='pearson')

In [None]:
# Categorical / ordinal columns whose level counts are compared across the target.
cat=['SEX','EDUCATION','MARRIAGE',
     'PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6',
     'default.payment.next.month']
for i in cat:
    # The column is passed through the `x` keyword: recent seaborn releases treat the
    # first positional argument as `data`, so the positional form no longer selects
    # the column to count.
    sns.countplot(x=df1[i],hue=df1['default.payment.next.month'])
    plt.title(f'{i} by default.payment.next.month')
    plt.show()

In [None]:
# Frequency of each repayment-status code recorded in PAY_2.
df1.PAY_2.value_counts()

In [None]:
# Continuous columns whose distribution is compared between the two target classes.
num=['LIMIT_BAL','AGE',
     'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
     'PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']
for i in num:
    # Plot from df1 (the ID-free working frame) so this cell reads from the same
    # frame as the rest of the analysis.
    sns.boxplot(y=df1[i],x=df1['default.payment.next.month'])
    plt.title(f'{i} by default.payment.next.month')
    plt.show()

In [None]:
# Separate the target column from the predictors.
y = df1['default.payment.next.month']
x = df1.drop(columns=['default.payment.next.month'])

In [None]:


from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every predictor in x.
# An explicit intercept column is prepended because variance_inflation_factor regresses
# each column on the remaining columns of the matrix it is given and does not add a
# constant itself; without one the regression is forced through the origin and the
# resulting VIFs are inflated.
design=np.column_stack([np.ones(len(x)),x.values])

# Column 0 of `design` is the intercept, so feature j sits at design column j + 1.
vif=[variance_inflation_factor(design,i) for i in range(1,design.shape[1])]

pd.DataFrame(vif,index=x.columns,columns=['VIF'])

# Classification Models

## No Changes 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split,RandomizedSearchCV,KFold,cross_val_score
from sklearn.metrics import roc_curve,roc_auc_score,accuracy_score,classification_report,confusion_matrix

In [None]:
def mod_eval(algo,x,y):
    """Print the mean 10-fold cross-validated ROC AUC and accuracy of an estimator.

    Parameters
    ----------
    algo : estimator handed to scikit-learn's cross-validation helper.
    x : feature matrix.
    y : target vector.

    Returns
    -------
    None. The two averaged scores are printed.
    """
    # Imported here so the function does not rely on the notebook's import cell
    # listing cross_validate.
    from sklearn.model_selection import cross_validate

    # A single cross-validation pass scores both metrics on the same fitted models,
    # instead of refitting all ten folds once per metric.
    scores=cross_validate(algo,x,y,scoring=['roc_auc','accuracy'],cv=10)

    print('10-fold auc_score',np.mean(scores['test_roc_auc']))
    print('10-fold accuracy',np.mean(scores['test_accuracy']))
    
    

In [None]:
def rand_search(algo,params,features=None,target=None,cv=5):
    """Run a 100-iteration randomized hyper-parameter search scored by ROC AUC.

    Parameters
    ----------
    algo : estimator to tune.
    params : mapping of parameter name to a list or distribution to sample from.
    features : feature matrix to fit on; when omitted the notebook-level `x` is used.
    target : target vector to fit on; when omitted the notebook-level `y` is used.
    cv : number of cross-validation folds (default 5).

    Returns
    -------
    The best parameter setting found (`best_params_`). The best mean
    cross-validated score is printed as a side effect.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if features is None:
        features=x
    if target is None:
        target=y

    rs=RandomizedSearchCV(algo,param_distributions=params,random_state=0,n_jobs=-1,
                          n_iter=100,scoring='roc_auc',cv=cv)
    mod=rs.fit(features,target)
    print(mod.best_score_)
    return mod.best_params_

In [None]:
# Search space for the random forest: integer ranges are sampled with scipy's
# randint, the split criterion is picked from an explicit list.
rfc_params=dict(
    n_estimators=st.randint(50,300),
    criterion=['gini','entropy'],
    max_depth=st.randint(2,20),
    min_samples_split=st.randint(2,100),
    min_samples_leaf=st.randint(2,100),
)

# Search space for LightGBM: tree-shape settings are integer ranges, the learning
# rate and minimum split gain are drawn from continuous uniform distributions.
lgb_params=dict(
    num_leaves=st.randint(31,60),
    max_depth=st.randint(2,20),
    learning_rate=st.uniform(0,1),
    n_estimators=st.randint(50,300),
    min_split_gain=st.uniform(0,0.3),
)

In [None]:
# Tune the random forest on the original data. The estimator is seeded so that
# repeated runs of the search score the same forests and return the same best setting.
rbp=rand_search(RandomForestClassifier(random_state=0),rfc_params)

In [None]:
# Tune LightGBM on the original data and keep the best parameter setting.
lbp=rand_search(algo=LGBMClassifier(),params=lgb_params)

In [None]:
# Candidate classifiers for the original data: the two tuned ensembles use the
# best settings returned by the searches, the other two are left at their defaults
# apart from the logistic-regression solver.
tuned_rf=RandomForestClassifier(**rbp)
tuned_lgbm=LGBMClassifier(**lbp)

models={
    'Logistic Regression':LogisticRegression(solver='liblinear'),
    'Random Forest':tuned_rf,
    'Light GBM(Boosting)':tuned_lgbm,
    'Gausian Naive Bayes':GaussianNB(),
}

In [None]:
# Cross-validate every candidate on the original data, printing its name first.
for model_name,estimator in models.items():
    print(model_name,'\n')
    mod_eval(estimator,x,y)

## Let us try SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE using a sampling_strategy of 0.5 and a fixed seed.
# NOTE(review): the whole of x / y is resampled here, before any train/test or
# cross-validation split, so every later split of x_sm / y_sm contains synthetic rows.
sm = SMOTE(random_state=7, sampling_strategy=0.5)
x_sm, y_sm = sm.fit_resample(x, y)

# Shapes of the resampled features and target.
print(x_sm.shape, y_sm.shape)

In [None]:
# Class counts of the target after SMOTE resampling.
y_sm.value_counts()

In [None]:
def rand_search_sm(algo,params):
    """Randomized hyper-parameter search on the SMOTE-resampled data.

    Runs 100 sampled settings with 10-fold cross-validation scored by ROC AUC on
    the notebook-level `x_sm` / `y_sm`, prints the best mean score and returns the
    corresponding parameter setting.

    Note: `x_sm` / `y_sm` are resampled before this function is called, so the
    validation folds are drawn from data that already includes synthetic rows.
    """
    search=RandomizedSearchCV(
        estimator=algo,
        param_distributions=params,
        n_iter=100,
        scoring='roc_auc',
        cv=10,
        n_jobs=-1,
        random_state=0,
    )
    fitted=search.fit(x_sm,y_sm)
    print(fitted.best_score_)
    return fitted.best_params_

In [None]:
# Tune the random forest on the SMOTE-resampled data. The estimator is seeded so
# that repeated runs of the search return the same best setting.
rbp_sm=rand_search_sm(RandomForestClassifier(random_state=0),rfc_params)

In [None]:
# Tune LightGBM on the SMOTE-resampled data and keep the best parameter setting.
lbp_sm=rand_search_sm(algo=LGBMClassifier(),params=lgb_params)

In [None]:
# Candidate classifiers for the SMOTE-resampled data, using the settings tuned on it.
tuned_rf_sm=RandomForestClassifier(**rbp_sm)
tuned_lgbm_sm=LGBMClassifier(**lbp_sm)

models_sm={
    'Logistic Regression':LogisticRegression(solver='liblinear'),
    'Random Forest':tuned_rf_sm,
    'Light GBM(Boosting)':tuned_lgbm_sm,
    'Gausian Naive Bayes':GaussianNB(),
}

In [None]:
# Cross-validate every candidate on the SMOTE-resampled data, printing its name first.
for model_name,estimator in models_sm.items():
    print(model_name,'\n')
    mod_eval(estimator,x_sm,y_sm)

It is clear that Random Forest and Light GBM give the best results. Hence we can continue with these two algorithms.

By using SMOTE we have seen a significant improvement in AUC, but the accuracy is lower.

In [None]:
def model_eval(algo,x,y,test_size=0.3,random_state=3):
    """Fit an estimator on a train split and report train/test metrics and a ROC curve.

    Parameters
    ----------
    algo : estimator exposing `fit`, `predict` and `predict_proba`.
    x : feature matrix.
    y : target vector.
    test_size : fraction of rows held out for testing (default 0.3).
    random_state : seed of the train/test split (default 3).

    Returns
    -------
    None. Accuracy, confusion matrix and AUC are printed for both splits, the
    classification report is printed for the test split, and the test ROC curve
    is drawn.
    """
    x_train,x_test,y_train,y_test=train_test_split(
        x,y,test_size=test_size,random_state=random_state)

    mod=algo.fit(x_train,y_train)

    # Metrics on the data the model was fitted on.
    train_pred=mod.predict(x_train)
    train_prob=mod.predict_proba(x_train)[:,1]  # column 1 of predict_proba

    print('overall accuracy -Train: ',accuracy_score(y_train,train_pred))
    print('confusion matrix:\n',confusion_matrix (y_train,train_pred))
    print('AUC-train:',roc_auc_score(y_train,train_prob))

    # Metrics on the held-out split.
    test_pred=mod.predict(x_test)
    test_prob=mod.predict_proba(x_test)[:,1]

    print('overall accuracy -Test: ',accuracy_score(y_test,test_pred))
    print('confusion matrix:\n',confusion_matrix (y_test,test_pred))
    print('AUC-Test:',roc_auc_score(y_test,test_prob))
    print('Classification Report \n',classification_report(y_test,test_pred))

    # ROC curve of the held-out split, drawn on an explicit axes so the figure is
    # labelled and self-explanatory; the thresholds returned by roc_curve are unused.
    fpr,tpr,_=roc_curve(y_test,test_prob)
    fig,ax=plt.subplots()
    ax.plot(fpr,tpr,label='ROC curve')
    ax.plot([0,1],[0,1],linestyle='--',label='Chance')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title(f'Test ROC curve - {type(algo).__name__}')
    ax.legend()
    plt.show()

In [None]:
# Hold-out evaluation of the tuned random forest on the original data.
model_eval(algo=RandomForestClassifier(**rbp),x=x,y=y)

In [None]:
# Hold-out evaluation of the tuned random forest on the SMOTE-resampled data.
# NOTE(review): x_sm / y_sm were oversampled before this call, so the test split
# made inside model_eval is drawn from data that includes synthetic rows.
model_eval(algo=RandomForestClassifier(**rbp_sm),x=x_sm,y=y_sm)

In [None]:
# Hold-out evaluation of the tuned LightGBM model on the original data.
model_eval(algo=LGBMClassifier(**lbp),x=x,y=y)

In [None]:
# Hold-out evaluation of the tuned LightGBM model on the SMOTE-resampled data.
# NOTE(review): x_sm / y_sm were oversampled before this call, so the test split
# made inside model_eval is drawn from data that includes synthetic rows.
model_eval(algo=LGBMClassifier(**lbp_sm),x=x_sm,y=y_sm)

# Results

### With no changes

#### Random Forest 

accuracy = 0.8282222222222222

auc score = 0.7909824139986402

#### Light GBM

accuracy = 0.8293333333333334

auc score = 0.7881805926211242

### Using SMOTE

#### Random Forest 

accuracy = 0.7924671866083317

auc score = 0.83868289770667

#### Light GBM

accuracy = 0.8028343161498954

auc score = 0.8452131366800507

## Conclusion

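We can see that SMOTE solves the problem created by imbalanced data to some extent. It significantly increases recall and also the AUC score, thus increasing the possibility of choosing a better threshold according to the business requirements. Note, however, that SMOTE was applied to the full dataset before it was split, so the held-out data used for the SMOTE scores contains synthetic samples; those scores are therefore likely optimistic and should be confirmed on a test set that was set aside before resampling.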