In [25]:
import warnings
from imblearn.under_sampling import TomekLinks
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
from id3 import Id3Estimator
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [26]:
# Load the credit dataset (path is relative to the working directory) and preview the first rows
data = pd.read_csv('../3_credit_qmvd_cmvd.csv')
data.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies
0,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,6.0,1.0,228190.0,416746.0,>=1
1,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,18.0,1.0,297996.0,750090.0,0.0
2,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,9.0,0.0,256329.0,386958.0,0.0
3,Charged Off,206602.0,Short Term,7290.0,896857.0,10+ years,Home Mortgage,Debt Consolidation,16367.74,17.3,6.0,0.0,215308.0,272448.0,0.0
4,Fully Paid,217646.0,Short Term,730.0,1184194.0,< 1 year,Home Mortgage,Debt Consolidation,10855.08,19.6,13.0,1.0,122170.0,272052.0,>=1


In [27]:
# Column dtypes as parsed by read_csv; the `object` columns are the ones label-encoded further down.
data.dtypes

Loan Status                   object
Current Loan Amount          float64
Term                          object
Credit Score                 float64
Annual Income                float64
Years in current job          object
Home Ownership                object
Purpose                       object
Monthly Debt                 float64
Years of Credit History      float64
Number of Open Accounts      float64
Number of Credit Problems     object
Current Credit Balance       float64
Maximum Open Credit          float64
Bankruptcies                  object
dtype: object

In [28]:
# Bin and encode a copy, so the frame held in `data` keeps its raw values
data1 = data.copy(deep=True)

## Data Binning

In [29]:
# Quantile edges and bin labels reused by the binning cells that follow:
# cut points at the 20th, 60th and 80th percentiles give four labelled bins.
bins = [0, 0.2, 0.6, 0.8, 1]
group_names = ['1', '2', '3', '4']

# Current Loan Amount: replace the raw values with their quantile-bin label
data1['Current Loan Amount'] = pd.qcut(
    data1['Current Loan Amount'], q=bins, labels=group_names
)
data1['Current Loan Amount'].value_counts()

2    26996
1    13499
4    13498
3    13497
Name: Current Loan Amount, dtype: int64

In [30]:
# Credit Score: replace the raw values with their quantile-bin label
data1['Credit Score'] = pd.qcut(
    data1['Credit Score'], q=bins, labels=group_names
)
data1['Credit Score'].value_counts()

2    27121
1    13617
4    13444
3    13308
Name: Credit Score, dtype: int64

In [31]:
# Annual Income: replace the raw values with their quantile-bin label
data1['Annual Income'] = pd.qcut(
    data1['Annual Income'], q=bins, labels=group_names
)
data1['Annual Income'].value_counts()

2    26996
3    13499
1    13498
4    13497
Name: Annual Income, dtype: int64

In [32]:
# Monthly Debt: replace the raw values with their quantile-bin label
data1['Monthly Debt'] = pd.qcut(
    data1['Monthly Debt'], q=bins, labels=group_names
)
data1['Monthly Debt'].value_counts()

2    26996
4    13498
3    13498
1    13498
Name: Monthly Debt, dtype: int64

In [33]:
# Years of Credit History: replace the raw values with their quantile-bin label
data1['Years of Credit History'] = pd.qcut(
    data1['Years of Credit History'], q=bins, labels=group_names
)
data1['Years of Credit History'].value_counts()

2    26988
1    13593
3    13527
4    13382
Name: Years of Credit History, dtype: int64

In [34]:
# Number of Open Accounts: replace the raw values with their quantile-bin label
data1['Number of Open Accounts'] = pd.qcut(
    data1['Number of Open Accounts'], q=bins, labels=group_names
)
data1['Number of Open Accounts'].value_counts()

2    29238
1    16051
4    11541
3    10660
Name: Number of Open Accounts, dtype: int64

In [35]:
# Current Credit Balance: replace the raw values with their quantile-bin label
data1['Current Credit Balance'] = pd.qcut(
    data1['Current Credit Balance'], q=bins, labels=group_names
)
data1['Current Credit Balance'].value_counts()

2    26998
3    13498
1    13498
4    13496
Name: Current Credit Balance, dtype: int64

In [36]:
# Maximum Open Credit: replace the raw values with their quantile-bin label
data1['Maximum Open Credit'] = pd.qcut(
    data1['Maximum Open Credit'], q=bins, labels=group_names
)
data1['Maximum Open Credit'].value_counts()

2    26994
1    13500
4    13498
3    13498
Name: Maximum Open Credit, dtype: int64

### label encoding for categorical variables

In [37]:
def labelencoding(df, var_list):
    """Replace each listed column of ``df`` with its integer category codes.

    Each column is cast to pandas' ``category`` dtype and then overwritten
    with the resulting ``.cat.codes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    var_list : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the listed columns
        replaced by their codes.
    """
    for column in var_list:
        as_category = df[column].astype('category')
        df[column] = as_category.cat.codes
    return df

# Non-numeric (`object` dtype) columns to turn into integer codes
cols = [
    "Loan Status",
    "Term",
    "Years in current job",
    "Home Ownership",
    "Purpose",
    "Number of Credit Problems",
    "Bankruptcies",
]
data1 = labelencoding(data1, cols)

## Decision tree (ID3)


### Splitting the data into X and Y

In [38]:
# Target is the encoded loan status; every other column is a feature
Y = data1['Loan Status']
X = data1.drop(columns='Loan Status')

### Splitting the data into train and test sets

In [39]:
# Hold out 20% of the rows for testing; the split is stratified on Y and seeded
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=123,
    stratify=Y,
)

### balancing the data again

In [40]:
# Resample the training split only with TomekLinks; the test split is left as sampled.
# Note: this cell overwrites X_train / Y_train, so re-running it resamples the already-resampled data.
tl = TomekLinks()
X_train, Y_train = tl.fit_resample(X_train, Y_train)



In [41]:
# Fit the ID3 tree on the resampled training split, then predict the held-out rows
ID3 = Id3Estimator()
ID3.fit(X_train, Y_train)
y_predict = ID3.predict(X_test)


In [42]:
## accuracy function
def accuracy(confusion_matrix):
    """Return the share of the matrix total that lies on the main diagonal.

    Parameters
    ----------
    confusion_matrix : array-like exposing ``.trace()`` and ``.sum()``
        The matrix to score, e.g. the array returned by
        ``sklearn.metrics.confusion_matrix``. Inside this function the
        parameter name shadows that imported function.

    Returns
    -------
    float
        ``trace / sum`` of the matrix.
    """
    correct = confusion_matrix.trace()
    total = confusion_matrix.sum()
    return correct / total

# Evaluate the ID3 predictions: confusion matrix, per-class report, overall accuracy
print(
    confusion_matrix(Y_test, y_predict),
    "----------",
    classification_report(Y_test, y_predict),
    "----------",
    sep="\n",
)
print("accuracy: ", accuracy(confusion_matrix(Y_test, y_predict)))


[[ 771 2472]
 [ 587 9668]]
----------
              precision    recall  f1-score   support

           0       0.57      0.24      0.34      3243
           1       0.80      0.94      0.86     10255

    accuracy                           0.77     13498
   macro avg       0.68      0.59      0.60     13498
weighted avg       0.74      0.77      0.74     13498

----------
accuracy:  0.7733738331604683


## Decision tree (CART)

In [43]:
### Splitting the data into X and Y (from the raw, un-binned frame)
X = data.drop('Loan Status', axis=1)
Y = data['Loan Status']
### One-hot encoding
X = pd.get_dummies(X, drop_first=True)
# get_dummies returns a DataFrame even for a single remaining column. Squeeze it
# to a 1-D Series so the estimators below receive the 1-D target scikit-learn
# expects, instead of a column vector whose DataConversionWarning is hidden by
# warnings.filterwarnings('ignore').
Y = pd.get_dummies(Y, drop_first=True).squeeze(axis="columns")
## Splitting the data into train and test sets (20% held out, stratified on Y)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=123, stratify=Y
)
### Balancing the data again (training split only)
tl = TomekLinks()
X_train, Y_train = tl.fit_resample(X_train, Y_train)
## Training the model
dtc = DecisionTreeClassifier(random_state=1234)
dtc.fit(X_train, Y_train)
y_predict = dtc.predict(X_test)

In [44]:
# Evaluate the CART predictions: confusion matrix, per-class report, overall accuracy
print(
    confusion_matrix(Y_test, y_predict),
    "----------",
    classification_report(Y_test, y_predict),
    "----------",
    sep="\n",
)
print("accuracy: ", dtc.score(X_test, Y_test))

[[1635 1608]
 [2094 8161]]
----------
              precision    recall  f1-score   support

           0       0.44      0.50      0.47      3243
           1       0.84      0.80      0.82     10255

    accuracy                           0.73     13498
   macro avg       0.64      0.65      0.64     13498
weighted avg       0.74      0.73      0.73     13498

----------
accuracy:  0.725737146243888


## Random Forest & XGBoost
### Random Forest

In [45]:
# Fit a 500-tree random forest on all available cores, then predict the held-out rows
rfc = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=123,
)
rfc.fit(X_train, Y_train)
y_predict = rfc.predict(X_test)

In [46]:
# Evaluate the random-forest predictions: confusion matrix, per-class report, mean accuracy
print(
    confusion_matrix(Y_test, y_predict),
    "----------",
    classification_report(Y_test, y_predict),
    "----------",
    sep="\n",
)
print("Score: ", rfc.score(X_test, Y_test))

[[ 1017  2226]
 [  154 10101]]
----------
              precision    recall  f1-score   support

           0       0.87      0.31      0.46      3243
           1       0.82      0.98      0.89     10255

    accuracy                           0.82     13498
   macro avg       0.84      0.65      0.68     13498
weighted avg       0.83      0.82      0.79     13498

----------
Score:  0.8236775818639799


### XGBoost

In [47]:
# Take the underlying NumPy arrays of the test split for XGBoost
# (only X_test and Y_test are converted in this cell)
Y_test_xg = Y_test.values
X_test_xg = X_test.values

In [48]:
# Give the training split XGBoost-specific names.
# These are aliases of X_train / Y_train, not copies.
Y_train_xg = Y_train
X_train_xg = X_train

In [49]:
# Fit XGBoost with its default hyper-parameters, then predict the held-out rows
xgb = XGBClassifier()
xgb.fit(X_train_xg, Y_train_xg)
y_predict = xgb.predict(X_test_xg)

In [50]:
# Evaluate the XGBoost predictions: confusion matrix, per-class report, mean accuracy
print(
    confusion_matrix(Y_test_xg, y_predict),
    "----------",
    classification_report(Y_test_xg, y_predict),
    "----------",
    sep="\n",
)
print("Score: ", xgb.score(X_test_xg, Y_test_xg))


[[  925  2318]
 [   29 10226]]
----------
              precision    recall  f1-score   support

           0       0.97      0.29      0.44      3243
           1       0.82      1.00      0.90     10255

    accuracy                           0.83     13498
   macro avg       0.89      0.64      0.67     13498
weighted avg       0.85      0.83      0.79     13498

----------
Score:  0.8261223885020003


## Cross Validation (K-Fold)

In [51]:
# 10-fold cross-validation on the (already resampled) training split:
# scores1 -> random forest (rfc), scores2 -> XGBoost (xgb)
scores1 = cross_val_score(rfc, X_train, Y_train, cv=10)
scores2 = cross_val_score(xgb, X_train_xg, Y_train_xg, cv=10)

In [52]:
# Round the per-fold scores for display only; the mean and standard deviation
# below are computed from the unrounded scores.
scores1_list = [round(elem, 3) for elem in list(scores1)]
scores2_list = [round(elem, 3) for elem in list(scores2)]
# scores1 was produced by cross-validating rfc (the random forest), so it is
# labelled as such here rather than as a decision tree.
print("Random Forest: ")
print("---------------")
print("scores: ", scores1_list)
print("Mean score: ", round(scores1.mean(), 3))
print("standard deviation: ", round(scores1.std(), 3))
print("")
print("XGBoost: ")
print("---------------")
print("scores: ", scores2_list)
print("Mean score: ", round(scores2.mean(), 3))
print("standard deviation: ", round(scores2.std(), 3))



Decision Tree: 
---------------
scores:  [0.811, 0.804, 0.811, 0.812, 0.802, 0.806, 0.798, 0.806, 0.811, 0.801]
Mean score:  0.806
standart deviation:  0.005

XGBoost: 
---------------
scores:  [0.813, 0.802, 0.809, 0.811, 0.8, 0.805, 0.799, 0.806, 0.81, 0.803]
Mean score:  0.806
standart deviation:  0.005
