In [136]:
import warnings
warnings.filterwarnings('ignore')
from id3 import Id3Estimator
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


### Discrétisation des variables numériques (data binning)


In [137]:
# Load the credit dataset (path is relative to the notebook's working directory) and preview it.
csv_path = '../3_credit_qmvd_cmvd.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies
0,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,6.0,1.0,228190.0,416746.0,>=1
1,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,18.0,1.0,297996.0,750090.0,0.0
2,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,9.0,0.0,256329.0,386958.0,0.0
3,Charged Off,206602.0,Short Term,7290.0,896857.0,10+ years,Home Mortgage,Debt Consolidation,16367.74,17.3,6.0,0.0,215308.0,272448.0,0.0
4,Fully Paid,217646.0,Short Term,730.0,1184194.0,< 1 year,Home Mortgage,Debt Consolidation,10855.08,19.6,13.0,1.0,122170.0,272052.0,>=1


In [138]:
# Inspect the dtype of every column to see which ones are numeric and which are stored as objects.
data.dtypes

Loan Status                   object
Current Loan Amount          float64
Term                          object
Credit Score                 float64
Annual Income                float64
Years in current job          object
Home Ownership                object
Purpose                       object
Monthly Debt                 float64
Years of Credit History      float64
Number of Open Accounts      float64
Number of Credit Problems     object
Current Credit Balance       float64
Maximum Open Credit          float64
Bankruptcies                  object
dtype: object

In [139]:
## Work on an independent copy so the frame loaded into `data` stays untouched
data1 = data.copy(deep=True)

## Data Binning

In [140]:
# Current Loan Amount
# Quantile edges and bucket labels reused by the other binning cells:
# the four buckets cover the 0-20%, 20-60%, 60-80% and 80-100% quantile ranges.
group_names = ['1', '2', '3', '4']
bins = [0, 0.2, 0.6, 0.8, 1]
# Bin from the untouched `data` frame instead of from `data1` itself, so the cell
# can be re-run without feeding already-binned categories back into qcut.
data1['Current Loan Amount'] = pd.qcut(data['Current Loan Amount'], bins, labels=group_names)
data1['Current Loan Amount'].value_counts()

2    26996
1    13499
4    13498
3    13497
Name: Current Loan Amount, dtype: int64

In [141]:
# Credit Score
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Credit Score'] = pd.qcut(data['Credit Score'], bins, labels=group_names)
data1['Credit Score'].value_counts()

2    27121
1    13617
4    13444
3    13308
Name: Credit Score, dtype: int64

In [142]:
# Annual Income
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Annual Income'] = pd.qcut(data['Annual Income'], bins, labels=group_names)
data1['Annual Income'].value_counts()

2    26996
3    13499
1    13498
4    13497
Name: Annual Income, dtype: int64

In [143]:
# Monthly Debt
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Monthly Debt'] = pd.qcut(data['Monthly Debt'], bins, labels=group_names)
data1['Monthly Debt'].value_counts()

2    26996
4    13498
3    13498
1    13498
Name: Monthly Debt, dtype: int64

In [144]:
# Years of Credit History
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Years of Credit History'] = pd.qcut(data['Years of Credit History'], bins, labels=group_names)
data1['Years of Credit History'].value_counts()

2    26988
1    13593
3    13527
4    13382
Name: Years of Credit History, dtype: int64

In [145]:
# Number of Open Accounts
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Number of Open Accounts'] = pd.qcut(data['Number of Open Accounts'], bins, labels=group_names)
data1['Number of Open Accounts'].value_counts()

2    29238
1    16051
4    11541
3    10660
Name: Number of Open Accounts, dtype: int64

In [146]:
# Current Credit Balance
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Current Credit Balance'] = pd.qcut(data['Current Credit Balance'], bins, labels=group_names)
data1['Current Credit Balance'].value_counts()

2    26998
3    13498
1    13498
4    13496
Name: Current Credit Balance, dtype: int64

In [147]:
# Maximum Open Credit
# Bin from the untouched `data` frame so re-running this cell does not try to
# re-bin a column that has already been replaced by its quantile labels.
data1['Maximum Open Credit'] = pd.qcut(data['Maximum Open Credit'], bins, labels=group_names)
data1['Maximum Open Credit'].value_counts()

2    26994
1    13500
4    13498
3    13498
Name: Maximum Open Credit, dtype: int64

### Label encoding for categorical variables

In [149]:
def labelencoding(df, var_list):
    """Return a copy of ``df`` with the listed columns replaced by integer category codes.

    Each column in ``var_list`` is cast to pandas' ``category`` dtype and then
    replaced by its ``cat.codes``. Missing values are encoded as ``-1``.

    The input frame is left unmodified; the encoded data is returned as a new
    DataFrame, so callers must use the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    var_list : iterable of str
        Names of the columns to encode. An empty iterable yields a plain copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the requested columns label-encoded.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    encoded = df.copy()
    for var in var_list:
        encoded[var] = encoded[var].astype('category').cat.codes
    return encoded

# Columns still stored as objects (target included) that get replaced by integer codes.
cols = [
    "Loan Status",
    "Term",
    "Years in current job",
    "Home Ownership",
    "Purpose",
    "Number of Credit Problems",
    "Bankruptcies",
]
data1 = labelencoding(df=data1, var_list=cols)

## Decision tree (ID3)


### Splitting the data into X and Y

In [150]:
# Target is the encoded 'Loan Status' column; every other column is a feature.
Y = data1['Loan Status']
X = data1.drop(columns='Loan Status')

### Splitting the data into train and test sets

In [151]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=123,
    stratify=Y,
)

In [152]:
# Fit the ID3 estimator on the training split, then predict labels for the test split.
ID3 = Id3Estimator()
ID3.fit(X_train, Y_train)
y_predict= ID3.predict(X_test)


In [153]:
## accuracy function
def accuracy(confusion_matrix):
    """Return the share of correct predictions encoded in a confusion matrix.

    The result is the sum of the diagonal (``trace``) divided by the sum of all
    cells. ``confusion_matrix`` must expose ``trace()`` and ``sum()``, as a
    NumPy array does.

    Note: inside this function the parameter name hides the
    ``confusion_matrix`` function imported from ``sklearn.metrics``.
    """
    correct = confusion_matrix.trace()
    total = confusion_matrix.sum()
    return correct / total

## Evaluate the ID3 model on the test split
separator = "----------"
cm = confusion_matrix(Y_test, y_predict)
print(cm)
print(separator)
print(classification_report(Y_test, y_predict))
print(separator)
print("accuracy: ", accuracy(cm))


[[ 718 2525]
 [ 497 9758]]
----------
              precision    recall  f1-score   support

           0       0.59      0.22      0.32      3243
           1       0.79      0.95      0.87     10255

    accuracy                           0.78     13498
   macro avg       0.69      0.59      0.59     13498
weighted avg       0.75      0.78      0.74     13498

----------
accuracy:  0.7761149799970366


## Decision tree (CART)

In [154]:
### Separate the features (X) from the target (Y), starting from the un-binned `data`
Y = data['Loan Status']
X = data.drop(columns='Loan Status')
### One-hot encoding (drop_first=True drops the first dummy level of each encoded variable)
X = pd.get_dummies(X, drop_first=True)
Y = pd.get_dummies(Y, drop_first=True)
## Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=123,
    stratify=Y,
)
## Train the model, then predict on the test split
dtc = DecisionTreeClassifier(random_state=1234)
dtc.fit(X_train, Y_train)
y_predict = dtc.predict(X_test)

In [155]:
## Evaluate the CART model on the test split
separator = "----------"
cart_cm = confusion_matrix(Y_test, y_predict)
cart_report = classification_report(Y_test, y_predict)
print(cart_cm)
print(separator)
print(cart_report)
print(separator)
print("accuracy: ", dtc.score(X_test, Y_test))

[[1566 1677]
 [1809 8446]]
----------
              precision    recall  f1-score   support

           0       0.46      0.48      0.47      3243
           1       0.83      0.82      0.83     10255

    accuracy                           0.74     13498
   macro avg       0.65      0.65      0.65     13498
weighted avg       0.75      0.74      0.74     13498

----------
accuracy:  0.7417395169654764
