In [13]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [14]:
# Read the credit dataset from the CSV file one directory up, then preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer='../3_credit_qmvd_cmvd.csv')
data.head(n=5)

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies
0,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,6.0,1.0,228190.0,416746.0,>=1
1,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,18.0,1.0,297996.0,750090.0,0.0
2,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,9.0,0.0,256329.0,386958.0,0.0
3,Charged Off,206602.0,Short Term,7290.0,896857.0,10+ years,Home Mortgage,Debt Consolidation,16367.74,17.3,6.0,0.0,215308.0,272448.0,0.0
4,Fully Paid,217646.0,Short Term,730.0,1184194.0,< 1 year,Home Mortgage,Debt Consolidation,10855.08,19.6,13.0,1.0,122170.0,272052.0,>=1


### Imbalanced dataset

In [15]:
# Count how many loans fall into each 'Loan Status' class.
data.loc[:, 'Loan Status'].value_counts()

Fully Paid     51276
Charged Off    16214
Name: Loan Status, dtype: int64

### Splitting the data into X and Y

In [16]:
# Build the feature matrix X and the target Y.
# Besides the target, 'Unnamed: 0' is removed from the features: in the
# preview of `data` it simply repeats the row number (0, 1, 2, ...), so it
# looks like an index column saved with the CSV rather than a real
# predictor -- TODO confirm.
# errors='ignore' keeps this cell working if the file is later re-exported
# without that column.
X = data.drop(columns='Loan Status').drop(columns='Unnamed: 0', errors='ignore')
Y = data['Loan Status']

### One hot encoding

In [17]:
# One-hot encode the categorical columns of X and the target Y. With
# drop_first=True the first level of each encoded column is dropped, so the
# two-class target becomes a single indicator column.
X, Y = (pd.get_dummies(frame, drop_first=True) for frame in (X, Y))


### Splitting the data into train and test sets

In [18]:
# Hold out 20% of the rows as a test set. Stratifying on Y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=123,
    stratify=Y,
)

In [19]:
# Class counts in the held-out test target.
Y_test.loc[:, 'Fully Paid'].value_counts()


1    10255
0     3243
Name: Fully Paid, dtype: int64

## Random Forest & XGBoost
### Random Forest

In [20]:
## training the model
# Random forest of 500 trees, using all available cores, with a fixed seed.
rfc = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=123)
# Y_train is a one-column DataFrame. scikit-learn expects a 1-D label vector
# and otherwise emits a DataConversionWarning (hidden in this notebook by the
# global warnings filter), so flatten it explicitly.
rfc.fit(X_train, Y_train.values.ravel())
y_predict = rfc.predict(X_test)

In [21]:
## evaluating the model
# All three metrics are computed from the predictions already stored in
# y_predict. accuracy_score is used instead of rfc.score(X_test, Y_test),
# which would run the whole forest over X_test a second time only to
# produce the same accuracy.
print(confusion_matrix(Y_test, y_predict))
print("----------")
print(classification_report(Y_test, y_predict))
print("----------")
print("accuracy: ", accuracy_score(Y_test, y_predict))

[[  920  2323]
 [   50 10205]]
----------
              precision    recall  f1-score   support

           0       0.95      0.28      0.44      3243
           1       0.81      1.00      0.90     10255

    accuracy                           0.82     13498
   macro avg       0.88      0.64      0.67     13498
weighted avg       0.85      0.82      0.79     13498

----------
accuracy:  0.8241961772114388


### XGBoost

In [22]:
## converting X_train, X_test, Y_train and Y_test to numpy arrays for XGBoost
# NOTE(review): presumably needed because some one-hot column names contain
# characters such as '<' that XGBoost rejects as feature names -- confirm.
X_train_xg = X_train.values
X_test_xg = X_test.values
# The targets are one-column DataFrames; ravel() turns them into the 1-D
# label vectors that the estimator and the metric functions expect.
Y_train_xg = Y_train.values.ravel()
Y_test_xg = Y_test.values.ravel()

In [23]:
## training the model
# XGBoost classifier with default hyper-parameters. The seed matches the one
# used for the train/test split and the random forest so that every
# estimator in the notebook is explicitly seeded.
xgb = XGBClassifier(random_state=123)
xgb.fit(X_train_xg, Y_train_xg)
# Note: this overwrites the random forest predictions held in y_predict.
y_predict = xgb.predict(X_test_xg)

In [24]:
## evaluating the model
# All three metrics are computed from the predictions already stored in
# y_predict. accuracy_score is used instead of xgb.score(X_test_xg, Y_test_xg),
# which would predict on the test set a second time only to produce the
# same accuracy.
print(confusion_matrix(Y_test_xg, y_predict))
print("----------")
print(classification_report(Y_test_xg, y_predict))
print("----------")
print("accuracy: ", accuracy_score(Y_test_xg, y_predict))

[[  878  2365]
 [    3 10252]]
----------
              precision    recall  f1-score   support

           0       1.00      0.27      0.43      3243
           1       0.81      1.00      0.90     10255

    accuracy                           0.82     13498
   macro avg       0.90      0.64      0.66     13498
weighted avg       0.86      0.82      0.78     13498

----------
accuracy:  0.8245666024596237


## Cross Validation (K-Fold)


In [25]:
# 10-fold cross-validation on the training set: scores1 for the random
# forest (rfc), scores2 for XGBoost (xgb).
# The labels are flattened to 1-D so that every fold's fit receives a label
# vector rather than a one-column table; for the random forest this avoids a
# DataConversionWarning on each of the ten fits.
scores1 = cross_val_score(estimator=rfc, X=X_train, y=Y_train.values.ravel(), cv=10)
scores2 = cross_val_score(estimator=xgb, X=X_train_xg, y=Y_train_xg.ravel(), cv=10)

In [32]:
# Round the per-fold scores to three decimals for display.
scores1_list = [round(elem, 3) for elem in list(scores1)]
scores2_list = [round(elem, 3) for elem in list(scores2)]
# scores1 holds the cross-validation results of rfc, a RandomForestClassifier,
# so this section is labelled "Random Forest" (not "Decision Tree").
print("Random Forest: ")
print("---------------")
print("scores: ", scores1_list)
print("Mean score: ", round(scores1.mean(), 3))
print("standart deviation: ", round(scores1.std(), 3))
print("")
# scores2 holds the cross-validation results of xgb, the XGBClassifier.
print("XGBoost: ")
print("---------------")
print("scores: ", scores2_list)
print("Mean score: ", round(scores2.mean(), 3))
print("standart deviation: ", round(scores2.std(), 3))



Decision Tree: 
---------------
scores:  [0.828, 0.819, 0.828, 0.825, 0.819, 0.823, 0.816, 0.824, 0.827, 0.818]
Mean score:  0.823
standart deviation:  0.004

XGBoost: 
---------------
scores:  [0.829, 0.819, 0.829, 0.829, 0.818, 0.822, 0.817, 0.823, 0.826, 0.821]
Mean score:  0.823
standart deviation:  0.004
