In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
loan = pd.read_csv("loan_data.csv")
y = loan.pop("not.fully.paid")

In [3]:
loan.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0


In [4]:
loan.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec'],
      dtype='object')

In [5]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
dtypes: float64(6), int64(6), object(1)
memory usage: 972.9+ KB


In [6]:
loan.isna().sum()

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
dtype: int64

In [7]:
y.value_counts()

0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [8]:
loan["purpose"].value_counts()

debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: purpose, dtype: int64

In [9]:
loan["credit.policy"].value_counts()

1    7710
0    1868
Name: credit.policy, dtype: int64

In [10]:
encodedln = pd.get_dummies(loan[['credit.policy','purpose']])

In [11]:
X = pd.concat([encodedln,loan.drop(['credit.policy','purpose'],axis=1)],axis=1)

In [12]:
X

Unnamed: 0,credit.policy,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,1,0,0,1,0,0,0,0,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0
1,1,0,1,0,0,0,0,0,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0
2,1,0,0,1,0,0,0,0,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0
3,1,0,0,1,0,0,0,0,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0
4,1,0,1,0,0,0,0,0,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,1,0,0,0,0,0,0,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0
9574,0,1,0,0,0,0,0,0,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0
9575,0,0,0,1,0,0,0,0,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0
9576,0,0,0,0,0,1,0,0,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [14]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.8460334029227558

In [15]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

In [16]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7343423799582464


In [17]:
#MAX Depth Tuning

In [18]:
max_depth = []
for i in range(1, 30):
    print("Accuracy using Max Depth =", i, end = ': ')
    fit_predict(X_train, X_test, y_train, y_test, StandardScaler(), i)

#Print("Max Depth:",max(max_depth))

Accuracy using Max Depth = 1: 0.8470772442588727
Accuracy using Max Depth = 2: 0.8470772442588727
Accuracy using Max Depth = 3: 0.8470772442588727
Accuracy using Max Depth = 4: 0.8470772442588727
Accuracy using Max Depth = 5: 0.8455114822546973
Accuracy using Max Depth = 6: 0.8455114822546973
Accuracy using Max Depth = 7: 0.8413361169102297
Accuracy using Max Depth = 8: 0.8423799582463466
Accuracy using Max Depth = 9: 0.8355949895615866
Accuracy using Max Depth = 10: 0.837160751565762
Accuracy using Max Depth = 11: 0.8308977035490606
Accuracy using Max Depth = 12: 0.8413361169102297
Accuracy using Max Depth = 13: 0.819937369519833
Accuracy using Max Depth = 14: 0.8277661795407099
Accuracy using Max Depth = 15: 0.826722338204593
Accuracy using Max Depth = 16: 0.7980167014613778
Accuracy using Max Depth = 17: 0.8100208768267223
Accuracy using Max Depth = 18: 0.7860125260960334
Accuracy using Max Depth = 19: 0.8011482254697286
Accuracy using Max Depth = 20: 0.7985386221294363
Accuracy usi

Max depth here is 5 also 6 is same
Accuracy = 0.8455114822546973

In [None]:
#MAX features split

In [20]:
for i in np.arange(0.1, 1.0, 0.1):
    print("Accuracy using Max Feature =", i, end = ': ')
    fit_predict(X_train, X_test, y_train, y_test, StandardScaler(), max_depth = 5, max_features= i)

Accuracy using Max Feature = 0.1: 0.8455114822546973
Accuracy using Max Feature = 0.2: 0.8387265135699373
Accuracy using Max Feature = 0.30000000000000004: 0.8444676409185804
Accuracy using Max Feature = 0.4: 0.8449895615866388
Accuracy using Max Feature = 0.5: 0.8418580375782881
Accuracy using Max Feature = 0.6: 0.8449895615866388
Accuracy using Max Feature = 0.7000000000000001: 0.8423799582463466
Accuracy using Max Feature = 0.8: 0.8434237995824635
Accuracy using Max Feature = 0.9: 0.842901878914405


In [None]:
#Min sample split

In [21]:
for i in range(2, 10):
    print("Accuracy using Min Sample Split =", i, end = ': ')
    fit_predict(X_train, X_test, y_train, y_test, StandardScaler(), max_depth = 5, min_samples_split= i)

Accuracy using Min Sample Split = 2: 0.8455114822546973
Accuracy using Min Sample Split = 3: 0.8455114822546973
Accuracy using Min Sample Split = 4: 0.8455114822546973
Accuracy using Min Sample Split = 5: 0.8455114822546973
Accuracy using Min Sample Split = 6: 0.8455114822546973
Accuracy using Min Sample Split = 7: 0.8455114822546973
Accuracy using Min Sample Split = 8: 0.8455114822546973
Accuracy using Min Sample Split = 9: 0.8455114822546973


In [None]:
#Criterion tuning

In [22]:
# Using my original result
for i in ['gini', 'entropy']:
    print("Accuracy using Criterion =", i, end = ': ')
    fit_predict(X_train, X_test, y_train, y_test, StandardScaler(), max_depth = 5, 
                criterion = i)

Accuracy using Criterion = gini: 0.8434237995824635
Accuracy using Criterion = entropy: 0.8455114822546973


In [23]:
def create_poly(train, test, degree):
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly, test_poly

In [24]:
for degree in [1, 2, 3, 4]:
    train_poly, test_poly = create_poly(X_train, X_test, degree)
    print('Polynomial Degree:', degree)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), max_depth = 5)
    print(10*'-')
    
train_poly, test_poly = create_poly(X_train, X_test, 2)

Polynomial Degree: 1
0.8470772442588727
----------
Polynomial Degree: 2
0.8465553235908142
----------
Polynomial Degree: 3
0.8465553235908142
----------
Polynomial Degree: 4
0.8449895615866388
----------


In [25]:
train_poly, test_poly = create_poly(X_train, X_test, 2)

fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), max_depth = 5)

0.8465553235908142


In [None]:
#With Poly no improvement is there

In [None]:
#Random forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("RandomForest Accuracy:",accuracy_score(y_test, y_pred))

RandomForest Accuracy: 0.8475991649269311


In [29]:
rf.base_estimator

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
params = {'n_estimators': [200,500,700], 'max_depth': [4, 5, 8], 'min_samples_leaf': [2, 3, 5]}

In [32]:
gsv = GridSearchCV(rf, params, verbose = 3)

In [33]:
gsv.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] max_depth=4, min_samples_leaf=2, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=4, min_samples_leaf=2, n_estimators=200, score=0.838, total=   1.3s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  max_depth=4, min_samples_leaf=2, n_estimators=200, score=0.838, total=   1.3s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV]  max_depth=4, min_samples_leaf=2, n_estimators=200, score=0.838, total=   1.2s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=200 ...............
[CV]  max_depth=4, min_samples_leaf=2, n_estimators=200, score=0.838, total=   1.2s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=200 ...............
[CV]  max_depth=4, min_samples_leaf=2, n_estimators=200, score=0.838, total=   1.4s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=4, min_samples_leaf=2, n_estimators=500, score=0.838, total=   3.6s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=4, min_samples_leaf=2, n_estimators=500, score=0.838, total=   3.2s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=4, min_samples_leaf=2, n_estimators=500, score=0.838, total=   3.1s
[CV] max_depth=4, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=4, min_samples_leaf=2, n_estimators=500, score=0.838, 

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  7.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [35]:
gsv.best_params_

{'max_depth': 8, 'min_samples_leaf': 2, 'n_estimators': 200}

In [37]:
gsv.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
rf1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                            max_depth=8, max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=2, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
                            oob_score=True, random_state=101, verbose=0, warm_start=False)

In [39]:
rf1.fit(X_train,y_train)
y_pred = rf1.predict(X_test)
print("RandomForest Accuracy:",accuracy_score(y_test, y_pred))

RandomForest Accuracy: 0.8465553235908142


In [40]:
rf1.oob_score_

0.838162359697207

In [41]:
FI = rf1.feature_importances_

In [42]:
FI

array([0.07097821, 0.00861289, 0.0053644 , 0.00945318, 0.0041964 ,
       0.00555532, 0.00273178, 0.03272634, 0.11955092, 0.10158486,
       0.08780292, 0.07304338, 0.09101804, 0.08812812, 0.08800779,
       0.09732716, 0.09525968, 0.00993244, 0.00872619])

In [43]:
sorted(list(zip(FI, X_train.columns)), reverse=True)

[(0.11955091945517711, 'int.rate'),
 (0.10158485969390921, 'installment'),
 (0.09732715977400413, 'revol.util'),
 (0.09525967622364867, 'inq.last.6mths'),
 (0.09101803789563567, 'fico'),
 (0.08812811939787592, 'days.with.cr.line'),
 (0.0880077918675052, 'revol.bal'),
 (0.08780291839021943, 'log.annual.inc'),
 (0.07304337731104658, 'dti'),
 (0.07097821237190456, 'credit.policy'),
 (0.03272633636595737, 'purpose_small_business'),
 (0.009932435783639582, 'delinq.2yrs'),
 (0.00945317667390575, 'purpose_debt_consolidation'),
 (0.008726193110468421, 'pub.rec'),
 (0.008612887511500666, 'purpose_all_other'),
 (0.005555320985297026, 'purpose_home_improvement'),
 (0.0053643964544972085, 'purpose_credit_card'),
 (0.004196396634815846, 'purpose_educational'),
 (0.0027317840989916054, 'purpose_major_purchase')]

In [44]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_auc_score, roc_curve

In [None]:
#Confusion matrix & Classification Report

In [46]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1622    1]
 [ 293    0]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      1623
           1       0.00      0.00      0.00       293

    accuracy                           0.85      1916
   macro avg       0.42      0.50      0.46      1916
weighted avg       0.72      0.85      0.78      1916

