In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
df = pd.read_csv("loan_data.csv")

In [3]:
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [5]:
x = df.copy()

In [6]:
x['purpose']=x['purpose'].astype('category')

In [7]:
encodedDF = pd.get_dummies(x[['purpose']])

In [8]:
x = pd.concat([encodedDF,x.drop(['purpose'],axis=1)],axis=1)

In [9]:
y = x.pop("not.fully.paid")

In [10]:
x.head()

Unnamed: 0,purpose_all_other,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,0,0,1,0,0,0,0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0
1,0,1,0,0,0,0,0,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0
2,0,0,1,0,0,0,0,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0
3,0,0,1,0,0,0,0,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0
4,0,1,0,0,0,0,0,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0


In [11]:
y.value_counts()

0    8045
1    1533
Name: not.fully.paid, dtype: int64

### Train and Test Fit

In [12]:
train, test, y_train, y_test = train_test_split(
     x, y, test_size=0.2, random_state=101)

### Fit and Predict the Model

In [13]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)

### Evaluting the model

In [14]:
print('Accuracy:',accuracy_score(y_test,y_pred))
print('Confusion Matrix:',confusion_matrix(y_test,y_pred))
print('Classification Report:',classification_report(y_test,y_pred))
print('ROC_AUC_Score:',roc_auc_score(y_test,y_pred))

Accuracy: 0.8460334029227558
Confusion Matrix: [[1617    6]
 [ 289    4]]
Classification Report:               precision    recall  f1-score   support

           0       0.85      1.00      0.92      1623
           1       0.40      0.01      0.03       293

    accuracy                           0.85      1916
   macro avg       0.62      0.50      0.47      1916
weighted avg       0.78      0.85      0.78      1916

ROC_AUC_Score: 0.504977509731063


In [15]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

In [16]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.7286012526096033


### Max depth tuning

In [17]:
for i in range(1, 30):
    print("Accuracy score using max_depth = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth =  1: 0.8470772442588727
Accuracy score using max_depth =  2: 0.8470772442588727
Accuracy score using max_depth =  3: 0.8470772442588727
Accuracy score using max_depth =  4: 0.8470772442588727
Accuracy score using max_depth =  5: 0.8470772442588727
Accuracy score using max_depth =  6: 0.8455114822546973
Accuracy score using max_depth =  7: 0.8465553235908142
Accuracy score using max_depth =  8: 0.8434237995824635
Accuracy score using max_depth =  9: 0.8402922755741128
Accuracy score using max_depth =  10: 0.837160751565762
Accuracy score using max_depth =  11: 0.8355949895615866
Accuracy score using max_depth =  12: 0.8319415448851775
Accuracy score using max_depth =  13: 0.826722338204593
Accuracy score using max_depth =  14: 0.8308977035490606
Accuracy score using max_depth =  15: 0.81419624217119
Accuracy score using max_depth =  16: 0.7875782881002088
Accuracy score using max_depth =  17: 0.8021920668058455
Accuracy score using max_depth =  18: 0.8016

### Max feature tuning

In [18]:
for i in np.arange(0.1,1.0,0.1):
    print("Accuracy score using max_features = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth=1, max_features=i)

Accuracy score using max_features =  0.1: 0.8470772442588727
Accuracy score using max_features =  0.2: 0.8470772442588727
Accuracy score using max_features =  0.30000000000000004: 0.8470772442588727
Accuracy score using max_features =  0.4: 0.8470772442588727
Accuracy score using max_features =  0.5: 0.8470772442588727
Accuracy score using max_features =  0.6: 0.8470772442588727
Accuracy score using max_features =  0.7000000000000001: 0.8470772442588727
Accuracy score using max_features =  0.8: 0.8470772442588727
Accuracy score using max_features =  0.9: 0.8470772442588727


### Min Samples split tunning

In [19]:
for i in range(2,10):
    print("Accuracy score using max_features = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth=1, max_features=1, min_samples_split=i)

Accuracy score using max_features =  2: 0.8470772442588727
Accuracy score using max_features =  3: 0.8470772442588727
Accuracy score using max_features =  4: 0.8470772442588727
Accuracy score using max_features =  5: 0.8470772442588727
Accuracy score using max_features =  6: 0.8470772442588727
Accuracy score using max_features =  7: 0.8470772442588727
Accuracy score using max_features =  8: 0.8470772442588727
Accuracy score using max_features =  9: 0.8470772442588727


### Criterion Tunning

In [20]:
for i in ['gini', 'entropy']:
    print("Accuracy score using criterion = ", i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 1, max_features=1, min_samples_split=2, criterion=i) 

Accuracy score using criterion =  gini: 0.8470772442588727
Accuracy score using criterion =  entropy: 0.8470772442588727


In [21]:
def create_poly(train, test, degree):
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly, test_poly

In [22]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print("Polynominal degree", degree)
    fit_predict(train, test, y_train, y_test, StandardScaler(), 25, max_features=0.3, min_samples_split=2, criterion='entropy')
    print(10*'-')

Polynominal degree 1
0.7620041753653445
----------
Polynominal degree 2
0.7620041753653445
----------
Polynominal degree 3
0.7620041753653445
----------
Polynominal degree 4
0.7620041753653445
----------


### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rf = RandomForestClassifier()

In [25]:
rf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
pred_rf = rf.predict(test)

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
params = {'n_estimators': [200,500,700], 'max_depth': [10,15,18,20], 'min_samples_leaf': [3,5,7]}

In [29]:
gs = GridSearchCV(rf, params, verbose=3)

In [30]:
gs.fit(train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.838, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.838, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.839, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.837, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.839, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.838, total=   4.2s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.838, total=   4.1s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.839, total=   4.4s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=

[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.840, total=   7.0s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.837, total=   7.1s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.838, total=   7.3s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.838, total=   7.1s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.836, total=   6.7s
[CV] max_depth=15, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=15, min_samples_leaf=5, n_estimators=200, score=0.839, total=   1.9s
[CV] max_depth=15, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=15, min_samples_leaf=5, n_estimators=200, score=

[CV]  max_depth=18, min_samples_leaf=5, n_estimators=200, score=0.839, total=   2.1s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=200, score=0.837, total=   2.0s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.838, total=   5.3s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.838, total=   6.7s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.839, total=  11.1s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.838, total=  18.0s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=

[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.838, total=   7.4s
[CV] max_depth=20, min_samples_leaf=5, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.839, total=   7.3s
[CV] max_depth=20, min_samples_leaf=5, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.837, total=   7.4s
[CV] max_depth=20, min_samples_leaf=5, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.837, total=   7.1s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=0.839, total=   2.0s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=0.838, total=   2.1s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 22.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [31]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=18, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
gs.best_params_

{'max_depth': 18, 'min_samples_leaf': 5, 'n_estimators': 200}

In [33]:
rf1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=18, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
rf1.fit(train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=18, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
pred_rf1 = rf1.predict(test)

In [36]:
print(accuracy_score(y_test, pred_rf1))

0.8455114822546973


In [37]:
rf1.oob_score_

0.8377708170190551

In [38]:
rf1.feature_importances_

array([0.01094488, 0.00602383, 0.01291699, 0.00348789, 0.00536287,
       0.00208528, 0.01425853, 0.02508659, 0.10585947, 0.11325063,
       0.10849161, 0.10361217, 0.08055788, 0.10901699, 0.10936263,
       0.11109133, 0.06215936, 0.00975128, 0.00667982])

In [39]:
sorted(list(zip(rf1.feature_importances_, train.columns)), reverse=True)

[(0.11325063376454285, 'installment'),
 (0.11109132871748051, 'revol.util'),
 (0.10936262528363982, 'revol.bal'),
 (0.10901699216447819, 'days.with.cr.line'),
 (0.108491607406004, 'log.annual.inc'),
 (0.10585946616092703, 'int.rate'),
 (0.10361216563055291, 'dti'),
 (0.08055787528181609, 'fico'),
 (0.06215935657014243, 'inq.last.6mths'),
 (0.02508659480408237, 'credit.policy'),
 (0.014258526585200477, 'purpose_small_business'),
 (0.012916985811859339, 'purpose_debt_consolidation'),
 (0.01094487777037561, 'purpose_all_other'),
 (0.009751282666393857, 'delinq.2yrs'),
 (0.006679818282049111, 'pub.rec'),
 (0.006023828541726235, 'purpose_credit_card'),
 (0.005362866933447172, 'purpose_home_improvement'),
 (0.0034878902416897455, 'purpose_educational'),
 (0.0020852773835920166, 'purpose_major_purchase')]

### Conclusion


##### Modal predicts overall 84% of accuracy. In the confusion matrix, 305 predications are risky to give the loan
