#### Building a machine learning model that can accurately predict whether a borrower will pay off their loan on time

In [5]:
import pandas as pd

# Read the raw loans export, then show its first record and how many
# columns the file contains.
loans_2007 = pd.read_csv('loans_2007.csv')
print(loans_2007.head(1))
print('number of columns:', len(loans_2007.columns))

        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   

  int_rate  installment grade sub_grade    ...    last_pymnt_amnt  \
0   10.65%       162.87     B        B2    ...             171.62   

  last_credit_pull_d collections_12_mths_ex_med  policy_code application_type  \
0           Jun-2016                        0.0          1.0       INDIVIDUAL   

  acc_now_delinq chargeoff_within_12_mths delinq_amnt pub_rec_bankruptcies  \
0            0.0                      0.0         0.0                  0.0   

  tax_liens  
0       0.0  

[1 rows x 52 columns]
number of columns: 52


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Remove eight columns from the frame and report the resulting
# (rows, columns) shape.
loans_2007 = loans_2007.drop(
    [
        'id',
        'member_id',
        'funded_amnt',
        'funded_amnt_inv',
        'grade',
        'sub_grade',
        'emp_title',
        'issue_d',
    ],
    axis='columns',
)
print(loans_2007.shape)

(42538, 44)


In [7]:
# Remove six more columns (zip code plus outstanding-principal and
# total-payment fields) and report the new shape.
loans_2007 = loans_2007.drop(
    [
        'zip_code',
        'out_prncp',
        'out_prncp_inv',
        'total_pymnt',
        'total_pymnt_inv',
        'total_rec_prncp',
    ],
    axis='columns',
)
print(loans_2007.shape)

(42538, 38)


In [8]:
# Remove the remaining received-interest, late-fee, recovery and
# last-payment columns, then report the new shape.
loans_2007 = loans_2007.drop(
    [
        'total_rec_int',
        'total_rec_late_fee',
        'recoveries',
        'collection_recovery_fee',
        'last_pymnt_d',
        'last_pymnt_amnt',
    ],
    axis='columns',
)
print(loans_2007.shape)

(42538, 32)


In [9]:
# Count how many rows carry each distinct loan_status value.
loan_status = loans_2007.loan_status.value_counts()
print(loan_status)

Fully Paid                                             33136
Charged Off                                             5634
Does not meet the credit policy. Status:Fully Paid      1988
Current                                                  961
Does not meet the credit policy. Status:Charged Off      761
Late (31-120 days)                                        24
In Grace Period                                           20
Late (16-30 days)                                          8
Default                                                    3
Name: loan_status, dtype: int64


In [10]:
# Numeric encoding for the two loan outcomes that are kept.
status_replace = {
    "loan_status": {"Fully Paid": 1, "Charged Off": 0},
}

# Keep only rows whose status is exactly one of the mapped outcomes,
# then swap the status text for its numeric code.
kept_rows = loans_2007['loan_status'].isin(list(status_replace["loan_status"]))
loans_2007 = loans_2007[kept_rows]
loans_2007 = loans_2007.replace(status_replace)

In [11]:
# Re-count loan_status after filtering and encoding; only the codes 1 and 0
# should remain.
loan_status = loans_2007.loan_status.value_counts()
print(loan_status)

1    33136
0     5634
Name: loan_status, dtype: int64


In [12]:
# Collect every column that has exactly one distinct non-null value
# (nunique ignores NaN by default) and remove them all in a single drop.
columns = loans_2007.columns
drop_columns = [col for col in columns if loans_2007[col].nunique() == 1]
loans_2007 = loans_2007.drop(drop_columns, axis='columns')
print(loans_2007.shape)

(38770, 23)


#### Preparing the features

In [13]:
import pandas as pd

# Load the filtered loans file and count the missing values in each column.
loans = pd.read_csv('filtered_loans_2007.csv')
null_counts = loans.isnull().sum(axis=0)
print(null_counts)

loan_amnt                  0
term                       0
int_rate                   0
installment                0
emp_length              1036
home_ownership             0
annual_inc                 0
verification_status        0
loan_status                0
purpose                    0
title                     11
addr_state                 0
dti                        0
delinq_2yrs                0
earliest_cr_line           0
inq_last_6mths             0
open_acc                   0
pub_rec                    0
revol_bal                  0
revol_util                50
total_acc                  0
last_credit_pull_d         2
pub_rec_bankruptcies     697
dtype: int64


Employment length is frequently used in assessing how risky a potential borrower is, so we keep this column despite its relatively large number of missing values.

In [14]:
# Share of rows per pub_rec_bankruptcies value, with NaN counted as its own
# category.
print(loans['pub_rec_bankruptcies'].value_counts(normalize=True, dropna=False))

 0.0    0.939438
 1.0    0.042456
NaN     0.017978
 2.0    0.000129
Name: pub_rec_bankruptcies, dtype: float64


The pub_rec_bankruptcies column offers very little variability: nearly 94% of its values fall in the same category, so it probably won't have much predictive value. We will drop it.

In [15]:
# Discard the pub_rec_bankruptcies column, then discard every row that still
# contains a missing value, and summarise the remaining column dtypes.
loans = (
    loans
    .drop('pub_rec_bankruptcies', axis='columns')
    .dropna(axis='index')
)
print(loans.dtypes.value_counts())

object     11
float64    10
int64       1
dtype: int64


In [16]:
# Pull out the object-dtype (text) columns and preview their first row.
object_columns_df = loans.select_dtypes(include=[object])
print(object_columns_df.head(1))

         term int_rate emp_length home_ownership verification_status  \
0   36 months   10.65%  10+ years           RENT            Verified   

       purpose     title addr_state earliest_cr_line revol_util  \
0  credit_card  Computer         AZ         Jan-1985      83.7%   

  last_credit_pull_d  
0           Jun-2016  


In [17]:
# Print the frequency table of each listed text column.
cols = [
    'home_ownership',
    'verification_status',
    'emp_length',
    'term',
    'addr_state',
]
for col in cols:
    counts = loans[col].value_counts()
    print(col, '\n', counts, '\n')

home_ownership 
 RENT        18112
MORTGAGE    16686
OWN          2778
OTHER          96
NONE            3
Name: home_ownership, dtype: int64 

verification_status 
 Not Verified       16281
Verified           11856
Source Verified     9538
Name: verification_status, dtype: int64 

emp_length 
 10+ years    8545
< 1 year     4513
2 years      4303
3 years      4022
4 years      3353
5 years      3202
1 year       3176
6 years      2177
7 years      1714
8 years      1442
9 years      1228
Name: emp_length, dtype: int64 

term 
  36 months    28234
 60 months     9441
Name: term, dtype: int64 

addr_state 
 CA    6776
NY    3614
FL    2704
TX    2613
NJ    1776
IL    1447
PA    1442
VA    1347
GA    1323
MA    1272
OH    1149
MD    1008
AZ     807
WA     788
CO     748
NC     729
CT     711
MI     678
MO     648
MN     581
NV     466
SC     454
WI     427
OR     422
AL     420
LA     420
KY     311
OK     285
UT     249
KS     249
AR     229
DC     209
RI     194
NM     180
WV     164
H

In [18]:
# Frequency tables for the 'title' and 'purpose' columns, printed one after
# the other for comparison.
title_counts = loans['title'].value_counts()
purpose_counts = loans['purpose'].value_counts()
print('title', '\n', title_counts, '\n')
print('purpose', '\n', purpose_counts)

title 
 Debt Consolidation                                                        2068
Debt Consolidation Loan                                                   1599
Personal Loan                                                              624
Consolidation                                                              488
debt consolidation                                                         466
Credit Card Consolidation                                                  345
Home Improvement                                                           336
Debt consolidation                                                         314
Small Business Loan                                                        298
Credit Card Loan                                                           294
Personal                                                                   290
Consolidation Loan                                                         250
Home Improvement Loan                       

In [19]:
# Lookup that converts each emp_length label to a whole number of years;
# "< 1 year" and "n/a" are both mapped to 0.
mapping_dict = {
    "emp_length": {
        "n/a": 0,
        "< 1 year": 0,
        "1 year": 1,
        "2 years": 2,
        "3 years": 3,
        "4 years": 4,
        "5 years": 5,
        "6 years": 6,
        "7 years": 7,
        "8 years": 8,
        "9 years": 9,
        "10+ years": 10,
    }
}

# Remove the two date columns, the state column and the title column.
loans = loans.drop(
    ["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"],
    axis="columns",
)

# Both percentage columns are stored as text such as "10.65%": strip the
# trailing percent sign and convert to float.
for pct_col in ("int_rate", "revol_util"):
    loans[pct_col] = loans[pct_col].str.rstrip("%").astype("float")

loans = loans.replace(mapping_dict)

In [20]:
# One-hot encode the four categorical columns: build the indicator columns,
# remove the original text columns, and append the indicators on the right.
cat_cols = ['home_ownership', 'verification_status', 'purpose', 'term']
dummy_df = pd.get_dummies(loans[cat_cols])
loans = pd.concat([loans.drop(cat_cols, axis=1), dummy_df], axis=1)

#### Making Predictions

In [21]:
import pandas as pd

# Load the cleaned, fully numeric loans table used for modelling.
loans = pd.read_csv('cleaned_loans_2007.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38708 entries, 0 to 38707
Data columns (total 38 columns):
loan_amnt                              38708 non-null float64
int_rate                               38708 non-null float64
installment                            38708 non-null float64
emp_length                             38708 non-null int64
annual_inc                             38708 non-null float64
loan_status                            38708 non-null int64
dti                                    38708 non-null float64
delinq_2yrs                            38708 non-null float64
inq_last_6mths                         38708 non-null float64
open_acc                               38708 non-null float64
pub_rec                                38708 non-null float64
revol_bal                              38708 non-null float64
revol_util                             38708 non-null float64
total_acc                              38708 non-null float64
home_ownership_MORTGAGE    

#### Picking an error metric
    False Positive Rate = False Positives / (False Positives + True Negatives)
    True Positive Rate = True Positives / (True Positives + False Negatives)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Baseline: default logistic regression, with out-of-fold predictions from
# 3-fold cross-validation.
lr = LogisticRegression()
predictions = pd.Series(cross_val_predict(lr, features, target, cv=3))

# The comparisons below pair predictions with loans["loan_status"] by index
# label (assumes loans carries the default 0..n-1 index - TODO confirm).
actual = loans["loan_status"]
said_paid = predictions == 1
said_charged_off = predictions == 0

# Confusion-matrix cells, counted as the number of True entries in each mask.
fp = int((said_paid & (actual == 0)).sum())         # false positives
tp = int((said_paid & (actual == 1)).sum())         # true positives
fn = int((said_charged_off & (actual == 1)).sum())  # false negatives
tn = int((said_charged_off & (actual == 0)).sum())  # true negatives

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)

0.9989121566494424
0.9967943009795192


The classifier doesn't account for the imbalance in the classes (there are more 1's than 0's in the data set). Two remedies are available: use oversampling and undersampling so that the classifier gets input with a balanced number of each class, or tell the classifier to penalize misclassifications of the less prevalent class more than those of the other class. The next cell uses the second approach through `class_weight`.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Logistic regression with class_weight='balanced', scored on out-of-fold
# predictions.
lr = LogisticRegression(class_weight='balanced')

# cv is pinned to 3 so this run uses the same folds as the baseline cell.
# When cv is omitted the fold count depends on the installed scikit-learn
# version (3 before 0.22, 5 from 0.22 on), which makes results incomparable.
predictions = pd.Series(cross_val_predict(lr, features, target, cv=3))

# The comparisons below pair predictions with loans["loan_status"] by index
# label (assumes loans carries the default 0..n-1 index - TODO confirm).
actual = loans["loan_status"]
said_paid = predictions == 1
said_charged_off = predictions == 0

# Confusion-matrix cells, counted as the number of True entries in each mask.
fp = int((said_paid & (actual == 0)).sum())         # false positives
tp = int((said_paid & (actual == 1)).sum())         # true positives
fn = int((said_charged_off & (actual == 1)).sum())  # false negatives
tn = int((said_charged_off & (actual == 0)).sum())  # true negatives

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)

0.6579034841205089
0.38290293855743546


The true positive rate dropped to about 66% and the false positive rate to about 38%, which helps to avoid bad loans. However, a true positive rate of 66% means that only about two thirds of the loans that would have been repaid get funded, so a good number of good loans are rejected. It's better to set a manual penalty. Setting class_weight to 'balanced' automatically sets a penalty based on the number of 1s and 0s in the column.

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Manual class weights: errors on class 0 are weighted ten times as heavily
# as errors on class 1.
penalty = {
    0: 10,
    1: 1
}
lr = LogisticRegression(class_weight=penalty)

# cv is pinned to 3 so this run uses the same folds as the baseline cell.
# When cv is omitted the fold count depends on the installed scikit-learn
# version (3 before 0.22, 5 from 0.22 on), which makes results incomparable.
predictions = pd.Series(cross_val_predict(lr, features, target, cv=3))

# The comparisons below pair predictions with loans["loan_status"] by index
# label (assumes loans carries the default 0..n-1 index - TODO confirm).
actual = loans["loan_status"]
said_paid = predictions == 1
said_charged_off = predictions == 0

# Confusion-matrix cells, counted as the number of True entries in each mask.
fp = int((said_paid & (actual == 0)).sum())         # false positives
tp = int((said_paid & (actual == 1)).sum())         # true positives
fn = int((said_charged_off & (actual == 1)).sum())  # false negatives
tn = int((said_charged_off & (actual == 0)).sum())  # true negatives

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)

0.24005076602302602
0.09029385574354408


Assigning manual penalties lowered the false positive rate, and thus lowered the risk. But it also causes missed opportunities to fund more loans.

#### Using Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# cross_val_predict is imported from sklearn.model_selection only: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20, and
# importing it raises ImportError on current versions.
from sklearn.model_selection import cross_val_predict

# Random forest with balanced class weights and a fixed seed, scored on
# out-of-fold predictions.
rf = RandomForestClassifier(random_state=1, class_weight='balanced')

# cv is pinned to 3 so this run uses the same folds as the logistic
# regression baseline; the default fold count depends on the installed
# scikit-learn version (3 before 0.22, 5 from 0.22 on).
predictions = pd.Series(cross_val_predict(rf, features, target, cv=3))

# The comparisons below pair predictions with loans["loan_status"] by index
# label (assumes loans carries the default 0..n-1 index - TODO confirm).
actual = loans["loan_status"]
said_paid = predictions == 1
said_charged_off = predictions == 0

# Confusion-matrix cells, counted as the number of True entries in each mask.
fp = int((said_paid & (actual == 0)).sum())         # false positives
tp = int((said_paid & (actual == 1)).sum())         # true positives
fn = int((said_charged_off & (actual == 1)).sum())  # false negatives
tn = int((said_charged_off & (actual == 0)).sum())  # true negatives

# Rates
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(tpr)
print(fpr)



0.9708699725017376
0.9271593944790739
