## Data preparation and pre-processing for Machine learning Modeling

In [1]:
#Import the required libaries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read each of the issued and declined loan data

#The loan issued in 2007 - 2011
#data path
issued2011loan = pd.read_csv(r"C:\Users\Semiu\Desktop\Lendingclub\LoanStats3a.csv", low_memory = False)

#The loan issued in 2012 - 2013
issued2013loan = pd.read_csv(r"C:\Users\Semiu\Desktop\Lendingclub\LoanStats3b.csv", low_memory = False)

#The loan issued in 2014
issued2014loan = pd.read_csv(r"C:\Users\Semiu\Desktop\Lendingclub\LoanStats3c.csv", low_memory = False)

#The loan declined in 2012
declined2012loan = pd.read_csv(r"C:\Users\Semiu\Desktop\Lendingclub\RejectStatsA.csv", low_memory = False)

#The loan declined in 2014
declined2014loan = pd.read_csv(r"C:\Users\Semiu\Desktop\Lendingclub\RejectStatsB.csv", low_memory = False)

In [5]:
li = []
li.append(issued2011loan['id'])

In [6]:
li

[0        1077501
 1        1077430
 2        1077175
 3        1076863
 4        1075358
           ...   
 42533      73582
 42534      72998
 42535      72176
 42536      71623
 42537      70686
 Name: id, Length: 42538, dtype: object]

In [8]:
max(int(issued2011loan['id']))

TypeError: cannot convert the series to <class 'int'>

In [4]:
#check the first five rows of the issued2011loan data
issued2011loan.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_il_tl,mo_sin_old_il_acct,num_actv_rev_tl,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,total_rev_hi_lim,num_rev_tl_bal_gt_0,num_op_rev_tl,tot_coll_amt,policy_code
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,,,,,,1.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,,,,,,1.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,,,,,,1.0
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,,,,,,1.0
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,,,,,,1.0


In [8]:
#96 features
issued2011loan.shape

(42538, 96)

In [6]:
declined2012loan.head()

Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,City,State,Employment Length,Policy Code
0,1000.0,5/26/2007,Wedding Covered but No Honeymoon,693.0,10%,Taylor,NM,4 years,0
1,1000.0,5/26/2007,Consolidating Debt,703.0,10%,Amherst,MA,< 1 year,0
2,11000.0,5/27/2007,Want to consolidate my debt,715.0,10%,Baltimore,MD,1 year,0
3,6000.0,5/27/2007,waksman,698.0,38.64%,Concord,MA,< 1 year,0
4,1500.0,5/27/2007,mdrigo,509.0,9.43%,Silver SPRING,MD,< 1 year,0


In [7]:
declined2012loan.shape

(756563, 9)

In [9]:
#Join all the issued data
issuedloandata = pd.concat([issued2011loan,issued2013loan,issued2014loan])

In [10]:
#Join all the declined data
declinedloandata = pd.concat([declined2012loan,declined2014loan])

In [12]:
issuedloandata.shape

(379060, 96)

In [13]:
#check for null values

declinedloandata.isnull().sum()

Amount Requested            0
Application Date            0
Loan Title                 14
Risk_Score              62617
Debt-To-Income Ratio        0
City                       22
State                      20
Employment Length       24143
Policy Code                 0
dtype: int64

In [14]:
mergloandata = pd.concat([issuedloandata,declinedloandata])

In [15]:
mergloandata.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,policy_code,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,City,State,Employment Length,Policy Code
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,1.0,,,,,,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,1.0,,,,,,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,1.0,,,,,,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,1.0,,,,,,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,1.0,,,,,,,,,



Merging the declinedloandata and the issuedloandata

Features in the declinedloandata are compared to those in issuedloandata, and it's realized that these pairs mean the same thing:

Amount Requested (in declined loan) and loan_amnt (in issued loan),

Loan Title(in declined loan) and purpose (in issued loan),

Policy Code(in declined loan) and policy_code (in issued loan),

fico_range_high (in issued loan) and Risk_Score (in declined loan),

dti (in issued loan) and Debt-To-Income Ratio (in declined loan), and

emp_length (in issued loan) and Employment Length (in declined loan)


In [16]:
#For these features to have the same column names in both data, a rename is done on the declined data.
declinedloandata['loan_amnt'] = declinedloandata['Amount Requested']
declinedloandata['purpose'] = declinedloandata['Loan Title']
declinedloandata['policy_code'] = declinedloandata['Policy Code']
declinedloandata['fico_range_high'] = declinedloandata['Risk_Score']
declinedloandata['dti'] = declinedloandata['Debt-To-Income Ratio']
declinedloandata['emp_length'] = declinedloandata['Employment Length']

In [17]:
#The six columns are the columns to be retained for the ML modeling. Therefore, other columns will be dropped in the declinedloandata.

useless_cols_dec = [cols for cols in declinedloandata.columns if cols not in ('loan_amnt', 'purpose', 'policy_code', 'fico_range_high', 'dti', 'emp_length')]
for useless_col_d in useless_cols_dec:
    declinedloandata.drop(useless_col_d, axis =1,inplace=True)

In [18]:
#Other columns will be dropped the issuedloandata.
useless_cols_iss = [cols for cols in issuedloandata.columns if cols not in ('loan_amnt', 'purpose', 'policy_code', 'fico_range_high', 'dti', 'emp_length')]
for useless_col_i in useless_cols_iss:
    issuedloandata.drop(useless_col_i, axis =1, inplace=True)

In [20]:
#Create the class column and populate all rows with a value
#I prefer the name is_safe, so I decided to drop the isSafe I earlier did

declinedloandata['is_safe'] = 0
issuedloandata['is_safe'] = 1

In [21]:
#Join the declined and issued loan data to form a single dataframe
loandata = pd.concat([declinedloandata,issuedloandata])

In [22]:
loandata

Unnamed: 0,loan_amnt,purpose,policy_code,fico_range_high,dti,emp_length,is_safe
0,1000.0,Wedding Covered but No Honeymoon,0.0,693.0,10%,4 years,0
1,1000.0,Consolidating Debt,0.0,703.0,10%,< 1 year,0
2,11000.0,Want to consolidate my debt,0.0,715.0,10%,1 year,0
3,6000.0,waksman,0.0,698.0,38.64%,< 1 year,0
4,1500.0,mdrigo,0.0,509.0,9.43%,< 1 year,0
...,...,...,...,...,...,...,...
138730,525.0,,2.0,,,,1
138731,2500.0,,2.0,,,,1
138732,2160.0,,2.0,,,,1
138733,16550.0,,2.0,,,,1


In [25]:
#Save the loan data for data cleansing and EDA stages
loandata.to_csv(r"C:\Users\Semiu\Desktop\Lendingclub\LCloandata.csv", encoding='utf-8', index=None)

In [26]:
loandata.shape

(2184197, 7)