In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
#load data
df_train = pd.read_csv('data/loan_train.csv')
df_test = pd.read_csv('data/loan_test.csv')

In [3]:
df_train

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,573354,737474,10000,10000,9950.00000,36 months,7.51%,311.11,A,A4,...,,,,,0.0,0.0,,,,
1,476321,603324,15000,15000,14800.00000,36 months,8.94%,476.58,A,A5,...,,,,,0.0,0.0,,,,
2,451484,556265,2000,2000,2000.00000,36 months,13.57%,67.94,C,C3,...,,,,,0.0,0.0,,,,
3,1018129,1246557,35000,35000,33951.84413,60 months,20.89%,944.71,F,F1,...,,,,,2.0,0.0,,,,
4,800018,1005270,14000,14000,14000.00000,60 months,17.49%,351.64,D,D5,...,,,,,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,450579,554340,5500,5500,5500.00000,36 months,14.96%,190.55,D,D2,...,,,,,0.0,0.0,,,,
24995,788033,991661,11450,11450,11450.00000,36 months,8.49%,361.40,A,A5,...,,,,,0.0,0.0,,,,
24996,568459,731299,24000,24000,21100.31424,60 months,20.53%,642.96,G,G2,...,,,,,0.0,0.0,,,,
24997,397594,428786,14000,14000,11411.12089,36 months,17.58%,503.19,F,F2,...,,,,,1.0,0.0,,,,


In [5]:
# information about different column
print(df_train.info())
df_train.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24999 entries, 0 to 24998
Columns: 111 entries, id to total_il_high_credit_limit
dtypes: float64(74), int64(13), object(24)
memory usage: 21.2+ MB
None


id                              int64
member_id                       int64
loan_amnt                       int64
funded_amnt                     int64
funded_amnt_inv               float64
                               ...   
tax_liens                     float64
tot_hi_cred_lim               float64
total_bal_ex_mort             float64
total_bc_limit                float64
total_il_high_credit_limit    float64
Length: 111, dtype: object

In [6]:
# total number of missing values in the DataFrame
print(df_train.isnull().values.sum())
# column-wise distribution of null values
print(df_train.isnull().sum())

1422620
id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
                              ...  
tax_liens                        17
tot_hi_cred_lim               24999
total_bal_ex_mort             24999
total_bc_limit                24999
total_il_high_credit_limit    24999
Length: 111, dtype: int64


In [7]:
# Replace % sign
df_train['int_rate'] = df_train['int_rate'].str.replace('%', '')
df_test['int_rate'] = df_test['int_rate'].str.replace('%', '')

df_train['revol_util'] = df_train['revol_util'].str.replace('%', '')
df_test['revol_util'] = df_test['int_rate'].str.replace('%', '')

df_train['int_rate'] = df_train['int_rate'].astype(np.float64)
df_test['int_rate'] = df_test['int_rate'].astype(np.float64)

df_train['revol_util'] = df_train['revol_util'].astype(np.float64)
df_test['revol_util'] = df_test['revol_util'].astype(np.float64)

In [8]:
### Remove Current value rows from loan_status
df_train = df_train[df_train['loan_status'] != 'Current']
df_test = df_test[df_test['loan_status'] != 'Current']
### Drop useless columns - Date columns, titles (redundant in purpose column) , zipcode (encrypted) , policy_code (single unique value)
df_train.drop(['last_pymnt_d','last_credit_pull_d','earliest_cr_line', 'url','emp_title','title','zip_code','policy_code', 'emp_length'] , axis = 1, inplace = True)
df_test.drop(['last_pymnt_d','last_credit_pull_d','earliest_cr_line', 'url','emp_title','title','zip_code','policy_code', 'emp_length'] , axis = 1, inplace = True)
### Drop columns with more than 90% null data
drop_cols = df_train.columns[df_train.isnull().mean() > 0.9]
df_train.drop(drop_cols, axis = 1, inplace = True)
df_test.drop(drop_cols, axis = 1, inplace = True)
print('Columns dropped with 90% null data are: ', drop_cols)


Columns dropped with 90% null data are:  Index(['mths_since_last_record', 'next_pymnt_d', 'mths_since_last_major_derog',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
       'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_il_6m',
       'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il',
       'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
       'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op',
       'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc',
       'mths_since_recent_bc_dlq', 'mths_since_recent_inq',
       'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
       'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl',
       'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0',
       'num_sats', 'num_tl_120dpd_2m'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
df_train.head

<bound method NDFrame.head of            id  member_id  ...  pub_rec_bankruptcies  tax_liens
0      573354     737474  ...                   0.0        0.0
1      476321     603324  ...                   0.0        0.0
2      451484     556265  ...                   0.0        0.0
4      800018    1005270  ...                   0.0        0.0
5      471391     595223  ...                   0.0        0.0
...       ...        ...  ...                   ...        ...
24994  450579     554340  ...                   0.0        0.0
24995  788033     991661  ...                   0.0        0.0
24996  568459     731299  ...                   0.0        0.0
24997  397594     428786  ...                   1.0        0.0
24998  660367     844601  ...                   0.0        0.0

[24301 rows x 46 columns]>

In [10]:
for column in df_train.columns:
    if df_train[column].isna().sum() != 0:
        missing = df_train[column].isna().sum()
        portion = (missing / df_train.shape[0]) * 100
        print(f"'{column}': number of missing values '{missing}' ==> '{portion:.3f}%'")

'desc': number of missing values '7943' ==> '32.686%'
'mths_since_last_delinq': number of missing values '15767' ==> '64.882%'
'revol_util': number of missing values '29' ==> '0.119%'
'collections_12_mths_ex_med': number of missing values '29' ==> '0.119%'
'chargeoff_within_12_mths': number of missing values '29' ==> '0.119%'
'pub_rec_bankruptcies': number of missing values '417' ==> '1.716%'
'tax_liens': number of missing values '17' ==> '0.070%'


In [11]:
df_train.drop('mths_since_last_delinq', axis = 1, inplace = True)
df_test.drop('mths_since_last_delinq', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
df_train.shape

(24301, 45)

In [13]:
print([column for column in df_train.columns if df_train[column].dtype == object])

['term', 'grade', 'sub_grade', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'desc', 'purpose', 'addr_state', 'initial_list_status', 'application_type']


In [14]:
df_train.term.unique()

array([' 36 months', ' 60 months'], dtype=object)

In [15]:
term_values = {' 36 months': 36, ' 60 months': 60}
df_train['term'] = df_train.term.map(term_values)
df_test['term'] = df_test.term.map(term_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# Drop grade because it is just a sub feature of sub_grade
df_train.drop('grade', axis=1, inplace=True)
df_test.drop('grade', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
# Drop issue date column as it is uncertain whether a loan will be issued for our problem statement
df_train.drop('issue_d', axis=1, inplace=True)
df_test.drop('issue_d', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [18]:
df_train['loan_status'] = df_train.loan_status.map({'Fully Paid':1, 'Charged Off': -1})
df_test['loan_status'] = df_test.loan_status.map({'Fully Paid':1, 'Charged Off': -1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
df_train['sub_grade'].unique()

array(['A4', 'A5', 'C3', 'D5', 'D2', 'B5', 'C2', 'C1', 'B1', 'B2', 'A3',
       'E3', 'F5', 'C5', 'G2', 'F1', 'F4', 'D1', 'B3', 'B4', 'A1', 'F2',
       'C4', 'D3', 'A2', 'F3', 'E2', 'D4', 'E1', 'E5', 'G1', 'E4', 'G4',
       'G5', 'G3'], dtype=object)

In [20]:
df_train['home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE', 'OTHER'], dtype=object)

In [21]:
df_train['verification_status'].unique()

array(['Source Verified', 'Not Verified', 'Verified'], dtype=object)

In [22]:
df_train['pymnt_plan'].unique()

array(['n'], dtype=object)

In [23]:
# Drop single unique value
df_train.drop('pymnt_plan', axis=1, inplace=True)
df_test.drop('pymnt_plan', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [24]:
df_train['desc'].head(20)

0           Borrower added on 08/30/10 > thank you<br/>
1       Borrower added on 01/14/10 > Green city hous...
2                                                      
4       Borrower added on 06/29/11 > thanks for the ...
5                                                   NaN
6       Borrower added on 05/25/10 > I obtained a di...
7       Borrower added on 01/30/11 > Money will be u...
8       Borrower added on 10/03/11 > Loan to consoli...
10     560296 added on 10/20/09 > I am a graphic des...
11                                                  NaN
12      Borrower added on 04/26/10 > My mom's store,...
13                                                  NaN
14    I have been doing professional motion graphics...
15      Borrower added on 03/14/11 > This fund is fo...
16                                                  NaN
17    I will use the money to  consolidate credit ca...
18                                                  NaN
20      Borrower added on 01/09/11 > need to con

In [25]:
# Drop desc as not relevant info can be obtained from it
df_train.drop('desc', axis=1, inplace=True)
df_test.drop('desc', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
df_train['purpose'].unique()

array(['home_improvement', 'other', 'major_purchase',
       'debt_consolidation', 'credit_card', 'small_business', 'wedding',
       'medical', 'moving', 'car', 'educational', 'vacation', 'house',
       'renewable_energy'], dtype=object)

In [27]:
df_train['addr_state'].unique()

array(['NJ', 'GA', 'FL', 'CA', 'MO', 'MI', 'OR', 'CT', 'PA', 'IL', 'MN',
       'AZ', 'NY', 'NC', 'WI', 'UT', 'VA', 'MD', 'CO', 'TX', 'OK', 'OH',
       'AL', 'NV', 'AK', 'DC', 'VT', 'SC', 'WA', 'DE', 'KS', 'HI', 'MA',
       'KY', 'WY', 'NH', 'NM', 'LA', 'WV', 'AR', 'RI', 'IA', 'TN', 'MT',
       'NE', 'SD', 'MS', 'ME', 'ID', 'IN'], dtype=object)

In [28]:
df_train['initial_list_status'].unique()

array(['f'], dtype=object)

In [29]:
# Drop single unique value
df_train.drop('initial_list_status', axis=1, inplace=True)
df_test.drop('initial_list_status', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [30]:
df_train['application_type'].unique()

array(['INDIVIDUAL'], dtype=object)

In [31]:
# Drop single unique value
df_train.drop('application_type', axis=1, inplace=True)
df_test.drop('application_type', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [32]:
print(df_train.shape)
print(df_test.shape)

(24301, 39)
(14276, 39)


In [33]:
df_train

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,sub_grade,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,573354,737474,10000,10000,9950.00000,36,7.51,311.11,A4,RENT,30000.0,Source Verified,-1,home_improvement,NJ,5.00,0,3,19,0,1450,1.2,48,0.0,0.0,2247.210000,2235.98,1509.91,350.87,0.0,386.43,3.96,311.11,0.0,0,0.0,0,0.0,0.0
1,476321,603324,15000,15000,14800.00000,36,8.94,476.58,A5,RENT,147000.0,Not Verified,1,other,GA,3.47,0,0,6,0,4910,14.0,17,0.0,0.0,15112.760000,14911.26,15000.00,112.76,0.0,0.00,0.00,15114.03,0.0,0,0.0,0,0.0,0.0
2,451484,556265,2000,2000,2000.00000,36,13.57,67.94,C3,OWN,36000.0,Not Verified,1,major_purchase,GA,7.83,0,0,8,0,1790,34.4,10,0.0,0.0,2354.966827,2354.97,2000.00,354.97,0.0,0.00,0.00,101.78,0.0,0,0.0,0,0.0,0.0
4,800018,1005270,14000,14000,14000.00000,60,17.49,351.64,D5,MORTGAGE,50000.0,Not Verified,1,debt_consolidation,FL,21.24,1,1,9,1,553,9.4,27,0.0,0.0,20804.230020,20804.23,14000.00,6804.23,0.0,0.00,0.00,3943.27,0.0,0,0.0,0,0.0,0.0
5,471391,595223,12000,12000,11900.00000,36,8.94,381.26,A5,MORTGAGE,294000.0,Not Verified,1,other,CA,0.50,0,0,11,0,5306,2.8,21,0.0,0.0,12344.811770,12241.94,12000.00,344.81,0.0,0.00,0.00,11204.30,0.0,0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,450579,554340,5500,5500,5500.00000,36,14.96,190.55,D2,MORTGAGE,52000.0,Not Verified,1,debt_consolidation,FL,18.74,0,2,11,0,10655,58.2,37,0.0,0.0,6825.066783,6825.07,5500.00,1325.07,0.0,0.00,0.00,1122.20,0.0,0,0.0,0,0.0,0.0
24995,788033,991661,11450,11450,11450.00000,36,8.49,361.40,A5,RENT,40000.0,Source Verified,1,debt_consolidation,WA,21.21,0,0,10,0,10192,34.0,15,0.0,0.0,13010.206030,13010.21,11450.00,1560.21,0.0,0.00,0.00,382.90,0.0,0,0.0,0,0.0,0.0
24996,568459,731299,24000,24000,21100.31424,60,20.53,642.96,G2,MORTGAGE,74454.0,Verified,1,debt_consolidation,GA,19.95,0,3,9,0,29108,77.2,24,0.0,0.0,33765.068310,26978.96,24000.00,9765.07,0.0,0.00,0.00,16500.39,0.0,0,0.0,0,0.0,0.0
24997,397594,428786,14000,14000,11411.12089,36,17.58,503.19,F2,MORTGAGE,86000.0,Verified,1,debt_consolidation,OH,21.93,0,0,13,1,18991,95.0,26,0.0,0.0,17917.012900,14191.06,14000.00,3917.01,0.0,0.00,0.00,3857.88,0.0,0,0.0,0,1.0,0.0


In [34]:
df_train.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'sub_grade', 'home_ownership',
       'annual_inc', 'verification_status', 'loan_status', 'purpose',
       'addr_state', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'acc_now_delinq',
       'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
       'tax_liens'],
      dtype='object')

In [35]:
df_train.describe()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
count,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24272.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24301.0,24272.0,24301.0,24272.0,24301.0,23884.0,24284.0
mean,675923.5,841913.0,11108.384429,10836.688819,10275.873993,42.022468,11.976828,323.714343,69802.25,0.714086,13.293911,0.145426,0.870335,9.347475,0.053413,13427.547385,49.162996,22.141393,0.0,0.0,11968.517641,11367.899976,9723.016746,2143.968805,1.358008,100.17416,12.354023,2771.94458,0.0,0.0,0.0,0.0,0.041953,0.0
std,208045.2,263006.6,7295.67766,7032.562475,6965.327422,10.405459,3.698571,206.729425,69999.15,0.700073,6.64629,0.494832,1.071179,4.398961,0.235061,15801.73578,28.273209,11.364283,0.0,0.0,8770.861953,8652.249737,6971.62505,2388.945182,7.39747,719.351788,139.801427,4505.872171,0.0,0.0,0.0,0.0,0.201319,0.0
min,55742.0,73673.0,500.0,500.0,0.0,36.0,5.42,16.08,4080.0,-1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,512671.0,661903.0,5500.0,5500.0,5000.0,36.0,8.94,167.78,42000.0,1.0,8.18,0.0,0.0,6.0,0.0,3794.0,25.8,14.0,0.0,0.0,5598.284713,5149.52,4700.0,664.42,0.0,0.0,0.0,223.36,0.0,0.0,0.0,0.0,0.0,0.0
50%,655764.0,838222.0,10000.0,9600.0,8925.0,36.0,11.83,280.97,60000.0,1.0,13.39,0.0,1.0,9.0,0.0,8906.0,49.7,20.0,0.0,0.0,9884.01,9263.59,8000.0,1336.55,0.0,0.0,0.0,586.18,0.0,0.0,0.0,0.0,0.0,0.0
75%,826987.0,1035246.0,15000.0,15000.0,14025.0,60.0,14.46,426.47,84000.0,1.0,18.53,0.0,1.0,12.0,0.0,17102.0,72.7,29.0,0.0,0.0,16204.15357,15439.11,13250.0,2711.22,0.0,0.0,0.0,3514.26,0.0,0.0,0.0,0.0,0.0,0.0
max,1076863.0,1304884.0,35000.0,35000.0,35000.0,60.0,24.4,1305.19,6000000.0,1.0,29.99,11.0,8.0,44.0,4.0,149588.0,99.9,79.0,0.0,0.0,58480.13992,58438.37,35000.01,23480.14,166.429711,29623.35,5602.72,36115.2,0.0,0.0,0.0,0.0,2.0,0.0


In [36]:
# Remove all columns with single values whose mean, std, min, max all are 0.0
df_train.drop(['collections_12_mths_ex_med'] , 1, inplace = True)
df_test.drop(['collections_12_mths_ex_med'] , 1, inplace = True)

df_train.drop(['chargeoff_within_12_mths'] , 1, inplace = True)
df_test.drop(['chargeoff_within_12_mths'] , 1, inplace = True)

df_train.drop(['tax_liens'] , 1, inplace = True)
df_test.drop(['tax_liens'] , 1, inplace = True)

df_train.drop(['delinq_amnt'] , 1, inplace = True)
df_test.drop(['delinq_amnt'] , 1, inplace = True)

df_train.drop(['acc_now_delinq'] , 1, inplace = True)
df_test.drop(['acc_now_delinq'] , 1, inplace = True)

df_train.drop(['out_prncp'] , 1, inplace = True)
df_test.drop(['out_prncp'] , 1, inplace = True)

df_train.drop(['out_prncp_inv'] , 1, inplace = True)
df_test.drop(['out_prncp_inv'] , 1, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [37]:
df_train.shape

(24301, 32)

In [38]:
df_train

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,sub_grade,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,pub_rec_bankruptcies
0,573354,737474,10000,10000,9950.00000,36,7.51,311.11,A4,RENT,30000.0,Source Verified,-1,home_improvement,NJ,5.00,0,3,19,0,1450,1.2,48,2247.210000,2235.98,1509.91,350.87,0.0,386.43,3.96,311.11,0.0
1,476321,603324,15000,15000,14800.00000,36,8.94,476.58,A5,RENT,147000.0,Not Verified,1,other,GA,3.47,0,0,6,0,4910,14.0,17,15112.760000,14911.26,15000.00,112.76,0.0,0.00,0.00,15114.03,0.0
2,451484,556265,2000,2000,2000.00000,36,13.57,67.94,C3,OWN,36000.0,Not Verified,1,major_purchase,GA,7.83,0,0,8,0,1790,34.4,10,2354.966827,2354.97,2000.00,354.97,0.0,0.00,0.00,101.78,0.0
4,800018,1005270,14000,14000,14000.00000,60,17.49,351.64,D5,MORTGAGE,50000.0,Not Verified,1,debt_consolidation,FL,21.24,1,1,9,1,553,9.4,27,20804.230020,20804.23,14000.00,6804.23,0.0,0.00,0.00,3943.27,0.0
5,471391,595223,12000,12000,11900.00000,36,8.94,381.26,A5,MORTGAGE,294000.0,Not Verified,1,other,CA,0.50,0,0,11,0,5306,2.8,21,12344.811770,12241.94,12000.00,344.81,0.0,0.00,0.00,11204.30,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,450579,554340,5500,5500,5500.00000,36,14.96,190.55,D2,MORTGAGE,52000.0,Not Verified,1,debt_consolidation,FL,18.74,0,2,11,0,10655,58.2,37,6825.066783,6825.07,5500.00,1325.07,0.0,0.00,0.00,1122.20,0.0
24995,788033,991661,11450,11450,11450.00000,36,8.49,361.40,A5,RENT,40000.0,Source Verified,1,debt_consolidation,WA,21.21,0,0,10,0,10192,34.0,15,13010.206030,13010.21,11450.00,1560.21,0.0,0.00,0.00,382.90,0.0
24996,568459,731299,24000,24000,21100.31424,60,20.53,642.96,G2,MORTGAGE,74454.0,Verified,1,debt_consolidation,GA,19.95,0,3,9,0,29108,77.2,24,33765.068310,26978.96,24000.00,9765.07,0.0,0.00,0.00,16500.39,0.0
24997,397594,428786,14000,14000,11411.12089,36,17.58,503.19,F2,MORTGAGE,86000.0,Verified,1,debt_consolidation,OH,21.93,0,0,13,1,18991,95.0,26,17917.012900,14191.06,14000.00,3917.01,0.0,0.00,0.00,3857.88,1.0


In [39]:
dummies = ['sub_grade', 'home_ownership', 'verification_status', 'purpose', 'addr_state']
df_train = pd.get_dummies(df_train, columns=dummies, drop_first=True)
df_test = pd.get_dummies(df_test, columns=dummies, drop_first=True)

In [40]:
# Remove id and member_id columns becuase those are irrelevant to loan status
df_train.drop(['id'] , 1, inplace = True)
df_test.drop(['id'] , 1, inplace = True)

df_train.drop(['member_id'] , 1, inplace = True)
df_test.drop(['member_id'] , 1, inplace = True)

In [41]:
print(df_train.shape)
print(df_test.shape)

(24301, 126)
(14276, 126)


In [42]:
df_train

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,pub_rec_bankruptcies,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,...,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,10000,10000,9950.00000,36,7.51,311.11,30000.0,-1,5.00,0,3,19,0,1450,1.2,48,2247.210000,2235.98,1509.91,350.87,0.0,386.43,3.96,311.11,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15000,15000,14800.00000,36,8.94,476.58,147000.0,1,3.47,0,0,6,0,4910,14.0,17,15112.760000,14911.26,15000.00,112.76,0.0,0.00,0.00,15114.03,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2000,2000,2000.00000,36,13.57,67.94,36000.0,1,7.83,0,0,8,0,1790,34.4,10,2354.966827,2354.97,2000.00,354.97,0.0,0.00,0.00,101.78,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,14000,14000,14000.00000,60,17.49,351.64,50000.0,1,21.24,1,1,9,1,553,9.4,27,20804.230020,20804.23,14000.00,6804.23,0.0,0.00,0.00,3943.27,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,12000,12000,11900.00000,36,8.94,381.26,294000.0,1,0.50,0,0,11,0,5306,2.8,21,12344.811770,12241.94,12000.00,344.81,0.0,0.00,0.00,11204.30,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,5500,5500,5500.00000,36,14.96,190.55,52000.0,1,18.74,0,2,11,0,10655,58.2,37,6825.066783,6825.07,5500.00,1325.07,0.0,0.00,0.00,1122.20,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24995,11450,11450,11450.00000,36,8.49,361.40,40000.0,1,21.21,0,0,10,0,10192,34.0,15,13010.206030,13010.21,11450.00,1560.21,0.0,0.00,0.00,382.90,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
24996,24000,24000,21100.31424,60,20.53,642.96,74454.0,1,19.95,0,3,9,0,29108,77.2,24,33765.068310,26978.96,24000.00,9765.07,0.0,0.00,0.00,16500.39,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24997,14000,14000,11411.12089,36,17.58,503.19,86000.0,1,21.93,0,0,13,1,18991,95.0,26,17917.012900,14191.06,14000.00,3917.01,0.0,0.00,0.00,3857.88,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
df_train.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'annual_inc', 'loan_status', 'dti', 'delinq_2yrs',
       ...
       'addr_state_SD', 'addr_state_TN', 'addr_state_TX', 'addr_state_UT',
       'addr_state_VA', 'addr_state_VT', 'addr_state_WA', 'addr_state_WI',
       'addr_state_WV', 'addr_state_WY'],
      dtype='object', length=126)

In [44]:
print(df_train.dtypes)
print(df_train.info())

loan_amnt            int64
funded_amnt          int64
funded_amnt_inv    float64
term                 int64
int_rate           float64
                    ...   
addr_state_VT        uint8
addr_state_WA        uint8
addr_state_WI        uint8
addr_state_WV        uint8
addr_state_WY        uint8
Length: 126, dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24301 entries, 0 to 24998
Columns: 126 entries, loan_amnt to addr_state_WY
dtypes: float64(15), int64(10), uint8(101)
memory usage: 7.2 MB
None


In [45]:
# Check and remove duplicates columns & features
print(f"Shape before removing duplicates: {df_train.shape}")

# Remove duplicate Features
df_train = df_train.T.drop_duplicates()
df_train = df_train.T

# Remove Duplicate Rows
df_train.drop_duplicates(inplace=True)

print(f"Shape after removing duplicates: {df_train.shape}")

Shape before removing duplicates: (24301, 126)
Shape after removing duplicates: (24301, 126)


In [46]:
# Comparing Fully paid Vs Charged off data in train 
df_train.loan_status.value_counts()

 1.0    20827
-1.0     3474
Name: loan_status, dtype: int64

In [47]:
df_train.isnull().values.any()
df_train.isnull().sum().sum()

446

In [48]:
df_train.isnull().any().sum()

2

In [49]:
print(df_train.isnull().values.any())
print(df_train.isnull().sum().sum())
print(df_train.isnull().any().sum())
print(df_train.isna().any()[lambda x: x])


True
446
2
revol_util              True
pub_rec_bankruptcies    True
dtype: bool


In [50]:
# replace NaN with respective mean of that column
df_train['revol_util'].fillna((df_train['revol_util'].mean()), inplace=True)
df_test['revol_util'].fillna((df_train['revol_util'].mean()), inplace=True)

df_train['pub_rec_bankruptcies'].fillna((df_train['pub_rec_bankruptcies'].mean()), inplace=True)
df_test['pub_rec_bankruptcies'].fillna((df_train['pub_rec_bankruptcies'].mean()), inplace=True)

In [51]:
print(df_train.isnull().values.any())
print(df_train.isnull().sum().sum())
print(df_train.isnull().any().sum())
print(df_train.isna().any()[lambda x: x])

False
0
0
Series([], dtype: bool)


In [52]:
# Method to print classifer details
def print_classifier(clf):
  print(f"Classifier:"
        f"\nMax Features: {clf.max_features}"
        f"\nNumber of Trees: {clf.n_estimators}"
        f"\nLearning Rate: {clf.learning_rate}\n")

In [53]:
# Method helps to print the classification and accoracy scores
def get_scores(clf, true, pred, eval):
  if eval == "Train" and clf != "None":
    print_classifier(clf)
  # targets = ["Charged of", "Fully Paid"]
  # clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True, target_names=targets))
  print("===========================================================================")
  print(eval, " Result:\n===========================================================================")
  print(f"Precision : {precision_score(true, pred)}")
  print(f"Recall : {recall_score(true, pred)}")
  print(f"Accuracy: {accuracy_score(true, pred) * 100:.2f}%")
  print("===========================================================================")

In [54]:
X_train, y_train = df_train.drop('loan_status', axis=1), df_train.loan_status
X_test, y_test = df_test.drop('loan_status', axis=1), df_test.loan_status

# Normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [55]:
# Method takes input the classifier and fits it to training data and return train, test prediction
def predict(clf):
  # Training
  clf.fit(X_train, y_train)
  
  # Training and Testing prediction
  y_train_pred = clf.predict(X_train)
  y_test_pred = clf.predict(X_test)

  return y_train_pred, y_test_pred

In [56]:
# Print accuracy and other classification metric scores
def print_results(clf, y_train_pred, y_test_pred):
  get_scores(clf, y_train, y_train_pred, "Train")
  print("\n")
  get_scores(clf, y_test, y_test_pred, "Test")

In [57]:
# Method takes GradientBoostingClassifier parameters and returns the classifier
def build_gbc_models(l_r=0.1, n_est=100, max_f=None):
  gradBoostClf = GradientBoostingClassifier(learning_rate=l_r, n_estimators=n_est, max_features=max_f)
  return gradBoostClf


# Parameter we will consider for building different models
**learning_rate**: float, default=0.1
> Learning rate shrinks the contribution of each tree by learning_rate. There is a trade-off between learning_rate and n_estimators.
---
**n_estimators**: int, default=100
> The number of boosting stages to perform. 
---
**max_features**: {‘auto’, ‘sqrt’, ‘log2’}, int or float, default=None
> The number of features to consider when looking for the best split. Choosing max_features < n_features leads to a reduction of variance and an increase in bias.

In [81]:
# Classifier/Model-0
gradBoostClf = build_gbc_models(l_r=0.1, n_est=20, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 20
Learning Rate: 0.1

Train  Result:
Precision : 0.9820350811014712
Recall : 1.0
Accuracy: 98.43%


Test  Result:
Precision : 0.9794780641512483
Recall : 1.0
Accuracy: 98.22%


In [68]:
# Classifier/Model-1
gradBoostClf = build_gbc_models(l_r=0.1, n_est=50, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 50
Learning Rate: 0.1

Train  Result:
Precision : 0.9907240034249833
Recall : 1.0
Accuracy: 99.20%


Test  Result:
Precision : 0.9883417577042231
Recall : 1.0
Accuracy: 99.00%


In [69]:
# Classifier/Model-2
gradBoostClf = build_gbc_models(l_r=0.1, n_est=75, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 75
Learning Rate: 0.1

Train  Result:
Precision : 0.9947461431914792
Recall : 1.0
Accuracy: 99.55%


Test  Result:
Precision : 0.9925495333224169
Recall : 1.0
Accuracy: 99.36%


In [70]:
# Classifier/Model-3
gradBoostClf = build_gbc_models(l_r=0.1, n_est=100, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 100
Learning Rate: 0.1

Train  Result:
Precision : 0.9964118266194623
Recall : 1.0
Accuracy: 99.69%


Test  Result:
Precision : 0.9940949725252194
Recall : 0.9998350243339107
Accuracy: 99.48%


In [71]:
# Classifier/Model-4
gradBoostClf = build_gbc_models(l_r=0.1, n_est=125, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 125
Learning Rate: 0.1

Train  Result:
Precision : 0.9972706378088488
Recall : 1.0
Accuracy: 99.77%


Test  Result:
Precision : 0.9943396226415094
Recall : 0.9998350243339107
Accuracy: 99.50%


In [74]:
# Classifier/Model-5
gradBoostClf = build_gbc_models(l_r=0.1, n_est=150, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 150
Learning Rate: 0.1

Train  Result:
Precision : 0.9979874454933154
Recall : 1.0
Accuracy: 99.83%


Test  Result:
Precision : 0.9945027896291434
Recall : 0.9998350243339107
Accuracy: 99.52%


In [75]:
# Classifier/Model-6
gradBoostClf = build_gbc_models(l_r=0.1, n_est=200, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 200
Learning Rate: 0.1

Train  Result:
Precision : 0.9989927091327705
Recall : 1.0
Accuracy: 99.91%


Test  Result:
Precision : 0.9951559934318555
Recall : 0.9998350243339107
Accuracy: 99.57%


In [77]:
# Classifier/Model-7
gradBoostClf = build_gbc_models(l_r=0.1, n_est=300, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 300
Learning Rate: 0.1

Train  Result:
Precision : 0.9999519877088535
Recall : 1.0
Accuracy: 100.00%


Test  Result:
Precision : 0.9958104000657192
Recall : 0.9999175121669553
Accuracy: 99.64%


In [78]:
# Classifier/Model-6
gradBoostClf = build_gbc_models(l_r=0.1, n_est=500, max_f=None)
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: None
Number of Trees: 500
Learning Rate: 0.1

Train  Result:
Precision : 1.0
Recall : 1.0
Accuracy: 100.00%


Test  Result:
Precision : 0.9965474722564734
Recall : 1.0
Accuracy: 99.71%


In [61]:
# Classifier/Model-7
gradBoostClf = build_gbc_models(l_r=0.5, n_est=150, max_f='log2')
y_train_pred, y_test_pred = predict(gradBoostClf)
print_results(gradBoostClf, y_train_pred, y_test_pred)

Classifier:
Max Features: log2
Number of Trees: 150
Learning Rate: 0.5

Train  Result:
Precision : 0.9987531770009112
Recall : 1.0
Accuracy: 99.89%


Test  Result:
Precision : 0.9906182987848463
Recall : 0.9145426049657676
Accuracy: 92.01%


In [63]:
# Random Forests using sklearn
rfClf = RandomForestClassifier()
y_train_pred, y_test_pred = predict(rfClf)
print(f"Classifier: {rfClf}")
print_results("None", y_train_pred, y_test_pred)

Classifier: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Train  Result:
Precision : 1.0
Recall : 1.0
Accuracy: 100.00%


Test  Result:
Precision : 0.9910853030179112
Recall : 0.9995875608347768
Accuracy: 99.20%
