# Import

In [1]:
import pickle

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, Pool, cv

# Load data

In [2]:
X_train = pd.read_csv("X_train.csv",sep=',')
Y_train = pd.read_csv("y_train.csv",sep=',')

In [3]:
sorted(X_train.columns)

['acc_now_delinq',
 'addr_state',
 'annual_inc',
 'chargeoff_within_12_mths',
 'collections_12_mths_ex_med',
 'delinq_2yrs',
 'dti',
 'earliest_cr_line',
 'emp_length',
 'fico_range_high',
 'fico_range_low',
 'funded_amnt',
 'home_ownership',
 'index',
 'inq_last_12m',
 'installment',
 'int_rate',
 'issue_d',
 'loan_amnt',
 'mort_acc',
 'mths_since_last_delinq',
 'mths_since_recent_bc_dlq',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_rev_accts',
 'num_sats',
 'num_tl_120dpd_2m',
 'num_tl_30dpd',
 'num_tl_90g_dpd_24m',
 'num_tl_op_past_12m',
 'open_acc',
 'open_il_24m',
 'open_rv_24m',
 'percent_bc_gt_75',
 'pub_rec',
 'pub_rec_bankruptcies',
 'purpose',
 'revol_util',
 'tax_liens',
 'term',
 'title',
 'total_acc',
 'verification_status',
 'zip_code']

In [4]:
X_train.set_index('index',inplace = True)
Y_train.set_index('index',inplace = True)

In [5]:
single_val_cols = list(X_train.columns[X_train.nunique()==1])
single_val_cols

[]

In [6]:
two_val_cols = list(X_train.columns[X_train.nunique()==2])
two_val_cols

['term']

In [7]:
X_train.nunique()

acc_now_delinq                    7
addr_state                       51
annual_inc                    59051
chargeoff_within_12_mths         11
collections_12_mths_ex_med       13
delinq_2yrs                      32
dti                            4747
earliest_cr_line                748
emp_length                       11
fico_range_high                  38
fico_range_low                   38
funded_amnt                    1562
home_ownership                    6
inq_last_12m                     45
installment                   81217
int_rate                        669
issue_d                         160
loan_amnt                      1562
mort_acc                         39
mths_since_last_delinq          167
mths_since_recent_bc_dlq        168
mths_since_recent_inq            26
num_accts_ever_120_pd            43
num_actv_bc_tl                   36
num_rev_accts                   111
num_sats                         83
num_tl_120dpd_2m                  6
num_tl_30dpd                

In [8]:
X_train.sample(n=10)

Unnamed: 0_level_0,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,fico_range_high,fico_range_low,funded_amnt,home_ownership,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
344233,0.0,CA,87500.0,0.0,0.0,2.0,33.89,Oct-1975,10+ years,709.0,705.0,28000.0,RENT,3.0,791.43,23.13%,Feb-2016,28000.0,0.0,22.0,,0.0,0.0,7.0,21.0,24.0,0.0,0.0,2.0,7.0,24.0,3.0,4.0,37.5,0.0,0.0,debt_consolidation,42.8%,0.0,60 months,Debt consolidation,40.0,Source Verified,900xx
691399,0.0,NY,100000.0,0.0,0.0,1.0,21.71,Oct-2007,2 years,704.0,700.0,15000.0,MORTGAGE,5.0,499.15,12.13%,May-2018,15000.0,1.0,15.0,15.0,10.0,0.0,3.0,5.0,12.0,0.0,0.0,0.0,1.0,12.0,1.0,1.0,100.0,0.0,0.0,home_improvement,73.3%,0.0,36 months,Home improvement,23.0,Verified,124xx
503760,0.0,CA,52000.0,0.0,0.0,0.0,27.6,Mar-2001,10+ years,699.0,695.0,20000.0,RENT,,632.93,8.67%,Jan-2015,20000.0,0.0,,,,0.0,5.0,11.0,9.0,0.0,0.0,0.0,0.0,9.0,,,42.9,0.0,0.0,debt_consolidation,53.3%,0.0,36 months,Debt consolidation,13.0,Not Verified,926xx
1019177,0.0,CA,150000.0,0.0,0.0,0.0,22.66,May-2006,10+ years,679.0,675.0,4500.0,RENT,1.0,158.19,15.99%,Apr-2017,4500.0,0.0,,,1.0,0.0,5.0,13.0,12.0,0.0,0.0,0.0,1.0,12.0,2.0,1.0,80.0,0.0,0.0,credit_card,91.5%,0.0,36 months,Credit card refinancing,20.0,Source Verified,900xx
316251,0.0,NV,16000.0,0.0,0.0,0.0,37.21,Aug-2003,,674.0,670.0,1500.0,RENT,1.0,55.75,20.00%,Jul-2019,1500.0,0.0,95.0,,5.0,1.0,3.0,14.0,9.0,0.0,0.0,0.0,2.0,9.0,4.0,1.0,33.3,1.0,1.0,credit_card,26.1%,0.0,36 months,Credit card refinancing,21.0,Verified,891xx
358742,0.0,NJ,108200.0,0.0,0.0,0.0,14.25,Oct-1990,10+ years,699.0,695.0,14000.0,MORTGAGE,,330.5,14.65%,Jul-2015,14000.0,1.0,,,9.0,0.0,8.0,28.0,19.0,0.0,0.0,0.0,6.0,19.0,,,60.0,0.0,0.0,debt_consolidation,63.4%,0.0,60 months,Debt consolidation,33.0,Source Verified,070xx
569905,0.0,MN,40000.0,0.0,0.0,0.0,12.33,Nov-2006,2 years,799.0,795.0,9375.0,MORTGAGE,0.0,282.33,5.32%,Aug-2017,9375.0,2.0,,,,0.0,0.0,4.0,9.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,other,0%,0.0,36 months,Other,17.0,Not Verified,550xx
765851,0.0,FL,81000.0,0.0,0.0,1.0,26.57,Apr-1983,9 years,669.0,665.0,20000.0,RENT,12.0,693.21,14.99%,Jan-2017,20000.0,0.0,21.0,32.0,0.0,6.0,5.0,22.0,15.0,0.0,0.0,0.0,9.0,15.0,2.0,13.0,0.0,0.0,0.0,debt_consolidation,21.6%,0.0,36 months,Debt consolidation,31.0,Verified,324xx
1113625,0.0,AL,76000.0,0.0,0.0,0.0,22.79,Sep-1992,8 years,719.0,715.0,20000.0,MORTGAGE,,680.26,13.66%,Jan-2015,20000.0,2.0,58.0,,0.0,3.0,11.0,22.0,22.0,0.0,0.0,0.0,10.0,22.0,,,8.3,0.0,0.0,debt_consolidation,46.6%,0.0,36 months,Debt consolidation,49.0,Source Verified,360xx
925042,0.0,FL,80000.0,0.0,0.0,0.0,0.0,Sep-1986,< 1 year,789.0,785.0,1000.0,RENT,0.0,38.92,23.40%,Nov-2018,1000.0,0.0,,,,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,,0.0,0.0,debt_consolidation,0%,0.0,36 months,Debt consolidation,3.0,Source Verified,333xx


In [9]:
X_train.isna().sum()

acc_now_delinq                     0
addr_state                         0
annual_inc                         0
chargeoff_within_12_mths          37
collections_12_mths_ex_med        37
delinq_2yrs                        0
dti                                0
earliest_cr_line                   0
emp_length                     71747
fico_range_high                    0
fico_range_low                     0
funded_amnt                        0
home_ownership                     0
inq_last_12m                  567215
installment                        0
int_rate                           0
issue_d                            0
loan_amnt                          0
mort_acc                       31541
mths_since_last_delinq        607443
mths_since_recent_bc_dlq      917196
mths_since_recent_inq         149781
num_accts_ever_120_pd          45009
num_actv_bc_tl                 45009
num_rev_accts                  45010
num_sats                       37295
num_tl_120dpd_2m               90608
n

In [10]:
#months_since_last_delinquent
mean = X_train['mths_since_last_delinq'].mean()
X_train['mths_since_last_delinq'].fillna(value=mean,inplace=True)


In [11]:
#X_train['emp_length'].unique()
#X_train.info()
X_train['earliest_cr_line'] = pd.to_datetime(X_train['earliest_cr_line'],format = '%b-%Y')
X_train['earliest_cr_line'].sample(n=10)


index
472310    2004-12-01
459358    2004-11-01
814180    2000-10-01
1147299   2002-06-01
1108996   2000-04-01
769066    2005-12-01
209494    2004-06-01
354704    2005-05-01
400134    1992-10-01
959942    1992-11-01
Name: earliest_cr_line, dtype: datetime64[ns]

In [12]:
X_train['issue_d'] = pd.to_datetime(X_train['issue_d'],format = '%b-%Y')
X_train['issue_d'].sample(n=10)

index
18024     2016-05-01
102232    2017-03-01
741435    2014-11-01
184695    2009-10-01
115504    2019-02-01
92751     2013-05-01
1014147   2015-05-01
491760    2016-10-01
591188    2016-04-01
106499    2017-06-01
Name: issue_d, dtype: datetime64[ns]

In [13]:
X_train['int_rate'] = X_train['int_rate'].str.rstrip("%").astype(float)/100

In [14]:
X_train['revol_util'] = X_train['revol_util'].str.rstrip("%").astype(float)/100
X_train['revol_util'].sample(n=10)

index
1063635    0.781
981413     0.576
796907     0.644
606031     0.288
1129165    0.334
715591     0.000
921001     0.402
457739     0.901
127743     0.316
144915     0.469
Name: revol_util, dtype: float64

In [15]:
#X_train['percent_bc_gt_75'].plot(kind='kde')

In [16]:
X_train['mths_since_recent_bc_dlq'].value_counts()

25.0     4360
26.0     4341
28.0     4304
19.0     4240
29.0     4235
13.0     4229
27.0     4216
30.0     4203
24.0     4200
40.0     4183
46.0     4177
12.0     4176
31.0     4176
38.0     4173
33.0     4171
48.0     4164
22.0     4160
44.0     4159
45.0     4155
37.0     4153
42.0     4153
32.0     4143
36.0     4135
18.0     4133
20.0     4131
34.0     4130
15.0     4125
16.0     4124
35.0     4120
41.0     4115
14.0     4107
21.0     4106
23.0     4088
43.0     4082
39.0     4074
47.0     4055
17.0     4039
11.0     3735
9.0      3699
49.0     3525
10.0     3524
8.0      3475
6.0      3471
7.0      3369
70.0     3060
73.0     3057
74.0     3055
71.0     3028
72.0     3024
68.0     3003
69.0     2999
66.0     2994
75.0     2991
63.0     2984
5.0      2974
67.0     2965
62.0     2945
65.0     2928
64.0     2892
61.0     2843
60.0     2832
57.0     2781
56.0     2774
76.0     2759
54.0     2732
58.0     2725
59.0     2704
52.0     2655
4.0      2655
51.0     2583
55.0     2569
53.0  

In [17]:
#inc_last_12_months
median = X_train['inq_last_12m'].median()
#median
#X_train['inq_last_12m'].fillna(value=mean,inplace=True)
X_train['inq_last_12m'].fillna(value=median,inplace=True)

In [18]:
#Morgage_accounts
median = X_train['mort_acc'].median()
X_train['mort_acc'].fillna(value=median,inplace=True)

In [19]:
#Months_since_recent_bancrupt_dlq
mean = X_train['mths_since_recent_bc_dlq'].mean()

X_train['mths_since_recent_bc_dlq'].fillna(value=mean,inplace=True)

In [20]:
#mths_since_recent_inq
mean = X_train['mths_since_recent_inq'].mean()
X_train['mths_since_recent_inq'].fillna(value=mean,inplace=True)

In [21]:
#num_accts_ever_120_pd
median = X_train['num_accts_ever_120_pd'].median()
X_train['num_accts_ever_120_pd'].fillna(value=median,inplace=True)

In [22]:
#num_actv_bc_tl
mean = X_train['num_actv_bc_tl'].mean()
X_train['num_actv_bc_tl'].fillna(value=mean,inplace=True)

In [23]:
X_train['open_rv_24m'].value_counts()

1.0     128900
2.0     120478
3.0      93807
0.0      91678
4.0      66371
5.0      45160
6.0      29431
7.0      19642
8.0      12453
9.0       8149
10.0      5352
11.0      3545
12.0      2241
13.0      1651
14.0      1156
15.0       656
16.0       543
17.0       383
18.0       269
19.0       197
20.0       127
21.0       104
22.0        81
24.0        53
23.0        41
25.0        38
26.0        25
27.0        23
28.0        21
29.0        14
30.0        13
31.0         8
32.0         6
37.0         5
33.0         3
34.0         3
38.0         3
35.0         3
36.0         3
45.0         3
42.0         2
39.0         2
40.0         1
53.0         1
54.0         1
44.0         1
Name: open_rv_24m, dtype: int64

In [24]:
mean = X_train['num_rev_accts'].mean()
X_train['num_rev_accts'].fillna(value=mean,inplace=True)

In [25]:
mean = X_train['num_sats'].mean()
X_train['num_sats'].fillna(value=mean,inplace=True)

In [26]:
mean = X_train['num_tl_op_past_12m'].mean()

X_train['num_tl_op_past_12m'].fillna(value=mean,inplace=True)

In [27]:
median = X_train['open_il_24m'].median()
X_train['open_il_24m'].fillna(value=median,inplace=True)

In [28]:
mean = X_train['open_rv_24m'].mean()

X_train['open_rv_24m'].fillna(value=mean,inplace=True)

In [29]:
X_train.isna().sum()

acc_now_delinq                    0
addr_state                        0
annual_inc                        0
chargeoff_within_12_mths         37
collections_12_mths_ex_med       37
delinq_2yrs                       0
dti                               0
earliest_cr_line                  0
emp_length                    71747
fico_range_high                   0
fico_range_low                    0
funded_amnt                       0
home_ownership                    0
inq_last_12m                      0
installment                       0
int_rate                          0
issue_d                           0
loan_amnt                         0
mort_acc                          0
mths_since_last_delinq            0
mths_since_recent_bc_dlq          0
mths_since_recent_inq             0
num_accts_ever_120_pd             0
num_actv_bc_tl                    0
num_rev_accts                     0
num_sats                          0
num_tl_120dpd_2m              90608
num_tl_30dpd                

In [30]:
drop_na_cols = ['percent_bc_gt_75','pub_rec_bankruptcies','revol_util','tax_liens','zip_code']
X_train.dropna(subset = drop_na_cols,inplace=True)

In [31]:
#X_train = X_train.drop(['open_il_24m','open_rv_24m','mths_since_recent_inq','mths_since_recent_bc_dlq','mths_since_last_delinq',
                 #      'inq_last_12m'],axis=1)

In [32]:
indexes = X_train.index.values
Y_train = Y_train.loc[indexes]
Y_train.head

<bound method NDFrame.head of          loan_status
index               
0                  1
1                  0
2                  1
3                  0
4                  0
...              ...
1199856            0
1199857            0
1199858            0
1199859            0
1199860            1

[1155413 rows x 1 columns]>

# Train

In [33]:
cat_cols = ['acc_now_delinq','addr_state','chargeoff_within_12_mths','collections_12_mths_ex_med',
            'home_ownership','purpose','term','verification_status','emp_length','num_tl_120dpd_2m','num_tl_30dpd','num_tl_90g_dpd_24m','zip_code']
exclude_cols = ['title','index']

In [34]:
feature_names = set(X_train.columns) - set(exclude_cols)
feature_names = list(feature_names)
feature_names


['fico_range_high',
 'revol_util',
 'earliest_cr_line',
 'loan_amnt',
 'installment',
 'emp_length',
 'home_ownership',
 'zip_code',
 'inq_last_12m',
 'fico_range_low',
 'purpose',
 'acc_now_delinq',
 'pub_rec',
 'open_rv_24m',
 'pub_rec_bankruptcies',
 'total_acc',
 'dti',
 'mths_since_recent_bc_dlq',
 'num_tl_30dpd',
 'delinq_2yrs',
 'open_acc',
 'funded_amnt',
 'num_rev_accts',
 'addr_state',
 'int_rate',
 'open_il_24m',
 'num_tl_90g_dpd_24m',
 'num_accts_ever_120_pd',
 'chargeoff_within_12_mths',
 'num_actv_bc_tl',
 'mths_since_recent_inq',
 'term',
 'mort_acc',
 'annual_inc',
 'percent_bc_gt_75',
 'issue_d',
 'num_sats',
 'num_tl_120dpd_2m',
 'num_tl_op_past_12m',
 'tax_liens',
 'verification_status',
 'mths_since_last_delinq',
 'collections_12_mths_ex_med']

In [35]:
X_train = X_train[feature_names].copy()

In [36]:
X_train['acc_now_delinq'] = X_train['acc_now_delinq'].astype(str)
X_train['chargeoff_within_12_mths'] = X_train['chargeoff_within_12_mths'].astype(str)
X_train['collections_12_mths_ex_med'] = X_train['collections_12_mths_ex_med'].astype(str)
X_train['num_tl_120dpd_2m'] = X_train['num_tl_120dpd_2m'].astype(str)
X_train['num_tl_30dpd'] = X_train['num_tl_30dpd'].astype(str)
X_train['num_tl_90g_dpd_24m'] = X_train['num_tl_90g_dpd_24m'].astype(str)


In [37]:

X = X_train.copy()
Y = Y_train['loan_status']
X[cat_cols] = X[cat_cols].fillna('nan')
cat_cols_idx = [feature_names.index(c) for c in cat_cols]
xy = Pool(X,Y,cat_features=cat_cols_idx)

# Hyperopt

In [43]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

space = {
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'depth': hp.quniform('depth', 4, 10, 1),
    'iterations': 2000,
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, 5),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 4),
    'random_strength': hp.uniform('random_strength', 0.2, 5),
    'eval_metric' : 'AUC',
    'loss_function' : "Logloss",
}

def objective(space):
    scores = cv(
        xy,
        space,
        iterations = 200,
        fold_count=5, 
        plot=False,
        early_stopping_rounds=100,
        logging_level = 'Silent',
    )
    return 1-scores['test-AUC-mean'].max()

In [44]:
best = fmin(fn=objective,
             space=space,
             algo=tpe.suggest,
             max_evals=100,
             trials=Trials(),
             verbose=1)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

100%|██████████| 100/100 [22:10:59<00:00, 798.59s/trial, best loss: 0.27073911180882604]  


In [45]:
best

{'bagging_temperature': 2.4810650362556896,
 'depth': 9.0,
 'l2_leaf_reg': 4.114321503063239,
 'learning_rate': 0.1438566567932682,
 'random_strength': 0.8967869636206097}

# Best

In [38]:

#best = {'bagging_temperature': 2.4810650362556896,
# 'depth': 9.0,
# 'l2_leaf_reg': 4.114321503063239,
# 'learning_rate': 0.1438566567932682,
#'random_strength': 0.8967869636206097}

best = {'bagging_temperature': 2.4810650362556896,
 'depth': 9.0,
 'l2_leaf_reg': 4.114321503063239,
 'learning_rate': 0.1438566567932682,
 'random_strength': 0.8967869636206097}

params = {    
    'eval_metric': 'AUC',
    'custom_metric': 'Accuracy',
    "loss_function": "Logloss",
    'logging_level': 'Silent',
    'use_best_model': True,
    'iterations': 2000,
}
params.update(best)

In [39]:
scores = cv(xy,
            params,
            fold_count=5, 
            verbose=50,
            plot="True",
            early_stopping_rounds=100)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [40]:
scores.loc[scores.shape[0]-101]

iterations             581.000000
test-AUC-mean            0.730271
test-AUC-std             0.001003
test-Logloss-mean        0.437508
test-Logloss-std         0.000468
train-Logloss-mean       0.414410
train-Logloss-std        0.001495
test-Accuracy-mean       0.810475
test-Accuracy-std        0.000508
train-Accuracy-mean      0.822407
train-Accuracy-std       0.000751
Name: 581, dtype: float64

In [41]:
final_params = {'iterations': scores.shape[0]-101}
final_params.update(best)

cb_model = CatBoostClassifier(**final_params, verbose=False)

In [42]:
cb_model.fit(xy)

<catboost.core.CatBoostClassifier at 0x7f875b179c70>

# Predict

In [43]:
X_test = pd.read_csv("X_test.csv",sep=',')

In [44]:
X_test.shape

(590977, 45)

In [45]:
X_test.head()

Unnamed: 0,index,acc_now_delinq,addr_state,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,earliest_cr_line,emp_length,fico_range_high,fico_range_low,funded_amnt,home_ownership,inq_last_12m,installment,int_rate,issue_d,loan_amnt,mort_acc,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_rev_accts,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,open_il_24m,open_rv_24m,percent_bc_gt_75,pub_rec,pub_rec_bankruptcies,purpose,revol_util,tax_liens,term,title,total_acc,verification_status,zip_code
0,0,0.0,GA,46209.0,0.0,0.0,0.0,34.98,Apr-2009,8 years,669.0,665.0,15700.0,MORTGAGE,6.0,437.65,22.45%,Jan-2016,15700.0,3.0,48.0,48.0,2.0,1.0,4.0,9.0,11.0,0.0,0.0,0.0,4.0,11.0,3.0,3.0,75.0,0.0,0.0,debt_consolidation,61.7%,0.0,60 months,Debt consolidation,20.0,Verified,302xx
1,1,0.0,PA,43500.0,0.0,0.0,0.0,20.25,Jun-1982,5 years,704.0,700.0,7200.0,OWN,,166.79,13.80%,Apr-2011,7200.0,,,,,,,,,,,,,14.0,,,,0.0,0.0,debt_consolidation,58.6%,0.0,60 months,POFF,32.0,Not Verified,151xx
2,2,0.0,OH,72000.0,0.0,0.0,0.0,15.98,Nov-1997,3 years,699.0,695.0,24000.0,MORTGAGE,,847.21,16.29%,Nov-2012,24000.0,2.0,63.0,63.0,2.0,5.0,3.0,23.0,18.0,0.0,0.0,0.0,1.0,18.0,,,40.0,0.0,0.0,credit_card,46.2%,0.0,36 months,credit crd,36.0,Verified,440xx
3,3,0.0,LA,120000.0,0.0,0.0,0.0,24.93,Jul-2001,10+ years,784.0,780.0,7800.0,MORTGAGE,1.0,234.9,5.32%,Jun-2017,7800.0,2.0,68.0,,2.0,0.0,9.0,16.0,21.0,0.0,0.0,0.0,5.0,21.0,2.0,6.0,0.0,0.0,0.0,other,3.7%,0.0,36 months,Other,31.0,Not Verified,710xx
4,4,0.0,IN,55000.0,0.0,0.0,1.0,23.22,Sep-1990,10+ years,689.0,685.0,10500.0,MORTGAGE,4.0,331.9,8.59%,Sep-2016,10500.0,7.0,21.0,,1.0,0.0,4.0,9.0,15.0,0.0,0.0,0.0,1.0,15.0,3.0,0.0,50.0,0.0,0.0,credit_card,68.1%,0.0,36 months,Credit card refinancing,28.0,Not Verified,465xx


In [46]:
X_test.set_index('index',inplace = True)

In [47]:
#months_since_last_delinquent
mean = X_test['mths_since_last_delinq'].mean()
X_test['mths_since_last_delinq'].fillna(value=mean,inplace=True)
#X_train['emp_length'].unique()
#X_train.info()
X_test['earliest_cr_line'] = pd.to_datetime(X_test['earliest_cr_line'],format = '%b-%Y')
X_test['earliest_cr_line'].sample(n=10)
X_test['issue_d'] = pd.to_datetime(X_test['issue_d'],format = '%b-%Y')
X_test['issue_d'].sample(n=10)
X_test['int_rate'] = X_test['int_rate'].str.rstrip("%").astype(float)/100
X_test['revol_util'] = X_test['revol_util'].str.rstrip("%").astype(float)/100
X_test['revol_util'].sample(n=10)
#inc_last_12_months
median = X_test['inq_last_12m'].median()
#median
#X_train['inq_last_12m'].fillna(value=mean,inplace=True)
X_test['inq_last_12m'].fillna(value=median,inplace=True)
#Morgage_accounts
median = X_test['mort_acc'].median()
X_test['mort_acc'].fillna(value=median,inplace=True)
#Months_since_recent_bancrupt_dlq
mean = X_test['mths_since_recent_bc_dlq'].mean()

X_test['mths_since_recent_bc_dlq'].fillna(value=mean,inplace=True)
#mths_since_recent_inq
mean = X_test['mths_since_recent_inq'].mean()
X_test['mths_since_recent_inq'].fillna(value=mean,inplace=True)
#num_accts_ever_120_pd
median = X_test['num_accts_ever_120_pd'].median()
X_test['num_accts_ever_120_pd'].fillna(value=median,inplace=True)
#num_actv_bc_tl
mean = X_test['num_actv_bc_tl'].mean()
X_test['num_actv_bc_tl'].fillna(value=mean,inplace=True)
mean = X_test['num_rev_accts'].mean()
X_test['num_rev_accts'].fillna(value=mean,inplace=True)
mean = X_test['num_sats'].mean()
X_test['num_sats'].fillna(value=mean,inplace=True)
mean = X_test['num_tl_op_past_12m'].mean()

X_test['num_tl_op_past_12m'].fillna(value=mean,inplace=True)
median = X_test['open_il_24m'].median()
X_test['open_il_24m'].fillna(value=median,inplace=True)
mean = X_test['open_rv_24m'].mean()

X_test['open_rv_24m'].fillna(value=mean,inplace=True)

In [53]:
X_test.isna().sum()

acc_now_delinq                    0
addr_state                        0
annual_inc                        0
chargeoff_within_12_mths         19
collections_12_mths_ex_med       19
delinq_2yrs                       0
dti                               0
earliest_cr_line                  0
emp_length                    35630
fico_range_high                   0
fico_range_low                    0
funded_amnt                       0
home_ownership                    0
inq_last_12m                      0
installment                       0
int_rate                          0
issue_d                           0
loan_amnt                         0
mort_acc                          0
mths_since_last_delinq            0
mths_since_recent_bc_dlq          0
mths_since_recent_inq             0
num_accts_ever_120_pd             0
num_actv_bc_tl                    0
num_rev_accts                     0
num_sats                          0
num_tl_120dpd_2m              45008
num_tl_30dpd                

In [49]:
mean = X_test['percent_bc_gt_75'].mean()
X_test['percent_bc_gt_75'].fillna(value=mean,inplace=True)

In [50]:
mean = X_test['pub_rec_bankruptcies'].mean()
X_test['pub_rec_bankruptcies'].fillna(value=mean,inplace=True)

In [51]:
mean = X_test['revol_util'].mean()
X_test['revol_util'].fillna(value=mean,inplace=True)

In [52]:
mean = X_test['tax_liens'].mean()
X_test['tax_liens'].fillna(value=mean,inplace=True)

In [78]:
#drop_na_cols = ['percent_bc_gt_75','pub_rec_bankruptcies','revol_util','tax_liens','zip_code']
#X_test.dropna(subset = drop_na_cols,inplace=True)

In [54]:
X_test = X_test[feature_names].copy() 

In [55]:
X_test['acc_now_delinq'] = X_test['acc_now_delinq'].astype(str)
X_test['chargeoff_within_12_mths'] = X_test['chargeoff_within_12_mths'].astype(str)
X_test['collections_12_mths_ex_med'] = X_test['collections_12_mths_ex_med'].astype(str)
X_test['num_tl_120dpd_2m'] = X_test['num_tl_120dpd_2m'].astype(str)
X_test['num_tl_30dpd'] = X_test['num_tl_30dpd'].astype(str)
X_test['num_tl_90g_dpd_24m'] = X_test['num_tl_90g_dpd_24m'].astype(str)

In [56]:
X = X_test.copy()
X[cat_cols] = X[cat_cols].fillna('nan')
cat_cols_idx = [feature_names.index(c) for c in cat_cols]
xx = Pool(X,cat_features=cat_cols_idx)

In [57]:
#new pool = xx
#cb_model = ...
y_test = cb_model.predict_proba(xx)[:,1]

In [58]:
y_test

array([0.60362298, 0.20712263, 0.13890234, ..., 0.2244198 , 0.05002272,
       0.21097105])

In [59]:
answer = pd.DataFrame(data = y_test,columns = ['loan_status'])

In [60]:
answer.reset_index(inplace=True)

In [61]:
answer.head()

Unnamed: 0,index,loan_status
0,0,0.603623
1,1,0.207123
2,2,0.138902
3,3,0.047742
4,4,0.08891


In [62]:
import os

In [63]:
os.getcwd()

'/home/man_with_axe/Tonya'

In [117]:
answer.to_csv(r'/home/man_with_axe/Tonya/answer.csv',index=False, sep=",")

In [119]:
y_answer.head()

Unnamed: 0,index,loan_status
0,0,1
1,1,0
2,2,0
3,3,0
4,4,0
