# Scienaptic Data Science Assignment II - Models

Author: Sharmeen Malik

Date: May 18, 2020

In [1]:
#import modules
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
import collections, numpy
import time

from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

pd.options.display.max_columns=999
pd.options.display.max_rows=3000

data_file = pd.read_excel('Collections Case Study Data.xlsx', sheet_name=None)

  import pandas.util.testing as tm


## Data

In [2]:
#assign data to new dataframe and display keys
df = data_file
df.keys()

dict_keys(['Loan Details', 'Loan_Status_MartoMay', 'Loan_Status_AprtoJun', 'Historical 6 Months Details', 'Loan_ID mapping', 'Call Details'])

In [3]:
#creates stratums for data based on month
mar_data = df['Loan_Status_MartoMay'].loc[df['Loan_Status_MartoMay']['Month'] == 'March']
apr_data = df['Loan_Status_MartoMay'].loc[df['Loan_Status_MartoMay']['Month'] == 'April']
may_data = df['Loan_Status_MartoMay'].loc[df['Loan_Status_MartoMay']['Month'] == 'May']
marapr_data = df['Loan_Status_AprtoJun'].loc[df['Loan_Status_AprtoJun']['Month'] == 'April']
aprmay_data = df['Loan_Status_AprtoJun'].loc[df['Loan_Status_AprtoJun']['Month'] == 'May']
mayjun_data = df['Loan_Status_AprtoJun'].loc[df['Loan_Status_AprtoJun']['Month'] == 'June']

In [4]:
#QC checking to see if our assumption is correct
print(len(mar_data) == len(marapr_data))
print(len(apr_data) == len(aprmay_data))
print(len(may_data) == len(mayjun_data))

True
True
True


In [5]:
#combines account information for each month with performance in the following month
mar_data1 = pd.merge(mar_data, marapr_data, how='left', on='Loan_id', suffixes=('_mar', '_marapr'))
apr_data1 = pd.merge(apr_data, aprmay_data, how='left', on='Loan_id', suffixes=('_apr', '_aprmay'))
may_data1 = pd.merge(may_data, mayjun_data, how='left', on='Loan_id', suffixes=('_may', '_mayjun'))

In [6]:
#merges monthly data into one dataframe
data1 = pd.merge(mar_data1, apr_data1, how='outer', on='Loan_id')
data1 = pd.merge(data1, may_data1, how='outer', on='Loan_id')

In [7]:
#checks length of dataframes
print(mar_data1.shape)
print(apr_data1.shape)
print(may_data1.shape)
print(data1.shape)

(24575, 13)
(27079, 13)
(29115, 13)
(40339, 37)


In [8]:
#displays column names of dataframe
data1.columns

Index(['Loan_id', 'Bucket_mar', 'TENURE_mar', 'PRINBALANCE_mar',
       'Months on Books_mar', 'Sanctioned Amount_mar', 'Month_mar',
       'Bucket_marapr', 'PRINBALANCE_marapr', 'TENURE_marapr',
       'Months on Books_marapr', 'Sanctioned Amount_marapr', 'Month_marapr',
       'Bucket_apr', 'TENURE_apr', 'PRINBALANCE_apr', 'Months on Books_apr',
       'Sanctioned Amount_apr', 'Month_apr', 'Bucket_aprmay',
       'PRINBALANCE_aprmay', 'TENURE_aprmay', 'Months on Books_aprmay',
       'Sanctioned Amount_aprmay', 'Month_aprmay', 'Bucket_may', 'TENURE_may',
       'PRINBALANCE_may', 'Months on Books_may', 'Sanctioned Amount_may',
       'Month_may', 'Bucket_mayjun', 'PRINBALANCE_mayjun', 'TENURE_mayjun',
       'Months on Books_mayjun', 'Sanctioned Amount_mayjun', 'Month_mayjun'],
      dtype='object')

### Roll Forward Data

This section calculates the roll forward for each account in order to create a target variable for our model.

In [9]:
#drop columns and assign to new dataframe in order to calculate roll forwards
data2 = data1.drop(columns=['Month_mar', 'TENURE_marapr', 'Sanctioned Amount_marapr', 
                            'Month_marapr', 'TENURE_apr', 'Sanctioned Amount_apr','Month_apr', 
                            'TENURE_aprmay', 'Sanctioned Amount_aprmay', 'Month_aprmay', 
                            'TENURE_may', 'Sanctioned Amount_may', 'Month_may', 'TENURE_mayjun', 
                            'Sanctioned Amount_mayjun', 'Month_mayjun'])

data2.describe()

Unnamed: 0,TENURE_mar,PRINBALANCE_mar,Months on Books_mar,Sanctioned Amount_mar,PRINBALANCE_marapr,Months on Books_marapr,PRINBALANCE_apr,Months on Books_apr,PRINBALANCE_aprmay,Months on Books_aprmay,PRINBALANCE_may,Months on Books_may,PRINBALANCE_mayjun,Months on Books_mayjun
count,24575.0,24575.0,24575.0,24575.0,24575.0,24575.0,27079.0,27079.0,27079.0,27079.0,29115.0,29115.0,29115.0,29115.0
mean,41.832838,33681720.0,15.600895,46035590.0,31833830.0,16.600895,33400310.0,15.773847,31577360.0,16.773847,34153420.0,15.400996,32315230.0,16.400996
std,7.599259,23970620.0,10.692061,25862280.0,23673010.0,10.692061,23640950.0,10.643053,23352230.0,10.643053,24974250.0,10.549727,24676540.0,10.549727
min,12.0,409000.0,0.0,10000000.0,200.0,1.0,74700.0,0.0,1000.0,1.0,237200.0,0.0,200.0,1.0
25%,36.0,17626500.0,6.0,30000000.0,15974120.0,7.0,17572700.0,7.0,15927100.0,8.0,18076750.0,6.0,16411800.0,7.0
50%,48.0,29068600.0,14.0,40000000.0,27577800.0,15.0,28931200.0,15.0,27401400.0,16.0,29406600.0,14.0,27983200.0,15.0
75%,48.0,44066200.0,23.0,55000000.0,42303000.0,24.0,43709550.0,23.0,42053150.0,24.0,44342250.0,22.0,42680200.0,23.0
max,239.0,375380200.0,74.0,420000000.0,373265800.0,75.0,374331400.0,78.0,372183100.0,79.0,1137036000.0,84.0,1131137000.0,85.0


In [10]:
data2.head()

Unnamed: 0,Loan_id,Bucket_mar,TENURE_mar,PRINBALANCE_mar,Months on Books_mar,Sanctioned Amount_mar,Bucket_marapr,PRINBALANCE_marapr,Months on Books_marapr,Bucket_apr,PRINBALANCE_apr,Months on Books_apr,Bucket_aprmay,PRINBALANCE_aprmay,Months on Books_aprmay,Bucket_may,PRINBALANCE_may,Months on Books_may,Bucket_mayjun,PRINBALANCE_mayjun,Months on Books_mayjun
0,A205346000,TB0,36.0,3693500.0,33.0,22000000.0,TB1,1332100.0,34.0,,,,,,,,,,,,
1,A675851816,TB0,48.0,24353700.0,30.0,40000000.0,TB1,20875500.0,31.0,,,,,,,,,,,,
2,A260852240,TB0,48.0,42514900.0,24.0,60000000.0,TB1,37947922.5,25.0,,,,,,,,,,,,
3,A374402177,TB0,36.0,12290300.0,19.0,20000000.0,TB0,11214900.0,20.0,TB0,11214900.0,20.0,TB0,10639500.0,21.0,,,,,,
4,A211889361,TB0,36.0,9396700.0,25.0,22000000.0,TB0,8666200.0,26.0,TB0,8666200.0,26.0,TB1,8666200.0,27.0,,,,,,


**Notes:**

In order to calculate roll forward for accounts. The status at the beginning of each month was assessed, with account status of any thing other than 'REGULAR' considered bad and therefore assigned a one.

June data for accounts (referenced as 'mayjun' was also assessed to estimate an ending status of accounts.

In [11]:
#calculates roll forward for each month
data2['RF1'] = np.where(data2['Bucket_mar']=='TB0', 1, 0)
data2['RF2'] = np.where((data2['Bucket_apr']=='TB0')|(data2['Bucket_apr']=='TB1'), 1, 0)
data2['RF3'] = np.where((data2['Bucket_may']=='TB0')|(data2['Bucket_may']=='TB1'), 1, 0)
data2['RF4'] = np.where((data2['Bucket_mayjun']=='TB0')|(data2['Bucket_mayjun']=='TB1'), 1, 0)

In [12]:
#displays value counts for each bucket

cols = ['RF1', 'RF2', 'RF3', 'RF4']

for c in cols:
    print(c)
    print(data2[c].value_counts().sort_index())
    print('\n')

RF1
0    15764
1    24575
Name: RF1, dtype: int64


RF2
0    13260
1    27079
Name: RF2, dtype: int64


RF3
0    11224
1    29115
Name: RF3, dtype: int64


RF4
0    38309
1     2030
Name: RF4, dtype: int64




### Target Variable 

In [13]:
#displays current information for the dataframe
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40339 entries, 0 to 40338
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Loan_id                 40339 non-null  object 
 1   Bucket_mar              24575 non-null  object 
 2   TENURE_mar              24575 non-null  float64
 3   PRINBALANCE_mar         24575 non-null  float64
 4   Months on Books_mar     24575 non-null  float64
 5   Sanctioned Amount_mar   24575 non-null  float64
 6   Bucket_marapr           24575 non-null  object 
 7   PRINBALANCE_marapr      24575 non-null  float64
 8   Months on Books_marapr  24575 non-null  float64
 9   Bucket_apr              27079 non-null  object 
 10  PRINBALANCE_apr         27079 non-null  float64
 11  Months on Books_apr     27079 non-null  float64
 12  Bucket_aprmay           27079 non-null  object 
 13  PRINBALANCE_aprmay      27079 non-null  float64
 14  Months on Books_aprmay  27079 non-null

There are substantial null values which can be dropped as we are only interested in the roll forward columns for our target variable.

In [14]:
#drops columns not needed for calculating the target variable
target_var = data2.drop(columns=['Bucket_mar', 'TENURE_mar', 'PRINBALANCE_mar',
                               'Months on Books_mar', 'Sanctioned Amount_mar', 'Bucket_marapr',
                               'PRINBALANCE_marapr', 'Months on Books_marapr', 'Bucket_apr',
                               'PRINBALANCE_apr', 'Months on Books_apr', 'Bucket_aprmay',
                               'PRINBALANCE_aprmay', 'Months on Books_aprmay', 'Bucket_may',
                               'PRINBALANCE_may', 'Months on Books_may', 'Bucket_mayjun',
                               'PRINBALANCE_mayjun', 'Months on Books_mayjun'])

In [15]:
#creates a target variable by summing roll forwards
#threshold has been set at 2 for improved model performance
target_var['RF_sum'] = target_var['RF1']+ target_var['RF2']+ target_var['RF3']+ target_var['RF4']
target_var['target'] = target_var['RF_sum'].apply(lambda x: 1 if x >2 else 0)

target_var.head()

Unnamed: 0,Loan_id,RF1,RF2,RF3,RF4,RF_sum,target
0,A205346000,1,0,0,0,1,0
1,A675851816,1,0,0,0,1,0
2,A260852240,1,0,0,0,1,0
3,A374402177,1,1,0,0,2,0
4,A211889361,1,1,0,0,2,0


In [16]:
#cleaning up the dataframe further
target = target_var[['Loan_id', 'target']]
target.head()

Unnamed: 0,Loan_id,target
0,A205346000,0
1,A675851816,0
2,A260852240,0
3,A374402177,0
4,A211889361,0


In [17]:
#displays value counts for target variable
print(target.shape)
print('\n')
print(target['target'].value_counts())

(40339, 2)


0    24943
1    15396
Name: target, dtype: int64


There is a class imbalance will need to assess whether to under/over sample in order to refine model performance.

### TB0 Data

In this section we create a dataframe which provides information for each account based on the first instance of the account becoming TB0.

In [18]:
#creates dataframe and adds term completed as defined by 'month on books/tenure'
data_TB0 = pd.DataFrame(df['Loan_Status_MartoMay'])
data_TB0 = data_TB0.sort_values('Month',ascending=True)
data_TB0 = data_TB0.drop_duplicates(subset ="Loan_id", keep = 'first', inplace = False) 

data_TB0['term_completed'] = data_TB0['Months on Books'] / data_TB0['TENURE']

data_TB0 = data_TB0.drop(columns=['TENURE', 'Sanctioned Amount','Month', 'Bucket'])
data_TB0.info() 
                               

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40339 entries, 0 to 59792
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Loan_id          40339 non-null  object 
 1   PRINBALANCE      40339 non-null  float64
 2   Months on Books  40339 non-null  int64  
 3   term_completed   40339 non-null  float64
dtypes: float64(2), int64(1), object(1)
memory usage: 1.5+ MB


### Historical Data

In this section we create a dataframe which provides historical data for each account based on the first instance of the account becoming TB0.

In [19]:
#creates dataframe for 6 month historical data
data_6m = pd.DataFrame(df['Historical 6 Months Details'])
data_6m = data_6m.sort_values('Month',ascending=True)
data_6m = data_6m.drop_duplicates(subset ="Loan_id", keep = 'first', inplace = False) 
data_6m = data_6m.drop(columns=['rollf', 'Month'])
data_6m.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40339 entries, 0 to 59792
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Loan_id           40339 non-null  object
 1   paidcure          40339 non-null  int64 
 2   paiduncure        40339 non-null  int64 
 3   unpaid            40339 non-null  int64 
 4   rollb             40339 non-null  int64 
 5   num6mosdel        40339 non-null  int64 
 6   num3mosdel        40339 non-null  int64 
 7   num6mosdel_2plus  40339 non-null  int64 
 8   num3mosdel_2plus  40339 non-null  int64 
 9   max6del           40339 non-null  int64 
 10  max3del           40339 non-null  int64 
dtypes: int64(10), object(1)
memory usage: 3.7+ MB


### Call Data

In this section we create a dataframe which provides call center data for each account based on the first instance of the account becoming TB0.

In [20]:
#assigns 'Call Detail' table to dataframe
call_data = df['Call Details']
call_data.describe()

Unnamed: 0,month,Right_Party_Contact,Promise_to_pay,total_contacts
count,79614.0,79614.0,79614.0,79614.0
mean,4.040194,3.213895,1.119175,10.995001
std,0.813121,6.639031,1.489993,21.288761
min,3.0,0.0,0.0,1.0
25%,3.0,1.0,0.0,2.0
50%,4.0,2.0,1.0,4.0
75%,5.0,4.0,2.0,11.0
max,5.0,651.0,25.0,659.0


In [21]:
#groups call data by id and then aggregates
month_table = call_data.groupby(['Application_Id']).aggregate({'month':'nunique', 
                                                               'Right_Party_Contact':'sum', 
                                                               'Promise_to_pay':'sum', 
                                                               'total_contacts':'sum'})

#calculates Right Party Contacts (RPC) rate and Percentage of outbound calls resulting in Promise to Pay (PTP)
# month_table['avg_intensity'] = round(month_table['total_contacts'] / month_table['month'], 3)
month_table['RPC_rate'] = round(month_table['Right_Party_Contact'] / month_table['total_contacts'], 3)
month_table['PTP_rate'] = round(month_table['Promise_to_pay'] / month_table['total_contacts'], 3)
month_table.rename(columns={'month':'month_count'}, inplace=True)



In [22]:
#maps Application_id to Loan_ID and displays information
call_data = pd.merge(df['Loan_ID mapping'], month_table, how='outer', left_on='Application_id', right_on='Application_Id')

call_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40339 entries, 0 to 40338
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Loanid               40339 non-null  object 
 1   Application_id       40339 non-null  object 
 2   month_count          38051 non-null  float64
 3   Right_Party_Contact  38051 non-null  float64
 4   Promise_to_pay       38051 non-null  float64
 5   total_contacts       38051 non-null  float64
 6   RPC_rate             38051 non-null  float64
 7   PTP_rate             38051 non-null  float64
dtypes: float64(6), object(2)
memory usage: 2.8+ MB


There are null values in this dataframe.  We will handle these by substituting a 0 for the nulls as they represent no activity for the variables.

In [23]:
call_data.fillna(0, inplace=True)
call_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40339 entries, 0 to 40338
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Loanid               40339 non-null  object 
 1   Application_id       40339 non-null  object 
 2   month_count          40339 non-null  float64
 3   Right_Party_Contact  40339 non-null  float64
 4   Promise_to_pay       40339 non-null  float64
 5   total_contacts       40339 non-null  float64
 6   RPC_rate             40339 non-null  float64
 7   PTP_rate             40339 non-null  float64
dtypes: float64(6), object(2)
memory usage: 2.8+ MB


In [24]:
#rename column for consistency and display descriptives
call_data.rename(columns = {'Loanid': 'Loan_id'}, inplace = True)
call_data.describe()

Unnamed: 0,month_count,Right_Party_Contact,Promise_to_pay,total_contacts,RPC_rate,PTP_rate
count,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0
mean,1.857384,6.343018,2.20883,21.699993,0.440444,0.19743
std,0.944077,10.807385,2.813729,36.83255,0.322395,0.254779
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,3.0,0.188,0.0
50%,2.0,3.0,1.0,8.0,0.387,0.11
75%,3.0,8.0,3.0,24.0,0.667,0.286
max,3.0,651.0,43.0,662.0,1.0,1.0


### Features Combined

In [25]:
#creates one dataframe by combining all the dataframes from above
#displays info
dataframe = pd.merge(data_TB0, df['Loan Details'], how='left', on='Loan_id')
dataframe = pd.merge(dataframe, data_6m, how='left', on='Loan_id')
dataframe = pd.merge(dataframe, call_data, how='left', on='Loan_id')
dataframe = pd.merge(dataframe, target, how='left', on='Loan_id')
dataframe = dataframe.drop(columns=[ 'Loan_id','Application_id','rollb'])

dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40339 entries, 0 to 40338
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PRINBALANCE           40339 non-null  float64
 1   Months on Books       40339 non-null  int64  
 2   term_completed        40339 non-null  float64
 3   Debt_to_burden_Ratio  40339 non-null  float64
 4   total_income          40339 non-null  float64
 5   TENURE                40339 non-null  int64  
 6   Sanctioned Amount     40339 non-null  int64  
 7   paidcure              40339 non-null  int64  
 8   paiduncure            40339 non-null  int64  
 9   unpaid                40339 non-null  int64  
 10  num6mosdel            40339 non-null  int64  
 11  num3mosdel            40339 non-null  int64  
 12  num6mosdel_2plus      40339 non-null  int64  
 13  num3mosdel_2plus      40339 non-null  int64  
 14  max6del               40339 non-null  int64  
 15  max3del            

No missing values, so we continue.

In [26]:
dataframe.describe()

Unnamed: 0,PRINBALANCE,Months on Books,term_completed,Debt_to_burden_Ratio,total_income,TENURE,Sanctioned Amount,paidcure,paiduncure,unpaid,num6mosdel,num3mosdel,num6mosdel_2plus,num3mosdel_2plus,max6del,max3del,month_count,Right_Party_Contact,Promise_to_pay,total_contacts,RPC_rate,PTP_rate,target
count,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0,40339.0
mean,34345020.0,14.904063,0.36459,0.288089,7975965.0,41.596445,46167520.0,5.157242,0.031334,0.080889,0.182652,0.128114,0.011379,0.005974,0.109993,0.099011,1.857384,6.343018,2.20883,21.699993,0.440444,0.19743,0.381665
std,25097950.0,10.798653,0.260053,0.158125,30651980.0,7.859246,26827440.0,1.559468,0.187517,0.287451,0.703402,0.446723,0.183737,0.098775,0.342466,0.320771,0.944077,10.807385,2.813729,36.83255,0.322395,0.254779,0.485801
min,74700.0,0.0,0.0,0.000192,800000.0,12.0,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18043050.0,6.0,0.138889,0.227182,5715942.0,36.0,30000000.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,0.188,0.0,0.0
50%,29541600.0,13.0,0.3125,0.261264,7000000.0,48.0,40000000.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.0,8.0,0.387,0.11,0.0
75%,44742550.0,22.0,0.555556,0.327515,8514182.0,48.0,55000000.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,3.0,24.0,0.667,0.286,1.0
max,1137036000.0,84.0,1.0,11.1112,6100000000.0,240.0,1300000000.0,6.0,3.0,3.0,6.0,3.0,6.0,3.0,5.0,5.0,3.0,651.0,43.0,662.0,1.0,1.0,1.0


In [27]:
#calculates correlation of feature against target variable
dataframe.corr()['target'].sort_values()

max3del                -0.121919
max6del                -0.110035
num3mosdel             -0.095892
unpaid                 -0.088830
num6mosdel             -0.065881
PTP_rate               -0.056919
RPC_rate               -0.056415
PRINBALANCE            -0.047067
num3mosdel_2plus       -0.033571
num6mosdel_2plus       -0.027547
paiduncure             -0.023519
Debt_to_burden_Ratio   -0.020137
Sanctioned Amount      -0.008450
total_income           -0.006179
TENURE                  0.048770
term_completed          0.106211
Months on Books         0.119218
paidcure                0.193460
total_contacts          0.294263
Right_Party_Contact     0.304885
Promise_to_pay          0.420683
month_count             0.809482
target                  1.000000
Name: target, dtype: float64

In [28]:
#sorts absolute values of correlation coefficient
abs(dataframe.corr()['target']).sort_values(ascending=False)

target                  1.000000
month_count             0.809482
Promise_to_pay          0.420683
Right_Party_Contact     0.304885
total_contacts          0.294263
paidcure                0.193460
max3del                 0.121919
Months on Books         0.119218
max6del                 0.110035
term_completed          0.106211
num3mosdel              0.095892
unpaid                  0.088830
num6mosdel              0.065881
PTP_rate                0.056919
RPC_rate                0.056415
TENURE                  0.048770
PRINBALANCE             0.047067
num3mosdel_2plus        0.033571
num6mosdel_2plus        0.027547
paiduncure              0.023519
Debt_to_burden_Ratio    0.020137
Sanctioned Amount       0.008450
total_income            0.006179
Name: target, dtype: float64

Features with correlation greater than 0.20 are 'total_contacts', 'Right_Party_Contact', 'Promise_to_pay'.  However, this should be taken with the caveat that only loans that have gone bad would have positive increasing values for this field. These fields were extracted from the call center data.

This leads me to the point that having historical call center data for accounts to tie out with historical payment history would be more useful for analysis.

## Models

### XGB Classifier

In [29]:
#assigns data to variable and splits
X= dataframe.drop(columns=['target'])
y= dataframe['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
#prints shape of datasets and displays counts of target variable in train and test data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(collections.Counter(y_train))
print(collections.Counter(y_test))

(32271, 22)
(32271,)
(8068, 22)
(8068,)
Counter({0: 19986, 1: 12285})
Counter({0: 4957, 1: 3111})


In [31]:
#instantiate, train and predict XGB classifier model

xgb = XGBClassifier(n_estimators=100)

training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()

prediction_start = time.perf_counter()
XGB_preds = xgb.predict(X_test)
prediction_end = time.perf_counter()

acc_xgb = (XGB_preds == y_test).sum().astype(float) / len(XGB_preds)*100

xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start

XGB_Accuracy = accuracy_score(y_test, XGB_preds)
XGB_AUC = roc_auc_score(y_test, XGB_preds)

print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb)+"%" )
print("Time consumed for training:       %4.3f" % (xgb_train_time))
print("Time consumed for prediction:     %6.5f seconds" % (xgb_prediction_time)) 
print("AUC:                              " + str(XGB_AUC))

XGBoost's prediction accuracy is: 94.42%
Time consumed for training:       1.343
Time consumed for prediction:     0.00997 seconds
AUC:                              0.9304292064438193


In [32]:
cf1 = confusion_matrix(y_test, XGB_preds)
cf1

array([[4911,   46],
       [ 404, 2707]])

The scores look pretty good.  Since we are trying to assess credit risk and the improper classification of a risky account could have an adverse affect on our bottom line and other business consideration, we are concerned with how the model handles the false negative. Ideally, we would like to minimize this value.

In [33]:
Accuracy1 = (cf1[0,0] + cf1[1,1]) / (cf1[0,0] + cf1[1,1] + cf1[0,1] + cf1[1,0]) * 100
Recall1 = (cf1[0,0]) / (cf1[0,0] + cf1[1,0]) * 100
Precision1 = (cf1[0,0]) / (cf1[0,0] + cf1[0,1]) * 100
fn1 = cf1[1,0]
print("XGBoost's Accuracy is:  %3.2f" % (Accuracy1) + "%")
print("XGBoost's Recall is:    %3.2f" % (Recall1) + "%")
print("XGBoost's Precision is: %3.2f" % (Precision1) + "%")
print("False Negative:        ", fn1)

XGBoost's Accuracy is:  94.42%
XGBoost's Recall is:    92.40%
XGBoost's Precision is: 99.07%
False Negative:         404


### Decision Tree Classifier

In [34]:
#instantiate, train and predict Decision Tree classifier model

DT_Model = DecisionTreeClassifier()

training_start = time.perf_counter()
DT_Model.fit(X_train, y_train)
training_end = time.perf_counter()

prediction_start = time.perf_counter()
DT_Predict = DT_Model.predict(X_test)
prediction_end = time.perf_counter()

DT_train_time = training_end-training_start
DT_prediction_time = prediction_end-prediction_start

DT_Accuracy = accuracy_score(y_test, DT_Predict)
DT_AUC = roc_auc_score(y_test, DT_Predict) 

print("DT's prediction accuracy is:  %3.2f" % (DT_Accuracy))
print("Time consumed for training:   %4.3f" % (DT_train_time))
print("Time consumed for prediction: %6.5f seconds" % (DT_prediction_time)) 
print("AUC:                          " + str(DT_AUC))

DT's prediction accuracy is:  0.91
Time consumed for training:   0.224
Time consumed for prediction: 0.00214 seconds
AUC:                          0.9063948348597682


In [35]:
cf2 = confusion_matrix(y_test, DT_Predict)
cf2

array([[4558,  399],
       [ 332, 2779]])

In [36]:
Accuracy2 = (cf2[0,0] + cf2[1,1]) / (cf2[0,0] + cf2[1,1] + cf2[0,1] + cf2[1,0]) * 100
Recall2 = (cf2[0,0]) / (cf2[0,0] + cf2[1,0]) * 100
Precision2 = (cf2[0,0]) / (cf2[0,0] + cf2[0,1]) * 100
fn2 = cf2[1,0]
print("DTC's Accuracy is:  %3.2f" % (Accuracy2) + "%")
print("DTC's Recall is:    %3.2f" % (Recall2) + "%")
print("DTC's Precision is: %3.2f" % (Precision2) + "%")
print("False Negative:    ", fn2)

DTC's Accuracy is:  90.94%
DTC's Recall is:    93.21%
DTC's Precision is: 91.95%
False Negative:     332


### Logistic Regression

In [37]:
logreg = LogisticRegression()

training_start = time.perf_counter()
logreg.fit(X_train, y_train)
training_end = time.perf_counter()

prediction_start = time.perf_counter()
logreg_Predict = logreg.predict(X_test)
prediction_end = time.perf_counter()

logreg_train_time = training_end-training_start
logreg_prediction_time = prediction_end-prediction_start

logreg_Accuracy = accuracy_score(y_test, logreg_Predict)
logreg_AUC = roc_auc_score(y_test, logreg_Predict) 

print("Logreg's prediction accuracy is: %3.2f" % (logreg_Accuracy))
print("Time consumed for training:      %4.3f" % (logreg_train_time))
print("Time consumed for prediction:    %6.5f seconds" % (logreg_prediction_time)) 
print("AUC:                             " + str(logreg_AUC))

Logreg's prediction accuracy is: 0.61
Time consumed for training:      0.061
Time consumed for prediction:    0.00146 seconds
AUC:                             0.4969749164576852


In [38]:
cf3 = confusion_matrix(y_test, logreg_Predict)
cf3

array([[4833,  124],
       [3052,   59]])

In [39]:
Accuracy3 = (cf3[0,0] + cf3[1,1]) / (cf3[0,0] + cf3[1,1] + cf3[0,1] + cf3[1,0]) * 100
Recall3 = (cf3[0,0]) / (cf3[0,0] + cf3[1,0]) * 100
Precision3 = (cf3[0,0]) / (cf3[0,0] + cf3[0,1]) *100
fn3 = cf3[1,0]
print("Logreg's Accuracy is:  %3.2f" % (Accuracy3) + "%")
print("Logreg's Recall is:    %3.2f" % (Recall3) + "%")
print("Logreg's Precision is: %3.2f" % (Precision3) + "%")
print("False Negative:       ", fn3)

Logreg's Accuracy is:  60.63%
Logreg's Recall is:    61.29%
Logreg's Precision is: 97.50%
False Negative:        3052


In [40]:
model_performance= pd.DataFrame({'Model': ['XGBClassifier','DecisionTreeClassifier', 'LogisticRegression'],
                                            'Accuracy': [XGB_Accuracy,DT_Accuracy, logreg_Accuracy],
                                            'AUC': [XGB_AUC, DT_AUC,logreg_AUC],
                                            'Recall': [Recall1, Recall2,Recall3],
                                            'Precision': [Precision1, Precision2,Precision3],
                                            'False Negative': [fn1, fn2, fn3]})
model_performance

Unnamed: 0,Model,Accuracy,AUC,Recall,Precision,False Negative
0,XGBClassifier,0.944224,0.930429,92.398871,99.072019,404
1,DecisionTreeClassifier,0.909395,0.906395,93.210634,91.950777,332
2,LogisticRegression,0.606346,0.496975,61.293595,97.498487,3052


The classifiers are performing better than the logistic regression model.  We can try and fine tune the features and parameters further in order to try and improve the model performance.