# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import Data

In [58]:
# Import Raw Data
credit = pd.read_csv('Data/default of credit card clients.csv', 
                     header = 1)

# Data Ordering
#### See 'Exploration' for backing analyses

In [59]:
# rename varuables :
# that ridiculous last column name to something workable
# PAY_.. etc to Status Month
# BILL_AMT.. to Bill Month
# PAY_AMT.. to Paid Month
credit.rename(columns = {'LIMIT_BAL' : 'Limit',
                         'SEX': 'Sex',
                         'EDUCATION': 'Education',
                         'MARRIAGE': 'Marriage',
                         'AGE': 'Age',
                         'default payment next month': 'Default',
                         'PAY_0': 'StatusSep', 'PAY_2': 'StatusAug', 'PAY_3': 'StatusJul',
                         'PAY_4': 'StatusJun', 'PAY_5': 'StatusMay', 'PAY_6': 'StatusApr',
                         'BILL_AMT1': 'BillSep', 'BILL_AMT2': 'BillAug', 'BILL_AMT3': 'BillJul',
                         'BILL_AMT4': 'BillJun', 'BILL_AMT5': 'BillMay', 'BILL_AMT6': 'BillApr',
                         'PAY_AMT1': 'PaidSep', 'PAY_AMT2': 'PaidAug', 'PAY_AMT3': 'PaidJul',
                         'PAY_AMT4': 'PaidJun', 'PAY_AMT5': 'PaidMay', 'PAY_AMT6': 'PaidApr',
                        },
                  inplace = True)

In [60]:
# Change the 'Education' values
# 0 = NA
# 4, 5, 6 = Other
# Change all above to 4 and intepret as Other/NA
# Assumption : In general, the order (1 > 2 > 3 > 4) will hold
credit['Education'] = credit['Education'].replace([0 , 5, 6], 4)

In [61]:
# Change the 'Marriage' values
# 0 = NA
# 1 = Married
# 2 = Single
# 3 = Other
# Note : both NA and Other groups are small
# Change 3 and 2 to 0, making two groups: Maried (1) or (probably) not (0)
credit['Marriage'].replace([2, 3], 0, inplace = True)

# Additionally : by setting it to a 0/1 value, pyhton reads it as a factor,
# and further dummification is not needed.

# Feature Engineering

In [62]:
%%time
# Create a variable :
# sum of the times that the payment is delayed

# for every row in credit
# 'SumDelay' = the sum of times when payements where delayed 
# (indiscriminatory of the amount of months a payment was delayed)
for i in range(len(credit)):
    credit.loc[i , 'timesDelay'] =  sum(credit.loc[i , 'StatusSep': 'StatusApr'] > 0)
    


Wall time: 1min 12s


In [63]:
%%time
# Create a variable : 
# sum of status > 0 of the last three months
# this gives a bit of additional info about the status of last months
# Note : sum of all statusses is collinear with the 'timesDelay' variable



# get names of relevant columns
columnsStatus = list(credit.loc[: , 'StatusSep': 'StatusJul'].columns) 


# for every observation in credit
    # culculate the sum of (columns in columnsStatus > 0)
sumStatus = []

for i in range(len(credit)):
    
    rowStatus = []
    
    for j in columnsStatus:
        if credit.loc[i , j] > 0:
            rowStatus.append(credit.loc[i , j])
    
    sumStatus.append(sum(rowStatus))
    
credit['sumStatus'] = sumStatus

Wall time: 2.16 s


# Feature Selection

In [64]:
# keep relevant features
credit = credit.loc[:, ['Limit', 'StatusSep', 'StatusAug', 'timesDelay', 'sumStatus', 'Default']]

In [66]:
# Save the resulting data set
credit.to_csv('Data/creditOne.csv', index = False)

# Confusion Matrix

In [65]:
# heat map : correlation matrix 
# corr_headers = ['Limit', 
  #              'StatusSep', 'StatusAug', 'StatusJul', 'StatusJun', 'StatusMay', 'StatusApr',
   #             'BillSep', 'BillAug', 'BillJul', 'BillJun', 'BillMay', 'BillApr',
    #            'PaidSep', 'PaidAug', 'PaidJul', 'PaidJun', 'PaidMay', 'PaidApr',
     #           'Default']


# corrMat = credit[corr_headers].corr()

credit.corr().style.background_gradient(cmap = 'seismic', 
                                        axis = None,
                                        low = 0.4).set_precision(2)

Unnamed: 0,Limit,StatusSep,StatusAug,timesDelay,sumStatus,Default
Limit,1.0,-0.27,-0.3,-0.24,-0.22,-0.15
StatusSep,-0.27,1.0,0.67,0.64,0.7,0.32
StatusAug,-0.3,0.67,1.0,0.66,0.74,0.26
timesDelay,-0.24,0.64,0.66,1.0,0.89,0.4
sumStatus,-0.22,0.7,0.74,0.89,1.0,0.39
Default,-0.15,0.32,0.26,0.4,0.39,1.0
