# PKL Take on the Home Credit Default Risk Kaggle Competition

In [5]:
# built-in imports
import os
# 3p imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.impute import SimpleImputer
import numpy as np
# custom imports
from helper_functions import load_data_frame, flag_columns_to_bool


## Data Loading

In [6]:
# Location of the main training table, relative to the notebook's working directory.
path_application_train = os.path.join(
    "./",
    "home-credit-default-risk",
    "application_train.csv",
)

# Load the table through the project helper and remember its shape right after
# loading, before any later cell modifies `df`.
# NOTE(review): presumably load_data_frame returns a pandas DataFrame - confirm in helper_functions.
df = load_data_frame(path_application_train)
original_size = df.shape


## Missing Data Handling

If we try to omit all of the missing data (i.e. drop every sample that has at least one missing feature), we discover that the dataset shrinks drastically, from approx. $3 \times 10^{5}$ samples to fewer than $1 \times 10^{3}$ samples. While in some cases omitting samples with missing features might not be a big problem, in this case the dataset might lose its usability for any analysis whatsoever. This discovery makes missing-data imputation necessary.

In [7]:
# Count the missing values in every column of the loaded table.
null_count_df = df.isnull().sum()

# Alternative "no made-up data" approach, kept for reference only: drop every
# row that contains at least one missing value.
# df = df.dropna()
# no_null_shape = df.shape

# Show only the columns that actually contain missing values.
has_missing = null_count_df.gt(0)
print(null_count_df.loc[has_missing])

AMT_ANNUITY                       12
AMT_GOODS_PRICE                  278
NAME_TYPE_SUITE                 1292
OWN_CAR_AGE                   202929
OCCUPATION_TYPE                96391
                               ...  
AMT_REQ_CREDIT_BUREAU_DAY      41519
AMT_REQ_CREDIT_BUREAU_WEEK     41519
AMT_REQ_CREDIT_BUREAU_MON      41519
AMT_REQ_CREDIT_BUREAU_QRT      41519
AMT_REQ_CREDIT_BUREAU_YEAR     41519
Length: 67, dtype: int64


In [8]:
dtype_list = list(set(df.dtypes))
imputed_df_list = []

for dtype in dtype_list:
    df_dtype = df.select_dtypes(include=dtype)

    match str(dtype):

        case "int64":

            # for integer values median is more reasobable, because it has to be one of the values, 
            # while the mean might be integer in general
            imputer = SimpleImputer(missing_values=np.NaN, strategy="median")

        case "float64":

            # it may be benefitial to estimate skeweness of each particular float feature to asses 
            # whether or not it is a good idea to use mean value
            imputer = SimpleImputer(missing_values=np.NaN, strategy="mean")

        case "object":

            imputer = SimpleImputer(missing_values=np.NaN, strategy="most_frequent")

    imputed_df_list.append(pd.DataFrame(imputer.fit_transform(df_dtype)))
    imputed_df_list[-1].columns = df_dtype.columns
    imputed_df_list[-1].index = df_dtype.index

df_imputed = imputed_df_list[0].join(imputed_df_list[1:])
print(df_imputed)
null_count_df = df.isna().sum()
print(null_count_df[null_count_df > 0])

# replace the original data with the new ones
df = df_imputed

       NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
0              Cash loans           M            N               Y   
1              Cash loans           F            N               N   
2         Revolving loans           M            Y               Y   
3              Cash loans           F            N               Y   
4              Cash loans           M            N               Y   
...                   ...         ...          ...             ...   
307506         Cash loans           M            N               N   
307507         Cash loans           F            N               Y   
307508         Cash loans           F            N               Y   
307509         Cash loans           F            N               Y   
307510         Cash loans           F            N               N   

       NAME_TYPE_SUITE      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0        Unaccompanied               Working  Secondary / secondary special   
1

## Data Preprocessing

1. If I understand the meaning correctly, columns with the string "FLAG" in their name are binary columns denoting the presence or absence of a certain attribute. I therefore replace the string "Y"/"N" values and the integer 1/0 values with proper boolean values (this may also reduce memory usage, but that is not the main motivation).
2. The rest of the string variables (dtype("O")) need to be properly encoded so that the scikit-learn algorithms can handle them. I need an analogue of OneHotEncoder for categorical data and of OrdinalEncoder for ordinal data. Examples of categorical data are "CODE_GENDER" and "NAME_HOUSING_TYPE". Perhaps the former should be removed in a future version of the model, because keeping it may lead to a sexist classifier. On the other hand, there are clearly some more ordinal-like features, such as "NAME_EDUCATION_TYPE", where a commonly agreed ordering does exist (there is a much bigger difference between the "Primary" and "Masters" levels than between "Doctorate" and "Masters"). I should look for ordinal features and encode them accordingly in the future.

In [9]:
# Step 1: turn the "FLAG" columns into proper booleans via the project helper.
# Step 2: one-hot encode what is left with pandas' get_dummies, so that the
# scikit-learn estimators receive a purely numeric/boolean table.
df = pd.get_dummies(flag_columns_to_bool(df))

(307511, 122)
FLAG_OWN_CAR
FLAG_OWN_REALTY
FLAG_MOBIL
FLAG_EMP_PHONE
FLAG_WORK_PHONE
FLAG_CONT_MOBILE
FLAG_PHONE
FLAG_EMAIL
FLAG_DOCUMENT_2
FLAG_DOCUMENT_3
FLAG_DOCUMENT_4
FLAG_DOCUMENT_5
FLAG_DOCUMENT_6
FLAG_DOCUMENT_7
FLAG_DOCUMENT_8
FLAG_DOCUMENT_9
FLAG_DOCUMENT_10
FLAG_DOCUMENT_11
FLAG_DOCUMENT_12
FLAG_DOCUMENT_13
FLAG_DOCUMENT_14
FLAG_DOCUMENT_15
FLAG_DOCUMENT_16
FLAG_DOCUMENT_17
FLAG_DOCUMENT_18
FLAG_DOCUMENT_19
FLAG_DOCUMENT_20
FLAG_DOCUMENT_21
(307511, 122)
(307511, 244)


In [10]:
# Separate the label column from the feature matrix.
y = df["TARGET"]
X = df.drop(["TARGET"], axis=1)

# A fixed random_state makes the split (and every metric computed on it)
# reproducible across "Restart & Run All". stratify=y keeps the class ratio of
# the target the same in the train and the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)
print(X_train.shape)
print(y_train.shape)

(230633, 243)
(230633,)


## Baseline Classifier for later comparison

In [11]:
# Baseline: an MLP with default hyper-parameters. random_state is fixed so that
# the weight initialisation and the batch shuffling, and therefore the baseline
# score used for later comparisons, are reproducible between runs.
# NOTE(review): the scikit-learn user guide warns that MLPs are sensitive to
# feature scaling; consider standardising X before fitting in a later iteration.
baseline_classifier = MLPClassifier(random_state=42).fit(X_train, y_train)
y_pred = baseline_classifier.predict(X_test)

# Confusion matrix plus the Matthews correlation coefficient of the test predictions.
cf_mat = confusion_matrix(y_test, y_pred)
print(cf_mat)
print(matthews_corrcoef(y_test, y_pred))