# PKL Take on the Home Credit Default Risk Kaggle Competition

In [1]:
# built-in imports
import os
# 3p imports
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
# custom imports
from helper_functions import load_data_frame, flag_columns_to_bool


## Data Loading

In [2]:
# Location of the training table, relative to the notebook's working directory.
path_application_train = os.path.join("./", "home-credit-default-risk", "application_train.csv")

df = load_data_frame(path_application_train)
# Work on a random subset to keep the run time manageable.
# - random_state pins the draw so Restart & Run All reproduces the same rows.
# - min(...) keeps the cell working when the table holds fewer than 100000 rows
#   (DataFrame.sample raises if n exceeds the population without replacement).
df = df.sample(n=min(100000, len(df)), random_state=42)
# NOTE(review): despite its name, this is the shape of the *sampled* frame,
# because it is taken after the sampling step above.
original_size = df.shape

# Initialize the target vector and the feature matrix.
y = df["TARGET"]
X = df.drop(["TARGET"], axis=1)

## Missing Data Handling

If we omit all of the missing data (i.e. drop every sample that has at least one missing feature), we discover that the dataset shrinks brutally, from approx. $3 \times 10^{5}$ samples to fewer than $1 \times 10^{3}$ samples. While in some cases omitting samples with some missing features might not be that big of a problem, in this case the dataset would lose its usability for any analysis whatsoever. This discovery makes missing-data imputation a necessity.

In [3]:
# Per-column count of missing values; only columns that actually contain
# missing entries are printed.
# The alternative "no made-up data" approach would be to drop every row that
# has at least one null via df.dropna(); it is intentionally not applied here.
null_count_df = df.isnull().sum()
print(null_count_df.loc[null_count_df.gt(0)])

AMT_ANNUITY                       2
AMT_GOODS_PRICE                  99
NAME_TYPE_SUITE                 424
OWN_CAR_AGE                   66125
OCCUPATION_TYPE               31342
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     13565
AMT_REQ_CREDIT_BUREAU_WEEK    13565
AMT_REQ_CREDIT_BUREAU_MON     13565
AMT_REQ_CREDIT_BUREAU_QRT     13565
AMT_REQ_CREDIT_BUREAU_YEAR    13565
Length: 66, dtype: int64


## Data Preprocessing

1. The remaining string variables (dtype("O")) need to be properly encoded so that the scikit-learn algorithms can handle them. I need something like OneHotEncoder for categorical data and OrdinalEncoder for ordinal data. Examples of categorical data are "CODE_GENDER" and "NAME_HOUSING_TYPE". Perhaps the former should be removed in a future version of the model, because preserving it may lead to a sexist classifier. On the other hand, there are clearly some more ordinal-like features, such as "NAME_EDUCATION_TYPE", where a commonly agreed ordering does exist (there is a much bigger difference between the "Primary" and "Masters" levels than between "Doctorate" and "Masters"). I should look for ordinal features and encode them accordingly in the future.

In [5]:
# Split the feature columns by dtype so each group gets its own preprocessing.
integer_columns = list(X.select_dtypes(include="int64").columns)
float_columns = list(X.select_dtypes(include="float64").columns)
categorical_columns = list(X.select_dtypes(include="object").columns)

# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
column_transformer = make_column_transformer(
    # Integer features: median imputation only.
    # NOTE(review): these columns are passed on unscaled, unlike the float
    # columns below - confirm this is intended for the downstream model.
    (SimpleImputer(missing_values=np.nan, strategy="median"), integer_columns),
    # Float features: mean imputation followed by standardization.
    (make_pipeline(SimpleImputer(missing_values=np.nan, strategy="mean"), StandardScaler()),
        float_columns),
    # String features: impute the most frequent category FIRST, then one-hot
    # encode. Imputing after the encoder cannot fill a missing category,
    # because the encoder's output no longer contains the missing marker.
    (make_pipeline(SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
                   OneHotEncoder(handle_unknown="ignore")),
        categorical_columns),
    )

In [7]:
# Hold out a test set using scikit-learn's default split proportion.
# - random_state makes the split reproducible across kernel restarts.
# - stratify=y keeps the TARGET class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
print(X_train.shape)
print(y_train.shape)

(75000, 121)
(75000,)


## Baseline Classifier for later comparison

In [8]:
# Baseline model: the preprocessing transformer followed by a default MLP.
# random_state fixes the network's weight initialization and shuffling so the
# baseline scores are reproducible.
classifier_pipeline = make_pipeline(column_transformer, MLPClassifier(random_state=42))
# 10-fold cross-validation on the training split only, so the held-out test
# rows (X_test / y_test) stay unseen for the final evaluation. Because the whole
# pipeline is passed in, preprocessing is re-fitted inside every fold.
# NOTE(review): no `scoring` is given, so the classifier's default metric
# (accuracy) is reported; if TARGET is imbalanced, consider scoring="roc_auc".
score = cross_val_score(classifier_pipeline, X_train, y_train, cv=10)
print(score)