In [12]:
from typing import List, Tuple

import pandas as pd
from scipy.stats import pearsonr

# Input CSV locations (relative paths, resolved against the working directory).
# The commented-out line points at an alternative training file; judging by its
# name it is a smaller sample for quick runs -- confirm before relying on it.
# train_filename = "../data/v1/sample_application_train.csv"
train_filename = "../data/v1/application_train.csv"
test_filename = "../data/v1/application_test.csv"

# Column-name constants. Roles are inferred from the identifiers (label column
# and per-row id column) -- confirm against the dataset schema.
y_column_name = "TARGET"
sk_id_column_name = "SK_ID_CURR"

def convert_to_numeric(data: pd.DataFrame) -> pd.DataFrame:
    """Return an all-numeric copy of ``data``.

    Steps:
      1. Every missing value is replaced with 0 (crude, but keeps all rows).
      2. Float/int columns are kept as they are.
      3. Each object column is replaced by its integer category codes and
         renamed to ``<original name>cat``.

    Because the fill happens before encoding, a missing value in an object
    column becomes the literal value 0 and is encoded as a category of its
    own.

    Parameters
    ----------
    data : pd.DataFrame
        Raw frame with any mix of numeric and object columns.

    Returns
    -------
    pd.DataFrame
        Numeric columns first (original order), followed by one ``...cat``
        column per object column (original order). The input is not modified.
    """
    # not the best way to handle null values
    # A scalar fill needs no axis; leaving it out avoids any row-wise code path.
    x_raw = data.fillna(0)

    numeric_x = x_raw.select_dtypes(include=["float", "int"])
    object_x = x_raw.select_dtypes(include=["object"])

    # brute force and generic way to handle categorical columns
    # The encoded columns are built separately from the source columns, so an
    # original column whose name already ends in "cat" cannot be mistaken for
    # an encoded one and leak through as raw objects.
    encoded = [
        object_x[col].astype("category").cat.codes.rename(col + "cat")
        for col in object_x.columns
    ]

    return pd.concat([numeric_x, *encoded], axis=1)


In [14]:
# Load the training CSV and turn every column into a numeric one.
train_data = convert_to_numeric(pd.read_csv(train_filename))

# Correlate against the label column declared alongside the file paths,
# rather than repeating the literal here.
target = y_column_name

# Pick the feature columns by name, not by position, so the id and label
# columns are excluded wherever they happen to sit in the frame.
column_names = train_data.columns.drop(
    [sk_id_column_name, target], errors="ignore")

# Pearson correlation coefficient of each feature with the label.
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
target_values = train_data[target].values
corelations = {
    f + ' vs ' + target: pearsonr(train_data[f].values, target_values)[0]
    for f in column_names
}

# One row per "<feature> vs <target>" key, with a single "Value" column.
data_corelation = pd.DataFrame(corelations, index=["Value"]).T



(120, 1)


In [21]:
# Order the rows from strongest to weakest correlation, ignoring the sign.
ranked_index = data_corelation["Value"].abs().sort_values(ascending=False).index
data_corelation.loc[ranked_index]

Unnamed: 0,Value
EXT_SOURCE_2 vs TARGET,-0.159030
EXT_SOURCE_3 vs TARGET,-0.119572
DAYS_BIRTH vs TARGET,0.078239
EXT_SOURCE_1 vs TARGET,-0.064698
REGION_RATING_CLIENT_W_CITY vs TARGET,0.060893
REGION_RATING_CLIENT vs TARGET,0.058899
DAYS_LAST_PHONE_CHANGE vs TARGET,0.055217
NAME_EDUCATION_TYPEcat vs TARGET,0.054699
CODE_GENDERcat vs TARGET,0.054692
DAYS_ID_PUBLISH vs TARGET,0.051457


In [22]:
def print_full(x):
    """Print ``x`` with pandas row truncation lifted to ``len(x)`` rows.

    The ``display.max_rows`` option is changed only for the duration of the
    print. The caller's previous setting is restored afterwards, even if
    printing raises.

    Parameters
    ----------
    x : object supporting ``len()`` (typically a DataFrame or Series)
        The object to print in full.
    """
    # option_context restores the prior value on exit, whereas reset_option
    # would fall back to the library default and discard a custom setting.
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [23]:
# Print every correlation, strongest first by absolute value, without row
# truncation. The closing parenthesis of the call was missing.
print_full(data_corelation.loc[data_corelation["Value"].abs().sort_values(ascending=False).index])

                                            Value
EXT_SOURCE_2 vs TARGET                  -0.159030
EXT_SOURCE_3 vs TARGET                  -0.119572
DAYS_BIRTH vs TARGET                     0.078239
EXT_SOURCE_1 vs TARGET                  -0.064698
REGION_RATING_CLIENT_W_CITY vs TARGET    0.060893
REGION_RATING_CLIENT vs TARGET           0.058899
DAYS_LAST_PHONE_CHANGE vs TARGET         0.055217
NAME_EDUCATION_TYPEcat vs TARGET         0.054699
CODE_GENDERcat vs TARGET                 0.054692
DAYS_ID_PUBLISH vs TARGET                0.051457
REG_CITY_NOT_WORK_CITY vs TARGET         0.050994
FLOORSMAX_AVG vs TARGET                 -0.049839
FLOORSMAX_MEDI vs TARGET                -0.049720
FLOORSMAX_MODE vs TARGET                -0.049458
NAME_INCOME_TYPEcat vs TARGET            0.046829
FLAG_EMP_PHONE vs TARGET                 0.045982
DAYS_EMPLOYED vs TARGET                 -0.044932
REG_CITY_NOT_LIVE_CITY vs TARGET         0.044395
FLAG_DOCUMENT_3 vs TARGET                0.044346


['Value']