In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler  # to standardize the features
from sklearn.decomposition import PCA  # to apply PCA

In [2]:
# OS-agnostic file path
working_dir = Path.cwd()
data_location = Path(working_dir, 'data', 'input')
app = pd.read_csv(data_location / 'application_train.csv')
bureau = pd.read_csv(data_location / 'bureau.csv')

In [85]:
dataset = app.merge(bureau, how='left', on='SK_ID_CURR')

# First, we will handle the missing features.

In [86]:
# Here I try to separate numeric and categorical data. However, in the numeric data set there still are 
# categorical features, like CNT_CHILDREN. Those will have to be tackled differently.
numerical_data = dataset.select_dtypes(include=[np.number])
categorial_data = dataset.select_dtypes(exclude=[np.number])
numerical_data = numerical_data.drop(columns=['SK_ID_CURR', 'TARGET'], axis=1)

In [87]:
for feature in numerical_data.columns:
    categorial = []
    if len(pd.unique(numerical_data[feature])) > 20: # after 20 unique values, we can safely assume it is not categorial
        median = numerical_data[feature].median()
        numeric_data = numerical_data.fillna({feature: median}, inplace=True)
    else:
        categorial.append(feature)

# Here we one-hot-encode the numeric categorical data
numeric_data_encoded = pd.get_dummies(data=numerical_data, columns=categorial, drop_first=True)
numeric_data_encoded.insert(0, 'SK_ID_CURR', dataset['SK_ID_CURR'])

# Now we can finally separate numerical and categorial features
numerical_data = numerical_data.drop(columns=categorial)

In [88]:
# Here we scale the numeric data using StandardScaler
ss = StandardScaler()
scaled_numeric_data = pd.DataFrame(ss.fit_transform(numerical_data))

In [89]:
# Here we one-hot-encode the categorical data

categorial_data_encoded = pd.get_dummies(data=categorial_data, drop_first=True)
categorial_data_encoded.insert(0, 'SK_ID_CURR', dataset['SK_ID_CURR'])

In [None]:
# This cell outputs the correlation matrix to an html file.

# corr = numeric_data.corr()
# with open('corrmatrix.html', 'a') as f:
#     f.write(corr.style.background_gradient(cmap='coolwarm').format(precision=2).to_html())

In [90]:
# Now we will reduce the dimensionality of the datasets using PCA

pca = PCA(n_components=20)

In [126]:
cat_data = pca.fit_transform(categorial_data_encoded)
cat_data = pd.DataFrame(cat_data)
cat_data.insert(0, 'SK_ID_CURR', dataset['SK_ID_CURR'])
cat_data.insert(1, 'TARGET', dataset['TARGET'])

In [120]:
scaled_numeric_data = scaled_numeric_data.dropna()
num_data = pca.fit_transform(scaled_numeric_data)
num_data = pd.DataFrame(num_data)

In [121]:
numeric_data_encoded = numeric_data_encoded.dropna()
num_cat_data = pca.fit_transform(numeric_data_encoded)
num_cat_data = pd.DataFrame(num_cat_data)

In [127]:
ready_dataset = pd.concat([cat_data, num_cat_data, num_data], axis=1)

In [131]:
ready_dataset = ready_dataset.dropna()

In [130]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

X_train = ready_dataset.drop(columns=['SK_ID_CURR', 'TARGET'])
y_train = ready_dataset['TARGET']

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

AttributeError: 'DataFrame' object has no attribute 'dtype'