In [97]:
import os
import math
import numpy as np
import datetime as dt
import time
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import warnings
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

%matplotlib nbagg

warnings.filterwarnings('ignore')

### Read and Initial Clean

In [98]:
# Load the raw training set. The file marks missing values with the literal
# strings 'missing' and 'na'; declaring them as NA markers at parse time avoids
# two extra full-frame, in-place replace() passes after loading.
data = pd.read_csv('Data/TrainingData.csv', index_col=0, na_values=['missing', 'na'])

# Every column except 'mvar47' is cast to float ('mvar47' holds string codes).
# A list comprehension keeps the column order deterministic, unlike list(set(...)).
int_cols = [col for col in data.columns if col != 'mvar47']
data[int_cols] = data[int_cols].astype(float)
data.describe()

Unnamed: 0,mvar1,mvar2,mvar3,mvar4,mvar5,mvar6,mvar7,mvar8,mvar9,mvar10,...,mvar39,mvar40,mvar41,mvar42,mvar43,mvar44,mvar45,mvar46,mvar48,default_ind
count,79267.0,77114.0,82465.0,82465.0,82465.0,63299.0,75326.0,63291.0,71318.0,82465.0,...,76671.0,17930.0,25736.0,80977.0,82111.0,74851.0,37080.0,59397.0,83000.0,83000.0
mean,1747.511865,1.054816,5.401784,0.461151,1.084012,1633.42928,17507.97,6822.332227,34030.596329,30228.57,...,0.087673,97.851158,72.01657,0.33155,6.553604,0.706595,0.070523,0.093187,556.021277,0.28741
std,94.830127,1.556682,11.091569,1.704292,5.743899,3667.183981,46772.22,10060.346814,50673.150005,66177.98,...,0.423403,29.530657,24.508469,0.373973,6.273107,0.245915,0.290383,0.329054,1106.953295,0.452557
min,1477.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014,0.0,0.0,0.0,0.06066,0.0,0.0,2.0,0.0
25%,1680.0,0.1318,0.0,0.0,0.0,41.0,1750.25,496.0,3542.25,1153.0,...,0.0,90.1495,58.17475,0.0,2.0,0.51528,0.0,0.0,8.0,0.0
50%,1743.0,0.513,0.3,0.0,0.0,297.0,7020.5,2507.0,14389.0,9525.0,...,0.0,100.0,78.8395,0.2,5.0,0.7395,0.0,0.0,17.0,0.0
75%,1813.0,1.3862,6.595,0.0,0.0,1381.0,18113.0,9869.5,44413.0,34288.0,...,0.0,111.327,91.59675,0.57143,10.0,0.94512,0.0,0.0,111.0,1.0
max,1950.0,31.0181,399.334,25.754,165.492,94302.0,5637108.0,291810.0,840658.0,5647073.0,...,21.0,631.36,182.111,2.0,91.0,1.0,7.0,5.0,3247.0,1.0


In [99]:
# Target vector: the default indicator.
y = data['default_ind']

# Feature columns: everything except the target.
X_cols = list(data.columns)
X_cols.remove('default_ind')

# Take an explicit copy so the assignments below modify X only. Without it,
# X is a slice of `data` and the .loc writes are chained assignments whose
# effect on either frame is not guaranteed (the warning is silenced by the
# global warnings filter).
X = data[X_cols].copy()

# Encode the two string codes of 'mvar47' as 1 / 0; other values are left as-is.
X.loc[X['mvar47'] == 'L', 'mvar47'] = 1
X.loc[X['mvar47'] == 'C', 'mvar47'] = 0

In [100]:
# Columns handled as categorical; they are excluded from scaling.
categorical = ['mvar47', 'mvar48']

# Start from every feature column and strip the categorical ones, leaving
# the columns that are treated as numeric downstream.
numeric = X.columns.tolist()
for var in categorical:
    numeric.remove(var)

### Normalize

In [101]:
# Standardize the numeric columns only; the fitted scaler is kept in X_Scaler.
X_Scaler = StandardScaler()
scaled_values = X_Scaler.fit_transform(X[numeric])

# Rebuild a DataFrame around the scaled array, reusing the original labels.
X_scaled = pd.DataFrame(
    scaled_values,
    columns=X[numeric].columns,
    index=X.index,
)

# Categorical columns are carried over unscaled.
X_scaled[categorical] = X[categorical]

### Imputation

In [102]:
def basic_impute(data, cols, type_='mean'):
    """Fill missing values in ``cols`` of ``data`` with a per-column constant.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing missing values. It is not modified.
    cols : list-like of column labels
        Columns whose NaNs are filled. Columns outside ``cols`` are untouched.
    type_ : {'mean', 'median', 'mode', 'CF'}, default 'mean'
        Statistic used as the fill value. 'CF' ("customer friendly") uses the
        column mean, except that a fixed list of ``mvar`` columns is filled
        with 0 and ``mvar11`` is filled with its median.

    Returns
    -------
    pandas.DataFrame
        A new frame with the NaNs of ``cols`` filled.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported strategies (previously the
        function silently returned ``None``).
    """
    if type_ == 'mean':
        return data.fillna(data[cols].mean())

    if type_ == 'median':
        return data.fillna(data[cols].median())

    if type_ == 'mode':
        md = data[cols].mode()
        if md.empty:
            # Every selected column is entirely NaN: there is no mode to use.
            return data.copy()
        return data.fillna(md.iloc[0])

    if type_ == 'CF':  # CF - Customer friendly
        # Restrict the statistics to `cols`: only those values are used, and
        # this keeps unrelated (possibly non-numeric) columns out of mean().
        imp_vals = data[cols].mean()

        # Columns whose missing values are filled with 0 instead of the mean.
        # NOTE(review): when this runs on standardized data, 0 is the column
        # mean rather than a raw zero - confirm that this is intended.
        zero_filled = [40, 31, 41, 45, 35, 46, 24, 16, 17, 18, 12, 9, 39, 2, 42, 43]
        for i in zero_filled:
            name = 'mvar' + str(i)
            if name in imp_vals.index:
                imp_vals[name] = 0

        # mvar11 uses the median; skip it when the column is not selected
        # (the original raised KeyError if the frame had no 'mvar11').
        if 'mvar11' in imp_vals.index:
            imp_vals['mvar11'] = data['mvar11'].median()

        return data.fillna(imp_vals)

    raise ValueError(
        "Unknown imputation type {!r}; expected 'mean', 'median', 'mode' or 'CF'".format(type_)
    )

In [103]:
def KNN_impute(cols_to_impute, X, numeric, train_na_method='mean', neighbor_frac=0.001):
    """Impute missing values column by column with a k-nearest-neighbours regressor.

    For each column in ``cols_to_impute`` a ``KNeighborsRegressor`` is fitted on
    the rows where that column is observed, using all other columns as
    predictors, and its predictions replace the column's NaNs. Because the
    regressor cannot handle NaNs in the predictors, they are taken from a copy
    of ``X`` pre-filled by ``basic_impute``.

    Parameters
    ----------
    cols_to_impute : iterable of column labels
        Columns of ``X`` to impute.
    X : pandas.DataFrame
        Frame with missing values. It is not modified.
    numeric : list
        Unused; kept so existing callers keep working.
    train_na_method : str, default 'mean'
        ``type_`` passed to ``basic_impute`` to pre-fill the predictors.
    neighbor_frac : float, default 0.001
        Fraction of the training rows used as the number of neighbours
        (0.1 % by default). The count is clamped to at least 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the requested columns imputed.
    """
    result = X.copy()

    # Predictors must be NaN-free, so pre-fill every column with a simple rule.
    X_nonan = basic_impute(X, X.columns, type_=train_na_method)

    for col in cols_to_impute:

        # Missingness is read from the original X, not from the pre-filled frame.
        missing = X[col].isna()
        n_missing = int(missing.sum())

        if n_missing == 0:
            print('No need to impute', col)
            continue

        print(col, ': Number of NaNs -', n_missing)

        X_train = X_nonan.loc[~missing].drop(col, axis=1)
        y_train = X_nonan.loc[~missing, col]
        X_test = X_nonan.loc[missing].drop(col, axis=1)

        if len(X_train) == 0:
            # The column is missing everywhere: nothing to learn from.
            print('Cannot impute', col, '- no observed values')
            continue

        # A fraction of the training rows as neighbours. The original int()
        # truncation produced 0 for fewer than 1000 rows, which the regressor
        # rejects, so clamp the count to the valid range [1, n_train].
        n_neighbors = min(len(X_train), max(1, int(neighbor_frac * len(X_train))))
        model = KNeighborsRegressor(n_neighbors)
        model.fit(X_train, y_train)

        result.loc[X_test.index, col] = model.predict(X_test)

        print('Imputed for', col)

    return result

In [104]:
# Columns to fill with the KNN method; here every numeric feature is selected.
cols_to_impute = numeric

# Predictors are pre-filled with the 'CF' rule before the neighbours are fitted.
X_scaled_imputed = KNN_impute(
    cols_to_impute=cols_to_impute,
    X=X_scaled,
    numeric=numeric,
    train_na_method='CF',
)

mvar1 : Number of NaNs - 3733
Imputed for mvar1
mvar2 : Number of NaNs - 5886
Imputed for mvar2
mvar3 : Number of NaNs - 535
Imputed for mvar3
mvar4 : Number of NaNs - 535
Imputed for mvar4
mvar5 : Number of NaNs - 535
Imputed for mvar5
mvar6 : Number of NaNs - 19701
Imputed for mvar6
mvar7 : Number of NaNs - 7674
Imputed for mvar7
mvar8 : Number of NaNs - 19709
Imputed for mvar8
mvar9 : Number of NaNs - 11682
Imputed for mvar9
mvar10 : Number of NaNs - 535
Imputed for mvar10
mvar11 : Number of NaNs - 46717
Imputed for mvar11
mvar12 : Number of NaNs - 14578
Imputed for mvar12
mvar13 : Number of NaNs - 9689
Imputed for mvar13
No need to impute mvar14
mvar15 : Number of NaNs - 33519
Imputed for mvar15
mvar16 : Number of NaNs - 19243
Imputed for mvar16
mvar17 : Number of NaNs - 16499
Imputed for mvar17
mvar18 : Number of NaNs - 15359
Imputed for mvar18
mvar19 : Number of NaNs - 5
Imputed for mvar19
mvar20 : Number of NaNs - 535
Imputed for mvar20
mvar21 : Number of NaNs - 23462
Imputed fo

In [109]:
# Persist the scaled and imputed features next to the input data. The path is
# built with os.path.join: the original 'Data\T...' literal contained an invalid
# '\T' escape and only resolved to a sub-directory on Windows.
X_scaled_imputed.to_csv(os.path.join('Data', 'Training_scaled_imputed.csv'))


### Remove Outliers

### Feature Engineering

#### Variance Inflation Factor

In [None]:
# Variance inflation factor of each numeric feature: regress it on the other
# numeric features and convert the fit's R^2 into VIF = 1 / (1 - R^2).
#
# The scaled + imputed frame is used because LinearRegression rejects NaNs and
# strings, both of which the raw `data` frame still contains, and because the
# target column does not belong among the predictors. The categorical columns
# (mvar47, mvar48) are left out until they are encoded.
# NOTE(review): assumes X_scaled_imputed[numeric] has no NaNs left - confirm.
vif_features = X_scaled_imputed[numeric]

VIFs = {}

for var in vif_features.columns:

    var_y = vif_features[var]
    var_X = vif_features.drop(var, axis=1)

    linmod = LinearRegression()
    linmod.fit(var_X, var_y)
    R2 = linmod.score(var_X, var_y)

    # A perfectly explained variable has an unbounded VIF; avoid dividing by zero.
    VIFs[var] = np.inf if R2 >= 1 else 1 / (1 - R2)

plt.figure()
# Materialize the dict views: matplotlib expects sequences.
plt.bar(range(len(VIFs)), list(VIFs.values()), width=0.7)
plt.ylabel('VIF')
plt.xlabel('Variable')
plt.title('VIF of Variables')
plt.xticks(range(len(VIFs)), list(VIFs.keys()), rotation=90)
plt.show()

#### Correlation

In [None]:
# Covariance matrix of the numeric features, rounded to 3 decimals.
# NOTE(review): X_final is not assigned in any cell visible above - confirm
# which frame is meant here before running this cell.
numeric_matrix = np.array(X_final[numeric]).T  # one row per variable
cov_mat = np.cov(numeric_matrix).round(3)

# Heat map of the matrix, one tick per numeric feature on each axis.
fig, ax = plt.subplots(figsize=(40, 40))
img = ax.matshow(cov_mat, cmap='Reds')
fig.colorbar(img, aspect=50)
ax.set_title('Covariance')

tick_positions = range(len(numeric))
ax.set_xticks(ticks=tick_positions)
ax.set_yticks(ticks=tick_positions)

# Write each value into its cell; text() takes (x, y), i.e. (column, row).
for (i, j), z in np.ndenumerate(cov_mat):
    cell_label = '{:0.3f}'.format(z)
    ax.text(j, i, cell_label, ha='center', va='center')

#### Principal Component Analysis

#### Linear Discriminant Analysis