In [1]:
import os
import math
import numpy as np
import datetime as dt
import time
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import warnings
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Select the 'nbagg' matplotlib backend for the figures drawn in this notebook.
%matplotlib nbagg

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# pandas / scikit-learn calls in later cells -- confirm that is intended.
warnings.filterwarnings('ignore')

### Read and Initial Clean

In [22]:
# Load the training set; the first CSV column becomes the row index.
data = pd.read_csv('Data/TrainingData.csv', index_col=0)

# The raw file marks missing values with these placeholder strings.
for placeholder in ('missing', 'na'):
    data = data.replace(placeholder, np.nan)

# Every column except 'mvar47' is cast to float.
int_cols = [column for column in data.columns if column != 'mvar47']
data[int_cols] = data[int_cols].astype(float)

# Summary statistics are shown as the cell output.
data.describe()

In [27]:
# Target vector: the 'default_ind' column.
y = data['default_ind']

# Feature column names: every column except the target.
X_cols = list(data.columns)
X_cols.remove('default_ind')

# Take an explicit copy so the encoding below writes to an independent frame
# instead of assigning into a selection derived from `data`.
X = data[X_cols].copy()

# Encode 'mvar47': 'L' -> 1 and 'C' -> 0. Any other value is left untouched.
X.loc[X['mvar47'] == 'L', 'mvar47'] = 1
X.loc[X['mvar47'] == 'C', 'mvar47'] = 0

In [28]:
# Columns treated as categorical; they are excluded from the numeric list.
categorical = ['mvar47', 'mvar48']

# Start from all feature columns and drop each categorical one.
numeric = X.columns.tolist()
for cat_col in categorical:
    del numeric[numeric.index(cat_col)]

### Normalize

In [29]:
# Fit the scaler on the numeric columns and keep it for later reuse.
X_Scaler = StandardScaler()
scaled_values = X_Scaler.fit_transform(X[numeric])

# Rebuild a labelled frame from the scaled array, keeping the original row index.
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=X[numeric].columns)

# Categorical columns are carried over unscaled.
X_scaled[categorical] = X[categorical]

### Imputation

In [30]:
def basic_impute(data, cols, type_='mean'):
    """Fill missing values in ``data`` with a simple per-column statistic.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose NaN entries should be filled. It is not modified.
    cols : list of str
        Columns to compute fill values for. Only these columns are filled.
    type_ : {'mean', 'median', 'mode', 'CF'}
        'mean' / 'median' / 'mode' fill each column with that statistic.
        'CF' ("customer friendly") fills with the column mean, except that a
        fixed set of columns is filled with 0 and 'mvar11' is filled with its
        median.

    Returns
    -------
    pandas.DataFrame
        A new frame with the missing values in ``cols`` filled.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported strategies.
    """
    if type_ == 'mean':
        return data.fillna(data[cols].mean())

    if type_ == 'median':
        return data.fillna(data[cols].median())

    if type_ == 'mode':
        md = data[cols].mode()
        # mode() may return several rows on ties; the first row is used.
        return data.fillna(md.iloc[0])

    if type_ == 'CF':  # CF - Customer friendly
        imp_vals = data[cols].mean()

        # Columns whose missing values are filled with 0 instead of the mean.
        zero_ids = [40, 31, 41, 45, 35, 46, 24, 16, 17, 18, 12, 9, 39, 2, 42, 43]
        for i in zero_ids:
            name = 'mvar' + str(i)
            # Skip columns that are not being imputed in this call.
            if name in imp_vals.index:
                imp_vals[name] = 0

        # 'mvar11' uses the median; guard against it being absent from the
        # frame (e.g. when it is the column currently excluded by the caller).
        if 'mvar11' in imp_vals.index:
            imp_vals['mvar11'] = data['mvar11'].median()

        return data.fillna(imp_vals)

    raise ValueError(
        "Unknown imputation type {!r}; expected 'mean', 'median', 'mode' or 'CF'".format(type_)
    )

In [31]:
def KNN_impute(cols_to_impute, X, numeric, train_na_method='mean', neighbor_frac=0.001):
    """Fill missing values in selected columns with k-nearest-neighbour regression.

    For each column in ``cols_to_impute`` a ``KNeighborsRegressor`` is fitted
    on the rows where that column is observed, using the remaining ``numeric``
    columns as features. Its predictions replace the missing entries. Gaps in
    the feature columns are filled beforehand with ``basic_impute``.

    Columns are processed in the given order and ``X`` is modified in place,
    so values imputed for an earlier column are used as features for later
    columns.

    Parameters
    ----------
    cols_to_impute : list of str
        Columns whose missing values should be predicted.
    X : pandas.DataFrame
        Frame to impute. It is updated in place.
    numeric : list of str
        Columns usable as regression features.
    train_na_method : str
        ``type_`` passed to ``basic_impute`` for filling gaps in the features.
    neighbor_frac : float
        Fraction of the training rows used as the number of neighbours
        (default 0.001, i.e. 0.1 %). At least one neighbour is always used.

    Returns
    -------
    pandas.DataFrame
        The same object as ``X``, with the requested columns imputed.

    Raises
    ------
    ValueError
        If a column to impute has no observed values to train on.
    """
    for col in cols_to_impute:

        missing = X[col].isna()

        # Nothing to predict for this column.
        if not missing.any():
            continue

        if missing.all():
            raise ValueError(
                "Cannot KNN-impute column {!r}: it has no observed values".format(col)
            )

        other_cols = [c for c in numeric if (c != col)]

        X_train = X.loc[~missing, other_cols]
        y_train = X.loc[~missing, col]

        X_test = X.loc[missing, other_cols]

        # Impute KNN training data with basic impute.
        # Train and test features are filled separately, each from its own
        # statistics.
        X_train = basic_impute(X_train, other_cols, type_=train_na_method)
        X_test = basic_impute(X_test, other_cols, type_=train_na_method)

        # A fraction of the training rows as neighbours, clamped to a valid
        # range: small frames would otherwise round down to 0 neighbours.
        n_neighbors = int(neighbor_frac * len(X_train))
        n_neighbors = min(max(1, n_neighbors), len(X_train))

        model = KNeighborsRegressor(n_neighbors)
        model.fit(X_train, y_train)

        X.loc[X_test.index, col] = model.predict(X_test)

    return X

In [None]:
# Columns to fill with the KNN method; edit this list to impute others.
cols_to_impute = ['mvar9']

# KNN_impute writes into the frame it is given and returns that same frame,
# so X_scaled_imputed and X_scaled refer to the same object afterwards.
X_scaled_imputed = KNN_impute(
    cols_to_impute=cols_to_impute,
    X=X_scaled,
    numeric=numeric,
    train_na_method='CF',
)

### Remove Outliers

In [None]:
# TODO: outlier removal is not implemented yet.

### Feature Engineering

#### Variance Inflation Factor

In [None]:
# Variance inflation factor per variable: regress each column on all the
# others and compute 1 / (1 - R^2).
VIFs = {}

# NOTE(review): this loop reads the raw `data` frame, which still holds the
# NaNs introduced during cleaning and the un-encoded 'mvar47' strings (the
# encoding was applied to `X`, not `data`), and it also includes the target
# 'default_ind'. Presumably it should run on the scaled/imputed features
# instead -- confirm which frame is intended.
for var in data.columns:

    ### Remove this after encoding is done

    if var in ['mvar48']:
        continue

    #######################################

    var_y = data[var]
    var_X = data.drop(var, axis=1)

    linmod = LinearRegression()
    linmod.fit(var_X, var_y)
    R2 = linmod.score(var_X, var_y)

    # A perfectly explained variable has an unbounded VIF; avoid dividing by 0.
    VIFs[var] = float('inf') if R2 >= 1 else 1 / (1 - R2)

# Bar chart of the VIF values, one bar per variable.
plt.figure()
plt.bar(range(len(VIFs)), list(VIFs.values()), width=0.7)
plt.ylabel('VIF')
plt.xlabel('Variable')
plt.title('VIF of Variables')
plt.xticks(range(len(VIFs)), list(VIFs.keys()))
plt.show()

#### Correlation

In [None]:
# Covariance between the numeric columns of X_final, rounded to 3 decimals.
# np.cov expects variables in rows, hence the transpose.
numeric_values = np.array(X_final[numeric])
cov_mat = np.round(np.cov(numeric_values.T), 3)

tick_positions = range(len(numeric))

# Heat-map of the matrix with one tick per numeric column on each axis.
fig, ax = plt.subplots(figsize=(40, 40))
img = ax.matshow(cov_mat, cmap='Reds')
fig.colorbar(img, aspect=50)
ax.set_title('Covariance')
ax.set_xticks(ticks=tick_positions)
ax.set_yticks(ticks=tick_positions)

# Write each value into its cell; x is the column index, y is the row index.
for (r, c), value in np.ndenumerate(cov_mat):
    ax.text(c, r, format(value, '0.3f'), ha='center', va='center')

#### Principal Component Analysis

#### Linear Discriminant Analysis