In [8]:
#!/usr/bin/env python3
# Reference: https://www.kaggle.com/residentmario/variance-inflation-factors-with-nyc-building-sales
# Reference: https://etav.github.io/python/vif_factor_python.html
# loan_small.csv is a tiny snapshot from https://www.kaggle.com/wordsforthewise/lending-club
import pandas as pd
import numpy as np
from patsy import dmatrices
from statsmodels.regression.linear_model import OLS

In [9]:
# Load the loan snapshot and keep only the numeric (and boolean) columns.
# Note: a bare `df.dropna()` returns a new frame and leaves `df` untouched, so
# no such call is made here; rows with missing values are dropped later, once
# the frame has been narrowed to the modelling columns.
df = pd.read_csv('loan_small.csv')
# Public equivalent of the private DataFrame._get_numeric_data() helper.
df = df.select_dtypes(include=['number', 'bool'])

In [10]:
# Predictor columns that go into the design matrix.
feature_cols = [
    'annual_inc',
    'int_rate',
    'emp_length',
    'dti',
    'delinq_2yrs',
    'revol_util',
    'total_acc',
    'loan_amnt',
    'longest_credit_length',
]
# Response column used on the left-hand side of the formula.
target = 'bad_loan'
all_cols = [*feature_cols, target]

# Restrict the frame to the modelling columns, then discard any row that has
# a missing value in one of them.
df = (
    df[all_cols]
    .dropna()
)

df.head()

Unnamed: 0,annual_inc,int_rate,emp_length,dti,delinq_2yrs,revol_util,total_acc,loan_amnt,longest_credit_length,bad_loan
0,24000.0,10.65,10.0,27.65,0,83.7,9,5000,26,0
1,30000.0,15.27,0.0,1.0,0,9.4,4,2500,12,1
2,12252.0,15.96,10.0,8.72,0,98.5,10,2400,10,0
3,49200.0,13.49,10.0,20.0,0,21.0,37,10000,15,0
4,36000.0,7.9,3.0,11.2,0,28.3,12,5000,7,0


In [11]:
# Right-hand side of the formula: every predictor joined with '+'.
features = "+".join(feature_cols)
formula = f"{target}~{features}"

# Let patsy's dmatrices build the matrices from the formula; the first result
# is not needed here, the second is kept as the pandas DataFrame X.
_, X = dmatrices(formula, df, return_type='dataframe')

In [12]:
# Sanity check: show the container type of the design matrix X.
type(X)

pandas.core.frame.DataFrame

In [13]:
def VIF(exogenous, exogenous_idx):
    """Variance inflation factor of one column of a design matrix.

    The selected column is regressed (ordinary least squares) on all the
    other columns, and the factor is ``1 / (1 - R^2)`` of that regression.

    Parameters
    ----------
    exogenous : array-like, 2-D
        Design matrix with one column per explanatory variable. Anything
        ``np.asarray`` can convert is accepted (ndarray, DataFrame, ...).
    exogenous_idx : int
        Position of the column to evaluate. Negative positions count from
        the last column, as in normal Python indexing.

    Returns
    -------
    float
        The variance inflation factor; ``inf`` when the column is an exact
        linear combination of the others (R^2 of 1).

    Raises
    ------
    IndexError
        If ``exogenous_idx`` is outside ``[-k, k)`` for a k-column matrix.
    """
    design = np.asarray(exogenous)
    k_vars = design.shape[1]
    if not -k_vars <= exogenous_idx < k_vars:
        raise IndexError(
            "column index %d is out of bounds for a design matrix with %d columns"
            % (exogenous_idx, k_vars)
        )
    # Normalise negative positions: the exclusion mask below compares against
    # 0..k-1, so a raw negative index would exclude nothing and the column
    # would end up being regressed on itself.
    column = exogenous_idx % k_vars
    x_i = design[:, column]
    mask = np.arange(k_vars) != column
    x_noti = design[:, mask]
    r_squared_i = OLS(x_i, x_noti).fit().rsquared
    if r_squared_i >= 1.0:
        # Perfect collinearity: report infinity explicitly instead of
        # dividing by zero.
        return np.inf
    return 1. / (1. - r_squared_i)

In [14]:
# Compute the VIF of every column of X and tabulate it next to the column name.
design_matrix = X.values
vif = pd.DataFrame({
    "VIF Factor": [
        VIF(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ],
    "features": X.columns,
})

print(vif)

   VIF Factor               features
0   25.861010              Intercept
1    1.473199             annual_inc
2    1.484670               int_rate
3    1.066983             emp_length
4    1.282483                    dti
5    1.075329            delinq_2yrs
6    1.384624             revol_util
7    1.450820              total_acc
8    1.282063              loan_amnt
9    1.316389  longest_credit_length
