In [16]:
import pandas as pd
import statsmodels.formula.api as smf

def _formula_term(name, data):
    '''Return `name` in a form that is safe to embed in a patsy formula.

    Args:
    name: variable name (or formula expression) taken from the exogs list
    data: the object handed to the regression as its data source

    Returns:
    Q('<name>') when `name` is a string that is a column/key of `data`
    but not a valid Python identifier (e.g. "Total Chol", "HDL-C"), so the
    formula parser reads it as one variable; otherwise `name` unchanged,
    so plain names and formula expressions are passed through untouched.
    '''
    if (
        isinstance(name, str)
        and not name.isidentifier()
        and data is not None
        and name in data
    ):
        # repr() yields a correctly quoted/escaped Python string literal
        return f"Q({name!r})"
    return name


def get_vif(exogs, data):
    '''Return VIF (variance inflation factor) DataFrame

    Args:
    exogs (list): list of exogenous/independent variables
    data (DataFrame): the df storing all variables

    Returns:
    DataFrame indexed by exogenous variable with two columns:
    'VIF' and 'Tolerance'. An empty `exogs` gives an empty DataFrame.

    Notes:
    Assume we have a list of exogenous variable [X1, X2, X3, X4].
    To calculate the VIF and Tolerance for each variable, we regress
    each of them against other exogenous variables. For instance, the
    regression model for X3 is defined as:
                        X3 ~ X1 + X2 + X4
    And then we extract the R-squared from the model to calculate:
                    VIF = 1 / (1 - R-squared)
                    Tolerance = 1 - R-squared
    The cutoff to detect multicollinearity:
                    VIF > 10 or Tolerance < 0.1

    Edge cases:
    - A single exogenous variable is regressed on the intercept only
      (X1 ~ 1), because a formula with an empty right-hand side is not
      valid.
    - Column names that are not valid Python identifiers are quoted
      with Q() before being placed in the formula.
    - When Tolerance is <= 0 (perfect collinearity, possibly with
      floating-point round-off) VIF is reported as inf instead of
      dividing by zero or returning a negative value.
    '''

    # results keyed by the original (unquoted) variable name
    vif_dict, tolerance_dict = {}, {}

    # create formula for each exogenous variable
    for exog in exogs:
        not_exog = [_formula_term(i, data) for i in exogs if i != exog]

        # nothing left to regress on -> intercept-only model
        rhs = ' + '.join(not_exog) if not_exog else '1'
        formula = f"{_formula_term(exog, data)} ~ {rhs}"

        # extract r-squared from the fit
        r_squared = smf.ols(formula, data=data).fit().rsquared

        # calculate tolerance
        tolerance = 1 - r_squared
        tolerance_dict[exog] = tolerance

        # calculate VIF; `<=` (not `>`) so that a NaN tolerance still
        # propagates as NaN through the division below
        if tolerance <= 0:
            vif = float('inf')
        else:
            vif = 1 / tolerance
        vif_dict[exog] = vif

    # return VIF DataFrame
    df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

    return df_vif

In [17]:
# Load the dataset for the VIF check (presumably the imputed data, judging by the file name).
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
X = pd.read_csv(filepath_or_buffer='C:/Users/12676/Desktop/Winter 20/STAT/Imputed_V2.csv')

In [18]:
# Column names handed to get_vif as the exogenous variables.
# Entries that are commented out are left out of the VIF run.
variables = [
    'Age', 'Female', 'Educ', 'Unmarried', 'Income', 'Insured',
    # 'Weight',
    'Height', 'BMI',
    # 'Obese',
    # 'Waist',
    # 'SBP',
    'DBP', 'HDL', 'LDL',
    # 'Total_Chol',
    'Dyslipidemia', 'PVD', 'Activity', 'PoorVision', 'Smoker',
    'Hypertension', 'Fam_Hypertension',
    'Diabetes', 'Fam_Diabetes',
    'Stroke', 'CVD',
    # 'Fam_CVD',
    'CHF', 'Anemia',
]

In [19]:
# Display the VIF / Tolerance table for the selected variables.
get_vif(exogs=variables, data=X)



Unnamed: 0,VIF,Tolerance
Age,2.245541,0.445327
Female,2.149866,0.465145
Educ,1.200258,0.833154
Unmarried,1.11088,0.900187
Income,1.264115,0.791067
Insured,1.213015,0.824392
Weight,92.219397,0.010844
Height,24.369573,0.041035
BMI,72.246792,0.013841
Obese,2.592898,0.385669
