https://stackoverflow.com/questions/42658379/variance-inflation-factor-in-python

In [11]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.formula.api as smf

In [21]:
# Toy data set: four variables (a, b, c, d) with five observations each.
a, b, c, d = (
    [1, 1, 2, 3, 4],
    [2, 2, 3, 2, 1],
    [4, 6, 7, 8, 9],
    [4, 3, 4, 5, 4],
)

In [22]:
# Stack the four variables side by side: one column per variable.
ck = np.column_stack([a, b, c, d])
# Correlation matrix between the columns (rowvar=False treats columns as variables).
# np.corrcoef is used instead of sp.corrcoef: the latter was only a deprecated alias
# of the NumPy function in the top-level scipy namespace and is not available in
# newer SciPy releases.
cc = np.corrcoef(ck, rowvar=False)
# Inverse of the correlation matrix; its diagonal holds the VIF of each variable.
VIF = np.linalg.inv(cc)
VIF.diagonal()

array([22.95,  3.  , 12.95,  3.  ])

In [23]:
# Same calculation with pandas, so the inverse keeps the variable labels.
df = pd.DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
df_cor = df.corr()
# Reuse df_cor rather than calling df.corr() a second time.
pd.DataFrame(
    np.linalg.inv(df_cor.values),
    index=df_cor.index,
    columns=df_cor.columns,
)

Unnamed: 0,a,b,c,d
a,22.95,6.453681,-16.301917,-6.453681
b,6.453681,3.0,-4.080441,-2.0
c,-16.301917,-4.080441,12.95,4.080441
d,-6.453681,-2.0,4.080441,3.0


In [9]:
# Narration for the two cells above, which invert the correlation matrix.
print("The diagonal elements give VIF.")

The diagonal elements give VIF.


Use the formula API in statsmodels:

In [12]:
def get_vif(exogs, data):
    '''Return VIF (variance inflation factor) DataFrame

    Args:
    exogs (list): list of exogenous/independent variables. Each name is
        inserted verbatim into a formula string, so it must be usable as
        a formula term.
    data (DataFrame): the df storing all variables

    Returns:
    DataFrame indexed by variable name with the columns 'VIF' and
    'Tolerance', one row per exogenous variable

    Notes:
    Assume we have a list of exogenous variable [X1, X2, X3, X4].
    To calculate the VIF and Tolerance for each variable, we regress
    each of them against other exogenous variables. For instance, the
    regression model for X3 is defined as:
                        X3 ~ X1 + X2 + X4
    And then we extract the R-squared from the model to calculate:
                    VIF = 1 / (1 - R-squared)
                    Tolerance = 1 - R-squared
    The cutoff to detect multicollinearity:
                    VIF > 10 or Tolerance < 0.1

    If exogs holds a single variable there is nothing to regress on, so
    an intercept-only model (X1 ~ 1) is fitted instead of building a
    formula with an empty right-hand side.
    When the tolerance is zero or negative (perfect collinearity, or
    R-squared rounding above 1) the VIF is reported as infinity.
    '''

    # initialize dictionaries
    vif_dict, tolerance_dict = {}, {}

    # create formula for each exogenous variable
    for exog in exogs:
        not_exog = [i for i in exogs if i != exog]
        # fall back to the intercept term when no other regressors exist
        rhs = ' + '.join(not_exog) if not_exog else '1'
        formula = f"{exog} ~ {rhs}"

        # extract r-squared from the fit
        r_squared = smf.ols(formula, data=data).fit().rsquared

        # calculate tolerance
        tolerance = 1 - r_squared
        tolerance_dict[exog] = tolerance

        # calculate VIF; guard the division so a perfectly explained
        # variable yields inf (a NaN tolerance still propagates as NaN)
        if tolerance <= 0:
            vif = float('inf')
        else:
            vif = 1 / tolerance
        vif_dict[exog] = vif

    # return VIF DataFrame
    df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

    return df_vif

Use LinearRegression in sklearn:

In [24]:
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.linear_model import LinearRegression

def sklearn_vif(exogs, data):
    '''Return VIF (variance inflation factor) DataFrame using sklearn

    Args:
    exogs (list): list of exogenous/independent variables (column names
        of data)
    data (DataFrame): the df storing all variables

    Returns:
    DataFrame indexed by variable name with the columns 'VIF' and
    'Tolerance', one row per exogenous variable

    Notes:
    Each variable is regressed on the remaining ones with
    LinearRegression and the R-squared of that fit gives:
                    VIF = 1 / (1 - R-squared)
                    Tolerance = 1 - R-squared
    If exogs holds a single variable there are no regressors to fit, so
    R-squared is taken as 0 (VIF = 1, Tolerance = 1).
    When the tolerance is zero or negative (perfect collinearity, or
    R-squared rounding above 1) the VIF is reported as infinity.
    '''

    # initialize dictionaries
    vif_dict, tolerance_dict = {}, {}

    # form input data for each exogenous variable
    for exog in exogs:
        not_exog = [i for i in exogs if i != exog]

        if not_exog:
            X, y = data[not_exog], data[exog]

            # extract r-squared from the fit
            r_squared = LinearRegression().fit(X, y).score(X, y)
        else:
            # a zero-column X cannot be fitted; with no regressors the
            # variable is not explained by anything else
            r_squared = 0.0

        # calculate tolerance
        tolerance = 1 - r_squared
        tolerance_dict[exog] = tolerance

        # calculate VIF; guard the division so a perfectly explained
        # variable yields inf instead of a division-by-zero error
        if tolerance <= 0:
            vif = float('inf')
        else:
            vif = 1 / tolerance
        vif_dict[exog] = vif

    # return VIF DataFrame
    df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

    return df_vif

In [25]:
# Columns whose VIF is computed below.
exogs = ["b", "c", "d"]

In [37]:
# Display the frame that is passed as `data` to the VIF functions below.
df

Unnamed: 0,a,b,c,d
0,1,2,4,4
1,1,2,6,3
2,2,3,7,4
3,3,2,8,5
4,4,1,9,4


In [36]:
# VIF and tolerance from the statsmodels-formula implementation for the columns in exogs.
get_vif(exogs=exogs, data=df)

Unnamed: 0,VIF,Tolerance
b,1.185185,0.84375
c,1.37037,0.72973
d,1.185185,0.84375


In [35]:
# Same inputs through the scikit-learn implementation, for comparison with get_vif.
sklearn_vif(exogs=exogs, data=df)

  linalg.lstsq(X, y)


Unnamed: 0,VIF,Tolerance
b,1.185185,0.84375
c,1.37037,0.72973
d,1.185185,0.84375
