# Example One

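The variance inflation factor (VIF) of a regressor is computed by fitting an OLS model that has the regressor in question as the target variable and all other regressors as features: $\mathrm{VIF}_j = 1 / (1 - R_j^2)$, where $R_j^2$ is the R-squared of that auxiliary regression. As a rule of thumb, the VIF should be smaller than 5 (small datasets) or 10 (larger datasets).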

Common remedies for multicollinearity:

- dropping correlated regressors

- PCA  

- penalized regression methods e.g. ridge and lasso

In [2]:
import pandas as pd

# Read the BMI sample data from disk and preview its first rows.
bmi_csv = '../data/BMI.csv'
df = pd.read_csv(bmi_csv)
df.head()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


In [50]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Encode gender as a 0/1 dummy on the selected feature columns only.
# Mapping df['Gender'] in place is not idempotent: on a second run the column
# already holds 0/1, so .map() would turn every value into NaN.
gender_codes = {'Male': 0, 'Female': 1}
X = df[['Gender', 'Height', 'Weight']].assign(
    Gender=lambda frame: frame['Gender'].map(gender_codes)
)

# NOTE(review): no intercept column is added here, whereas Example Three uses
# sm.add_constant; the VIFs below are computed on the design matrix exactly as
# given -- confirm that omitting the constant is intended.
for i in range(len(X.columns)):
    print(variance_inflation_factor(X.values, i))

2.0288639241099915
11.623103405710753
10.688377415326789


#### Height and weight are highly correlated and only one of them should be considered.

variance_inflation_factor(exog, exog_idx)

exog: the design matrix, i.e. a 2-D array holding all explanatory variables (one column per feature) used in the regression.

exog_idx: the column index in exog of the feature whose VIF is computed, i.e. the feature that is regressed on all the other columns.

# Example Two

In [68]:
# Load the salary data, then discard every row that contains a missing value.
salary_raw = pd.read_csv('../data/salary.csv')
df = salary_raw.dropna()

In [69]:
# Candidate regressors for the salary data.
salary_features = ['Gender', 'Age', 'Years of service', 'Education level']
X = df[salary_features]  # Alternatively: X = df.iloc[:,:-1]

# One VIF per regressor, printed in column order.
design = X.values
for col_pos, _ in enumerate(X.columns):
    print(variance_inflation_factor(design, col_pos))

2.2071549745904337
13.706320280830282
10.299486200207912
2.4092629110214583


In [72]:
# 'Age' and 'Years of service' show the largest VIFs above. Replacing the pair
# with their difference (the age at joining) keeps information from both
# variables in a single regressor.
df_new = df.copy()
# Plain column arithmetic: same values as a row-wise apply(axis=1), without
# calling a Python lambda once per row.
df_new['Age_at_joining'] = df_new['Age'] - df_new['Years of service']

X_new = df_new[['Gender', 'Age_at_joining', 'Education level']]

# Recompute the VIF of every remaining regressor.
for i in range(len(X_new.columns)):
    print(variance_inflation_factor(X_new.values, i))
    


2.168067939256931
3.3269912119960123
2.407695153124809


# Example Three

In [87]:
#Import boston dataset from sklearn
import pandas as pd
from sklearn.datasets import load_boston
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import statsmodels.api as sm
import numpy as np

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn -- confirm the environment.
boston = load_boston()

# Features and target as labelled DataFrames.
boston_features_df = pd.DataFrame(boston.data, columns=boston.feature_names)
boston_target_df = pd.DataFrame(boston.target, columns=['MEDV'])

X, Y = boston_features_df, boston_target_df

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=5
)

In [85]:
# Baseline model: OLS on every feature.
# The design matrices get an explicit constant column so the fit has a y-intercept.
train_design = sm.add_constant(X_train)
test_design = sm.add_constant(X_test)

model = sm.OLS(Y_train, train_design).fit()
Y_pred = model.predict(test_design)

# Print the regression summary table.
print_model = model.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.738
Model:                            OLS   Adj. R-squared:                  0.730
Method:                 Least Squares   F-statistic:                     84.65
Date:                Mon, 03 May 2021   Prob (F-statistic):          8.21e-105
Time:                        21:01:03   Log-Likelihood:                -1202.0
No. Observations:                 404   AIC:                             2432.
Df Residuals:                     390   BIC:                             2488.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         37.9125      5.775      6.565      0.0

In [86]:
# VIF of every column of the training design matrix (constant included),
# collected into a two-column table.
x_temp = sm.add_constant(X_train)
design = x_temp.values

vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(design, col_pos)
        for col_pos in range(design.shape[1])
    ],
    "features": x_temp.columns,
})
print(vif.round(1))

    VIF Factor features
0        578.6    const
1          1.7     CRIM
2          2.4       ZN
3          4.1    INDUS
4          1.1     CHAS
5          4.6      NOX
6          1.8       RM
7          2.9      AGE
8          4.0      DIS
9          8.1      RAD
10         9.8      TAX
11         1.8  PTRATIO
12         1.3        B
13         2.9    LSTAT


In [88]:
# Address the issue: start again from the original feature set and drop 'RAD'
# ('RAD' and 'TAX' show the largest VIFs in the table above).
boston_new = boston_features_df.drop(columns=['RAD'])
boston_new.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,5.33


In [89]:
# Partition the reduced feature set with the same test size and seed as the
# baseline split.
X_1, Y_1 = boston_new, boston_target_df

X1_train, X1_test, Y1_train, Y1_test = train_test_split(
    X_1, Y_1, test_size=0.20, random_state=5
)

# VIF of every column of the reduced training design matrix (constant included).
x_temp = sm.add_constant(X1_train)
design = x_temp.values

vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(design, col_pos)
        for col_pos in range(design.shape[1])
    ],
    "features": x_temp.columns,
})
print(vif.round(1))

    VIF Factor features
0        532.0    const
1          1.6     CRIM
2          2.3       ZN
3          3.8    INDUS
4          1.1     CHAS
5          4.6      NOX
6          1.8       RM
7          2.9      AGE
8          4.0      DIS
9          3.6      TAX
10         1.8  PTRATIO
11         1.3        B
12         2.9    LSTAT


In [90]:
# Refit OLS on the reduced feature set; a constant column is added to both
# design matrices, as for the baseline model.
train_design_1 = sm.add_constant(X1_train)
test_design_1 = sm.add_constant(X1_test)

model = sm.OLS(Y1_train, train_design_1).fit()
Y1_pred = model.predict(test_design_1)

# Print the regression summary table.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.724
Model:                            OLS   Adj. R-squared:                  0.715
Method:                 Least Squares   F-statistic:                     85.46
Date:                Mon, 03 May 2021   Prob (F-statistic):          2.63e-101
Time:                        21:05:02   Log-Likelihood:                -1212.8
No. Observations:                 404   AIC:                             2452.
Df Residuals:                     391   BIC:                             2504.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.3242      5.680      5.339      0.0

In [95]:
import sklearn
from sklearn import metrics

# Compare hold-out errors of the baseline model (all features) with the model
# refit without 'RAD'. Each label names the metric that is actually computed;
# the squared-error lines were previously also labelled "Mean Absolute Error".
print('Mean Absolute Error (Base):', metrics.mean_absolute_error(Y_test, Y_pred))
print('')
print('Mean Squared Error (Base):', metrics.mean_squared_error(Y_test, Y_pred))
print('')
# The reduced model is scored against its own hold-out target. Y1_test comes
# from the same target frame, test_size and random_state as Y_test.
print('Mean Absolute Error (Not collinear):', metrics.mean_absolute_error(Y1_test, Y1_pred))
print('')
print('Mean Squared Error (Not collinear):', metrics.mean_squared_error(Y1_test, Y1_pred))

Mean Absolute Error (Base): 3.2132704958423868

Mean Absolute Error (Base): 20.86929218377079

Mean Absolute Error (Not collinear): 3.1398352911129916

Mean Absolute Error (Not collinear): 20.461656141085523
