# VIF

In [1]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.datasets import load_diabetes



In [2]:
# Load scikit-learn's diabetes dataset; its .data and .feature_names are used below to build the feature table
diabetes_data = load_diabetes()

In [3]:
# Function to calculate VIF
def calculate_vif(X):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    statsmodels fits each auxiliary regression on exactly the columns it is
    handed. Without an intercept column the VIFs of features that are not
    mean-centred come out inflated, so an intercept is added here whenever
    ``X`` does not already carry one. The intercept is only a regressor: it
    never appears in the returned table.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table, one column per feature.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` (in the same order) with columns
        ``"Feature"`` (the column label) and ``"VIF"``.
    """
    # A column holding one single non-zero value already acts as the intercept;
    # in that case the matrix is passed through untouched.
    has_intercept = any(
        column.nunique(dropna=False) == 1 and column.iloc[0] != 0
        for _, column in X.items()
    )

    if has_intercept:
        design = X.values
        offset = 0
    else:
        with_intercept = X.copy()
        # allow_duplicates keeps this safe even if X has a column named "const".
        with_intercept.insert(0, "const", 1.0, allow_duplicates=True)
        design = with_intercept.values
        offset = 1  # feature i of X sits at position i + 1 of the design matrix

    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design, position + offset)
        for position in range(len(X.columns))
    ]
    return vif_data

### Calculate VIF for diabetes_df

In [4]:
# Build the feature table: one column per predictor, labelled with the dataset's own feature names
diabetes_df = pd.DataFrame(
    data=diabetes_data.data,
    columns=diabetes_data.feature_names,
)

# Preview the first rows
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [5]:
# Score every feature of the full table and print the result under a heading
vif_df = calculate_vif(diabetes_df)
print("VIF values:", vif_df, sep="\n")

VIF values:
  Feature        VIF
0     age   1.217307
1     sex   1.278071
2     bmi   1.509437
3      bp   1.459428
4      s1  59.202510
5      s2  39.193370
6      s3  15.402156
7      s4   8.890986
8      s5  10.075967
9      s6   1.484623


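## Approaches to removing high-VIF features
- APPROACH 1: Set a threshold and remove every feature above it at once -- Wrong, because the VIF values of the remaining features change each time a feature is removed
- APPROACH 2: Set a threshold and remove features one at a time, always dropping the one with the highest VIF and then recomputing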


### APPROACH 1 - Wrong

In [6]:
# Start from a copy so the original feature table (diabetes_df) stays untouched
diabetes_df_filtered = diabetes_df.copy()

# VIF of every feature in the full set
vif_df = calculate_vif(diabetes_df_filtered)
print(vif_df)

# VIF cut-off: any feature scoring above it is removed
threshold = 5

# One-shot removal: all features above the cut-off are dropped together,
# based on the single VIF table computed above
high_vif_features = vif_df.loc[vif_df['VIF'] > threshold, 'Feature']
diabetes_df_filtered = diabetes_df.drop(columns=high_vif_features)

  Feature        VIF
0     age   1.217307
1     sex   1.278071
2     bmi   1.509437
3      bp   1.459428
4      s1  59.202510
5      s2  39.193370
6      s3  15.402156
7      s4   8.890986
8      s5  10.075967
9      s6   1.484623


In [7]:
# Preview the first three rows of the table left after the one-shot removal
print("Removed features with high VIF:")
diabetes_df_filtered.head(3)

Removed features with high VIF:


Unnamed: 0,age,sex,bmi,bp,s6
0,0.038076,0.05068,0.061696,0.021872,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.02593


### % of features left

In [8]:
# Share of the original feature columns still present in the filtered table
print(f"Percentage of features left = {diabetes_df_filtered.shape[1]/diabetes_df.shape[1]*100} %")

Percentage of featues left = 50.0 %


## APPROACH 2: Set a threshold and remove features one by one, highest VIF first

In [9]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Start Approach 2 from the full feature set again: diabetes_df_filtered was
# reduced by Approach 1, so it is re-created here from the untouched diabetes_df.
diabetes_df_filtered = diabetes_df.copy()  # Copy so that later column drops never touch diabetes_df


In [10]:
# Preview the fresh working copy made for Approach 2
diabetes_df_filtered.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [11]:
# Baseline VIF table for the working copy, before any feature is removed
print(calculate_vif(diabetes_df_filtered))

  Feature        VIF
0     age   1.217307
1     sex   1.278071
2     bmi   1.509437
3      bp   1.459428
4      s1  59.202510
5      s2  39.193370
6      s3  15.402156
7      s4   8.890986
8      s5  10.075967
9      s6   1.484623


In [12]:
# Set a threshold for VIF
threshold = 5

# VIF table for the current feature set. It is refreshed only after a feature
# has been dropped, so every feature set is scored exactly once.
vif_df = calculate_vif(diabetes_df_filtered)
print(vif_df)

# Iteratively remove the feature with the highest VIF until every remaining VIF
# is at or below the threshold. The column-count guard stops the loop before the
# last feature could be dropped (e.g. with a threshold below 1), which would
# otherwise leave an empty VIF table for idxmax to fail on.
while diabetes_df_filtered.shape[1] > 1:
    max_vif_feature = vif_df.loc[vif_df['VIF'].idxmax(), 'Feature']
    max_vif_value = vif_df['VIF'].max()
    if max_vif_value <= threshold:
        break

    # Rebind rather than drop in place, so the frame is never mutated under
    # any other reference to it.
    diabetes_df_filtered = diabetes_df_filtered.drop(columns=max_vif_feature)
    vif_df = calculate_vif(diabetes_df_filtered)

    # Print VIF value and feature name after removing the feature with highest VIF
    print(f"Removed feature with highest VIF: {max_vif_feature} (VIF: {max_vif_value})")
    print("VIF values after removing feature with highest VIF:")
    print(vif_df)



  Feature        VIF
0     age   1.217307
1     sex   1.278071
2     bmi   1.509437
3      bp   1.459428
4      s1  59.202510
5      s2  39.193370
6      s3  15.402156
7      s4   8.890986
8      s5  10.075967
9      s6   1.484623
Removed feature with highest VIF: s1 (VIF: 59.202510134318615)
VIF values after removing feature with highest VIF:
  Feature       VIF
0     age  1.216892
1     sex  1.275049
2     bmi  1.502320
3      bp  1.457413
4      s2  2.926535
5      s3  3.736890
6      s4  7.818670
7      s5  2.172865
8      s6  1.484410
Removed feature with highest VIF: s4 (VIF: 7.818670164713724)
VIF values after removing feature with highest VIF:
  Feature       VIF
0     age  1.216284
1     sex  1.269207
2     bmi  1.498559
3      bp  1.447358
4      s2  1.180838
5      s3  1.473827
6      s5  1.641090
7      s6  1.476913


In [13]:
# Preview the feature table left after the iterative removal
print("Final DataFrame after feature removal:")
diabetes_df_filtered.head()

Final DataFrame after feature removal:


Unnamed: 0,age,sex,bmi,bp,s2,s3,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.034821,-0.043401,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.019163,0.074412,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.034194,-0.032356,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.024991,-0.036038,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.015596,0.008142,-0.031988,-0.046641


## % of features left

In [14]:
# Share of the original feature columns still present after the iterative removal
print(f"Percentage of features left = {diabetes_df_filtered.shape[1]/diabetes_df.shape[1]*100} %")

Percentage of featues left = 80.0 %
