In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Toy restaurant data set: three candidate predictors (ZFR, SFR, Year_Old)
# and the target column Cost_for_two, seven rows in total.
d = dict(
    ZFR=[4, 3, 4, 4, 4, 4, 3],
    SFR=[8, 5, 7.5, 7, 6, 6, 5],
    Year_Old=[10, 18, 12, 5, 20, 40, 5],
    Cost_for_two=[1200, 1000, 1300, 600, 400, 200, 300],
)

df = pd.DataFrame(d)
df

Unnamed: 0,ZFR,SFR,Year_Old,Cost_for_two
0,4,8.0,10,1200
1,3,5.0,18,1000
2,4,7.5,12,1300
3,4,7.0,5,600
4,4,6.0,20,400
5,4,6.0,40,200
6,3,5.0,5,300


### VIF (Variance Inflation Factor)

In [3]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def cal_vif(dataset, add_intercept=False):
    """Compute the variance inflation factor (VIF) of every column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric explanatory variables, one per column.
    add_intercept : bool, default False
        If False (the default, matching the historical behaviour of this
        helper), the columns are handed to statsmodels exactly as given.
        statsmodels does not add a constant on its own, so the auxiliary
        regressions have no intercept and the resulting VIFs can be much
        larger than the textbook definition for un-centred data.
        If True, a column of ones is prepended to the design matrix before
        the VIFs are computed; the constant itself is not reported.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataset`` (in the original column order)
        with the columns ``feature`` and ``VIF_Values``.
    """
    # Materialise the design matrix once instead of once per column.
    if add_intercept:
        exog = np.column_stack((np.ones(len(dataset)), dataset.values))
        offset = 1  # skip the constant at position 0
    else:
        exog = dataset.values
        offset = 0

    return pd.DataFrame({
        'feature': dataset.columns,
        'VIF_Values': [
            variance_inflation_factor(exog, offset + i)
            for i in range(dataset.shape[1])
        ],
    })

In [4]:
# Select the predictors by name rather than by position: a positional slice
# (`iloc[:, :-1]`) would start including the target `Cost_for_two` as soon as
# another column is appended to `df` and this cell is re-run.
# `.copy()` makes `features` independent of `df`, so later edits to it neither
# touch `df` nor raise SettingWithCopyWarning.
features = df[['ZFR', 'SFR', 'Year_Old']].copy()
cal_vif(features)

Unnamed: 0,feature,VIF_Values
0,ZFR,196.556082
1,SFR,160.933111
2,Year_Old,5.57294


### Dropping a highly correlated feature

In [5]:
# Pairwise Pearson correlation between all columns of df (target included).
df.corr(method='pearson')

Unnamed: 0,ZFR,SFR,Year_Old,Cost_for_two
ZFR,1.0,0.785553,0.236454,0.097849
SFR,0.785553,1.0,-0.223692,0.586607
Year_Old,0.236454,-0.223692,1.0,-0.380386
Cost_for_two,0.097849,0.586607,-0.380386,1.0


Check which of the variables has the highest correlation with our TARGET variable (`Cost_for_two`).

SFR is more useful than ZFR: its correlation with `Cost_for_two` is 0.587, compared with only 0.098 for ZFR. So we will keep SFR and drop ZFR.

In [6]:
# Keep SFR and Year_Old, leaving ZFR out. The reduced frame is rebuilt from
# `df` by column name instead of dropping in place, so this cell can be re-run
# without a KeyError and without mutating a slice of `df`.
features = df[['SFR', 'Year_Old']].copy()
features

Unnamed: 0,SFR,Year_Old
0,8.0,10
1,5.0,18
2,7.5,12
3,7.0,5
4,6.0,20
5,6.0,40
6,5.0,5


In [7]:
# Recompute the VIFs on the current contents of `features`.
cal_vif(dataset=features)

Unnamed: 0,feature,VIF_Values
0,SFR,2.540858
1,Year_Old,2.540858


# Another Way

In [8]:
# Combine two predictors into one: element-wise product of the SFR and
# Year_Old columns, stored as a new column on df.
df['Product_of_rating_and_year'] = df['SFR'].mul(df['Year_Old'])
df.head(5)

Unnamed: 0,ZFR,SFR,Year_Old,Cost_for_two,Product_of_rating_and_year
0,4,8.0,10,1200,80.0
1,3,5.0,18,1000,90.0
2,4,7.5,12,1300,90.0
3,4,7.0,5,600,35.0
4,4,6.0,20,400,120.0


In [9]:
# New frame without the two source columns of the product feature;
# `df` itself is left unchanged.
data = df.drop(columns=['SFR', 'Year_Old'])
data.head(5)

Unnamed: 0,ZFR,Cost_for_two,Product_of_rating_and_year
0,4,1200,80.0
1,3,1000,90.0
2,4,1300,90.0
3,4,600,35.0
4,4,400,120.0


In [10]:
# NOTE(review): `data` still contains the target column `Cost_for_two`, so the
# target takes part in this VIF computation — confirm that this is intended.
cal_vif(dataset=data)

Unnamed: 0,feature,VIF_Values
0,ZFR,9.894508
1,Cost_for_two,4.623752
2,Product_of_rating_and_year,4.155566
