## MultiCollinearity


### Correlation Plot vs Variance Inflation Matrix



In [2]:
import numpy as np
import pandas as pd

In [3]:
data=pd.read_csv("C:\\Users\\admin\\IBM_Training\\Training\\ML Algorithms\\BMI.csv")
data.head()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


In [4]:
data["Gender"]=data["Gender"].map({"Male":0,"Female":1})
data.head()

Unnamed: 0,Gender,Height,Weight,Index
0,0,174,96,4
1,0,189,87,2
2,1,185,110,4
3,1,195,104,3
4,0,149,61,3


In [5]:
X=data.iloc[:,:-1]
X.head()

Unnamed: 0,Gender,Height,Weight
0,0,174,96
1,0,189,87
2,1,185,110
3,1,195,104
4,0,149,61


In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(dataset):
    vif=pd.DataFrame()
    vif["Column"]=dataset.columns
    vif["VIF"]=[variance_inflation_factor(dataset.values,i) for i in range(dataset.shape[1])]
    
    return vif

  import pandas.util.testing as tm


In [7]:
calculate_vif(X)

Unnamed: 0,Column,VIF
0,Gender,2.028864
1,Height,11.623103
2,Weight,10.688377


In [8]:
sb.heatmap(X.corr(),annot=True);

NameError: name 'sb' is not defined

***height and weight have very high values of VIF, indicating that these two variables are highly correlated. This is expected as the height of a person does influence their weight. Hence, considering these two features together leads to a model with high multicollinearity.***

In [None]:
data1=pd.read_csv("C:\\Users\\admin\\IBM_Training\\Training\\ML Algorithms\\HR_data.csv")
data1.head()
data1=data1.iloc[:,:-2]
data1.head()

In [None]:
calculate_vif(data1)

# A wider approach of VIF v/s Correlation

In [94]:
house=pd.read_csv("C:\\Users\\admin\\IBM_Training\\Training\\ML Algorithms\\House Sales.csv")
house.head()

Unnamed: 0,Interior(Sq Ft),# of Bed,# of Bath,# of Rooms,Condo Fee,Tax,Sale Price
0,0,2,1,4,70,490,106000.0
1,850,2,1,5,100,923,95000.0
2,730,2,1,4,185,575,131000.0
3,1058,3,1,5,163,1682,125000.0
4,1585,2,1,6,100,1402,79000.0


In [16]:
def vif_score(dataset):
    vif=pd.DataFrame()
    vif["Independent_Feautes"]=dataset.columns
    vif["VIF_Value"]=[variance_inflation_factor(dataset.values,i) for i in range(dataset.shape[1])]
    return vif

In [None]:
df1=house.iloc[:,:-1]
df1.head()

In [24]:
vif_score(df1)

Unnamed: 0,Independent_Feautes,VIF_Value
0,Interior(Sq Ft),35.436502
1,# of Bed,30.207875
2,# of Bath,12.25403
3,# of Rooms,41.654966
4,Condo Fee,9.023152
5,Tax,11.603921


In [23]:
df1.corr()

Unnamed: 0,Interior(Sq Ft),# of Bed,# of Bath,# of Rooms,Condo Fee,Tax
Interior(Sq Ft),1.0,0.730519,0.563361,0.771653,0.29224,0.592464
# of Bed,0.730519,1.0,0.376967,0.848883,0.060231,0.393161
# of Bath,0.563361,0.376967,1.0,0.298794,0.460524,0.541821
# of Rooms,0.771653,0.848883,0.298794,1.0,0.04696,0.377941
Condo Fee,0.29224,0.060231,0.460524,0.04696,1.0,0.729429
Tax,0.592464,0.393161,0.541821,0.377941,0.729429,1.0


The VIF scores are higher than 10 for most of the variables. The individual coefficients and the p-values will be greatly impacted if we build a regression model with this dataset. We will proceed on how to fix this issue.

#### Fixing Multi-Collinearity --- Dropping Variables

We will consider dropping the features Interior(Sq Ft) and # of Rooms which are having high VIF values because the same information is being captured by other variables. Also, it helps to reduce the redundancy in the dataset.

In [352]:
df2=df1.copy()
df2.drop(columns=["Interior(Sq Ft)","# of Rooms"],inplace=True)

In [353]:
df2.head()

Unnamed: 0,# of Bed,# of Bath,Condo Fee,Tax
0,2,1,70,490
1,2,1,100,923
2,2,1,185,575
3,3,1,163,1682
4,2,1,100,1402


In [34]:
vif_score(df2)

Unnamed: 0,Independent_Feautes,VIF_Value
0,# of Bed,7.515687
1,# of Bath,10.41179
2,Condo Fee,8.582383
3,Tax,10.240541


In [38]:
vif_score(df1).sort_values(by="VIF_Value",ascending=False)

Unnamed: 0,Independent_Feautes,VIF_Value
3,# of Rooms,41.654966
0,Interior(Sq Ft),35.436502
1,# of Bed,30.207875
2,# of Bath,12.25403
5,Tax,11.603921
4,Condo Fee,9.023152


From the above, we can notice that the VIF scores have reduced for other variables also after dropping the high-value (Interior(Sq Ft) and # of Rooms) VIF features.

#### Fixing Multi-Collinearity ---- Combining Variables

In [359]:
df3=df1.copy()
df3.head()

Unnamed: 0,Interior(Sq Ft),# of Bed,# of Bath,# of Rooms,Condo Fee,Tax
0,0,2,1,4,70,490
1,850,2,1,5,100,923
2,730,2,1,4,185,575
3,1058,3,1,5,163,1682
4,1585,2,1,6,100,1402


In [360]:
df3["Total_Rooms"]=df3["# of Bed"]+df3["# of Bath"]
df3.head()
df3.drop(columns=["Interior(Sq Ft)","# of Rooms","# of Bed","# of Bath"],inplace=True)
df3.head()

Unnamed: 0,Condo Fee,Tax,Total_Rooms
0,70,490,3
1,100,923,3
2,185,575,3
3,163,1682,4
4,100,1402,3


In [348]:
df3.head()
vif_score(df3)

Unnamed: 0,Independent_Feautes,VIF_Value
0,Condo Fee,7.472693
1,Tax,10.167316
2,Total_Rooms,5.466786


Now that we have reduced VIF score for both df1 and df2. Now let's make a Linear Regression Model to check the r2 values. On original Data, df1 and df2

## Linear Regression Model on Original Data, df1 and df2

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [63]:
house.head()
#df1.head()
#df2.head()

Unnamed: 0,Interior(Sq Ft),# of Bed,# of Bath,# of Rooms,Condo Fee,Tax,Sale Price
0,0,2,1,4,70,490,106000
1,850,2,1,5,100,923,95000
2,730,2,1,4,185,575,131000
3,1058,3,1,5,163,1682,125000
4,1585,2,1,6,100,1402,79000


#### House Data / Original Data

In [304]:
X=house.iloc[:,:-1]
X.head()
yy=house.iloc[:,6]
yy


0      1,06,000
1      95000.00
2      1,31,000
3      1,25,000
4      79000.00
         ...   
451    2,80,000
452    2,70,000
453    1,85,000
454    2,90,000
455    2,49,500
Name:  Sale Price , Length: 456, dtype: object

In [305]:
y=[pd.to_numeric((yy[i].replace(",",""))) for i in range(yy.shape[0])]
y

[106000,
 95000.0,
 131000,
 125000,
 79000.0,
 135000,
 118000,
 154000,
 111500,
 445000,
 128500,
 154000,
 142000,
 143000,
 212500,
 149000,
 125000,
 189000,
 202000,
 206000,
 175000,
 208000,
 136500,
 166000,
 97000.0,
 75000.0,
 205000,
 172500,
 179000,
 105000,
 166000,
 150700,
 166000,
 145000,
 122000,
 135000,
 165000,
 214500,
 152000,
 165000,
 142000,
 89000.0,
 133000,
 144000,
 110000,
 120000,
 101000,
 125000,
 121000,
 119500,
 138500,
 110000,
 160000,
 179000,
 100000,
 505000,
 215000,
 530000,
 400000,
 380000,
 535000,
 225000,
 370000,
 575000,
 875000,
 440000,
 460000,
 116500,
 153000,
 162000,
 132500,
 139000,
 129000,
 82882.0,
 106000,
 106000,
 261000,
 260000,
 255000,
 256000,
 261000,
 240000,
 231500,
 265000,
 200000,
 178000,
 240000,
 274500,
 112000,
 177000,
 134000,
 160000,
 195000,
 222500,
 180000,
 148000,
 160000,
 136000,
 149000,
 395000,
 376500,
 218000,
 186000,
 205000,
 105000,
 163000,
 109000,
 132000,
 147000,
 141000,
 215

In [330]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [331]:
lr=LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [332]:
pred=lr.predict(X_test)

In [334]:
r2_score(y_test,pred)

0.08931368639938797

## Df2 Data

In [356]:
df2.head()
X=df2.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lr.fit(X_train,y_train)

df2_pred=lr.predict(X_test)
r2_score(y_test,df2_pred)

0.09280646314291163

## Df3 Data

In [363]:
df3.head()

X=df3.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

lr.fit(X_train,y_train)

df3_pred=lr.predict(X_test)
r2_score(y_test,df3_pred)

0.10177054571896804

In [365]:
#For a better view 
print(r2_score(y_test,pred),"Accuracy from Original Data")

print()

print(r2_score(y_test,df2_pred),"Accuracy from Df2 Data where 2 columns were deleted")

print()

print(r2_score(y_test,df3_pred),"Accuracy from df3 Data where 2 columns were delete and 2 columns were combined")

0.08931368639938797 Accuracy from Original Data

0.09280646314291163 Accuracy from Df2 Data where 2 columns were deleted

0.10177054571896804 Accuracy from df3 Data where 2 columns were delete and 2 columns were combined


# Thank You