In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# NOTE(review): machine-specific absolute Windows path. The notebook only runs where
# this exact location exists; point it at your local copy of housing.csv.
filepath_House_Price = r"C:\Users\aakas\PythonStuff\Projects\Cali_HP\dataset\housing.csv"

In [3]:
# Load the raw housing data; header=0 takes the column names from the file's first row.
houses = pd.read_csv(filepath_House_Price , header = 0 )

In [4]:
houses.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## STEP 2 : Feature Engineering (FE)

In [5]:
df= houses.copy()

In [7]:
# Engineered ratio features: new column -> (numerator column, denominator column).
# Insertion order is kept so the new columns are appended in the same order as before.
ratio_specs = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for new_col, (numerator, denominator) in ratio_specs.items():
    df[new_col] = df[numerator].div(df[denominator])

In [8]:
# Correlation of every numeric column with the target, strongest (by absolute value) first.
corr_with_target = df.corrwith(df["median_house_value"], numeric_only=True)
corr_with_target.sort_values(key=lambda s: s.abs(), ascending=False)

median_house_value          1.000000
median_income               0.688075
bedrooms_per_room          -0.255880
rooms_per_household         0.151948
latitude                   -0.144160
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
longitude                  -0.045967
population                 -0.024650
population_per_household   -0.023737
dtype: float64

In [9]:
# Features screened with VIF below: the three engineered ratios plus the raw count
# columns they were derived from ("mc" presumably stands for multicollinearity).
mc_feat = ['population_per_household','bedrooms_per_room','rooms_per_household','total_rooms',
           'total_bedrooms','population', 'households']

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

#### Syntax : variance_inflation_factor(exog, exog_idx)

- **Explanation :**

    One recommendation is that if the VIF is greater than 5, then the explanatory variable given by 'exog_idx' is highly collinear with the other explanatory variables, and the parameter estimates will have large standard errors because of this.

In [11]:
df_vif = df.copy()

In [12]:
len(df_vif)

20640

In [13]:
# Defining custom function for calculating VIF

def compute_vif(feature_list, dataframe):
    """Return the variance inflation factor (VIF) of each requested feature.

    Parameters
    ----------
    feature_list : sequence of str
        Column names in ``dataframe`` to include in the VIF computation.
    dataframe : pandas.DataFrame
        Source data. It is not modified by this function.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'variable'`` (feature name) and ``'VIF'``. The row for
        the added ``'const'`` column is excluded.
    """
    # Select the columns and drop incomplete rows in one non-mutating step.
    # Calling dropna(inplace=True) on the selection instead is what triggers
    # pandas' SettingWithCopyWarning, because it modifies a slice of the
    # caller's frame in place.
    features = dataframe[list(feature_list)].dropna()

    # Add the constant column that the VIF computation is run against.
    design = add_constant(features, prepend=True)

    # Materialise the array once rather than on every loop iteration.
    design_values = design.values

    vif = pd.DataFrame({
        'variable': design.columns,
        'VIF': [variance_inflation_factor(design_values, i)
                for i in range(design_values.shape[1])],
    })

    # The constant is only a helper column; report the real features only.
    return vif[vif['variable'] != 'const']

In [14]:
import warnings 

# NOTE(review): with no category or module argument this suppresses every warning for
# the rest of the session. Presumably added to hide warnings from the VIF cells below;
# consider narrowing the filter so unrelated warnings stay visible.
warnings.filterwarnings("ignore")

In [15]:
compute_vif(feature_list = mc_feat ,dataframe = df_vif)

Unnamed: 0,variable,VIF
1,population_per_household,1.05776
2,bedrooms_per_room,2.40679
3,rooms_per_household,1.626829
4,total_rooms,17.386008
5,total_bedrooms,45.992472
6,population,6.52294
7,households,42.160467


In [16]:
# Drop 'total_bedrooms' (highest VIF in the table above) from the candidate list.
# The list is rebuilt instead of calling mc_feat.remove(...) so the cell can be
# re-run safely: remove() raises ValueError once the item is already gone.
mc_feat = [feature for feature in mc_feat if feature != 'total_bedrooms']

In [17]:
compute_vif(feature_list = mc_feat , dataframe  = df_vif)

Unnamed: 0,variable,VIF
1,population_per_household,1.055588
2,bedrooms_per_room,1.691705
3,rooms_per_household,1.425318
4,total_rooms,12.749128
5,population,6.133637
6,households,14.97837


In [18]:
# Drop 'households' (highest VIF in the table above) from the candidate list.
# The list is rebuilt instead of calling mc_feat.remove(...) so the cell can be
# re-run safely: remove() raises ValueError once the item is already gone.
mc_feat = [feature for feature in mc_feat if feature != 'households']

In [19]:
compute_vif(feature_list = mc_feat , dataframe  = df_vif)

Unnamed: 0,variable,VIF
1,population_per_household,1.041641
2,bedrooms_per_room,1.349834
3,rooms_per_household,1.297331
4,total_rooms,5.078177
5,population,4.9269


## STEP 3 : Drop Irrelevant Features

In [20]:
# Columns removed from the candidate list during the VIF screening in STEP 2.
# An earlier version of this list also contained 'rooms_per_household'; it is now kept.
drop_col = ['households' , 'total_bedrooms']

In [21]:
# Build the export frame without the high-VIF columns; `df` itself is left unchanged.
df_export = df.drop(columns=drop_col)

In [23]:
df_export.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'population', 'median_income', 'median_house_value', 'ocean_proximity',
       'rooms_per_household', 'bedrooms_per_room', 'population_per_household'],
      dtype='object')

## STEP 4 : Export the DF as a .csv file.

In [24]:
# Write the trimmed feature set to the current working directory.
# NOTE(review): the index is written too (to_csv default), so the file gains an
# unnamed first column; readers need index_col=0, or pass index=False here if no
# consumer relies on that column. Confirm against downstream notebooks.
df_export.to_csv("S2a_Part2_test_FE_VIF_trimming.csv")