In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# NOTE(review): absolute, machine-specific Windows path — this cell only works
# where the file exists at exactly this location; adjust it for your environment.
filepath_House_Price = r"C:\Users\aakas\PythonStuff\Regression_udemy\resources\Linear_Regression\House_Price.csv"

In [3]:
# Load the raw data; the first line of the file is used as the header row
# (the read_csv default when no column names are passed).
houses = pd.read_csv(filepath_House_Price)

In [4]:
# Preview the first rows to check that the file was parsed as expected.
houses.head()

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,4.99,4.7,5.12,5.06,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146
2,34.7,0.02729,37.07,0.469,7.185,61.1,5.03,4.86,5.01,4.97,22.2,4.03,NO,7.394,101.12,,38,YES,0.045764
3,33.4,0.03237,32.18,0.458,6.998,45.8,6.21,5.93,6.16,5.96,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151
4,36.2,0.06905,32.18,0.458,7.147,54.2,6.16,5.86,6.37,5.86,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474


## STEP 2 : Outlier Treatment - Trimming

In [5]:
# Columns grouped by the outlier rule applied to them in the trimming cell below.

# Normally (or almost normally) distributed columns -> z-score limits.
nor_col = ['room_num']

# Skewed columns -> IQR limits.
skew_col = ['dist1', 'dist2', 'dist3', 'dist4', 'poor_prop', 'parks']

In [6]:
# Work on a copy so the trimming below leaves `houses` untouched.
df_trim = houses.copy()

In [7]:
# Column dtypes and non-null counts before any rows are trimmed.
df_trim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        506 non-null    float64
 1   crime_rate   506 non-null    float64
 2   resid_area   506 non-null    float64
 3   air_qual     506 non-null    float64
 4   room_num     506 non-null    float64
 5   age          506 non-null    float64
 6   dist1        506 non-null    float64
 7   dist2        506 non-null    float64
 8   dist3        506 non-null    float64
 9   dist4        506 non-null    float64
 10  teachers     506 non-null    float64
 11  poor_prop    506 non-null    float64
 12  airport      506 non-null    object 
 13  n_hos_beds   498 non-null    float64
 14  n_hot_rooms  506 non-null    float64
 15  waterbody    506 non-null    object 
 16  rainfall     506 non-null    int64  
 17  bus_ter      506 non-null    object 
 18  parks        506 non-null    float64
dtypes: float

In [8]:
# Trimming: drop every row that holds an outlier in any of the listed columns.
# Columns are handled one after another, so the limits of each column are
# computed on the rows that survived the columns before it.

# Normally (or almost normally) distributed columns:
# keep rows strictly inside mean +/- 3 standard deviations.
for col in nor_col:
    centre = df_trim[col].mean()
    spread = df_trim[col].std()
    inside = df_trim[col].between(centre - 3*spread, centre + 3*spread, inclusive="neither")
    df_trim = df_trim[inside]

# Skewed columns:
# keep rows strictly inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for col in skew_col:
    q1 = df_trim[col].quantile(0.25)
    q3 = df_trim[col].quantile(.75)
    iqr = q3 - q1
    inside = df_trim[col].between(q1 - 1.5*iqr, q3 + 1.5*iqr, inclusive="neither")
    df_trim = df_trim[inside]

In [9]:
# Rows/columns left after trimming (the untrimmed frame has 506 rows).
df_trim.shape

(482, 19)

## STEP 2 : Feature Engineering

#### ******* Initial Correlation ********
    price          1.000000
    poor_prop     -0.740836
    room_num       0.696304
    teachers       0.505655
    resid_area    -0.484754
    air_qual      -0.429300
    parks         -0.391574
    crime_rate    -0.389582
    age           -0.377999
    dist1          0.251355
    dist2          0.249459
    dist4          0.248200
    dist3          0.246650
    n_hos_beds     0.109646
    rainfall      -0.047426
    n_hot_rooms    0.023122

In [10]:
# Start feature engineering from a copy of the raw data.
# NOTE(review): this copies `houses` (506 rows), not the trimmed `df_trim`
# (482 rows), so the outlier trimming above never reaches `df` or the exported
# file, even though the export file name mentions trimming — confirm whether
# `df_trim.copy()` was intended here.
df = houses.copy()

### (i) crime_rate vs price has a power-law relation.

In [11]:
# Log-transform crime_rate.
# The input is read from the untouched `houses` frame rather than from `df`
# itself, so re-running this cell does not take the log of an already
# log-transformed column (the cell is idempotent).
df['crime_rate'] = np.log(houses['crime_rate'])

In [12]:
# plt.figure(figsize = (5,4))
# plt.scatter(x = df_trim['crime_rate'] , y = df_trim['price']  )

# plt.show()

In [13]:
# df_trim.corrwith(other = df_trim['price'] ,numeric_only= True).sort_values(ascending=False, 
#                                                                          key=lambda x: abs(x))

### (ii) poor_prop vs price has a power-law relation.

In [14]:
# Log-transform poor_prop.
# The input is read from the untouched `houses` frame rather than from `df`
# itself, so re-running this cell does not take the log of an already
# log-transformed column (the cell is idempotent).
df['poor_prop'] = np.log(houses['poor_prop'])

In [15]:
# plt.scatter(x = df_trim['poor_prop'] , y = df_trim['price'])

# plt.show()

In [16]:
# df_trim.corrwith(other = df_trim['price'] ,numeric_only= True).sort_values(ascending=False, 
#                                                                          key=lambda x: abs(x))

### (iii) Feature Merging

In [17]:
# Merge the four distance columns into their plain average.
# NOTE: `dist` is an exact linear combination of dist1..dist4, which is why the
# VIF of these columns is reported as inf further down while all five are in
# the feature list together.
df['dist'] = (df['dist1'] + df['dist2'] + df['dist3'] + df['dist4'] )/4

In [18]:
# df_trim.drop(labels = ['dist1', 'dist2', 'dist3', 'dist4'] ,axis = 1 , inplace = True)

In [19]:
# Correlation of every numeric column with price, strongest (by absolute value) first.
price_corr = df.corrwith(df['price'], numeric_only=True)
price_corr.sort_values(key=abs, ascending=False)

price          1.000000
poor_prop     -0.818237
room_num       0.696304
teachers       0.505655
resid_area    -0.484754
crime_rate    -0.457303
air_qual      -0.429300
parks         -0.391574
age           -0.377999
dist1          0.251355
dist2          0.249459
dist           0.249289
dist4          0.248200
dist3          0.246650
n_hos_beds     0.109646
rainfall      -0.047426
n_hot_rooms    0.023122
dtype: float64

###  Multi-Variate Analysis

In [20]:
# List the available columns (now including the merged `dist`) before picking VIF candidates.
df.columns

Index(['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age',
       'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 'airport',
       'n_hos_beds', 'n_hot_rooms', 'waterbody', 'rainfall', 'bus_ter',
       'parks', 'dist'],
      dtype='object')

In [21]:
# Candidate features to check for multicollinearity.
mc_feat = [
    'resid_area', 'room_num', 'teachers', 'poor_prop',
    'dist', 'parks', 'air_qual', 'age',
    'dist1', 'dist2', 'dist3', 'dist4',
]

In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

#### Syntax : variance_inflation_factor(exog, exog_idx)

- **Explanation :**

    One recommendation is that if VIF is greater than 5, then the explanatory variable given by 

    'exog_idx' is highly collinear with the other explanatory variables, and 

    the parameter estimates will have large standard errors because of this.

In [23]:
# Separate copy for the VIF analysis so that dropping rows below does not affect `df`.
df_vif = df.copy()

In [24]:
# Row count before rows with missing values are dropped.
len(df_vif)

506

In [25]:
# Keep only the rows that have no missing value in any column.
df_vif = df_vif.dropna()

In [26]:
# df_vif.isnull().sum()

In [27]:
# Defining custom function for calculating VIF

def compute_vif(feature_list ,dataframe):
    """Compute the variance inflation factor (VIF) of each listed feature.

    Parameters
    ----------
    feature_list : list of str
        Names of the columns of ``dataframe`` to evaluate.
    dataframe : pandas.DataFrame
        Frame holding the feature columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variable`` and ``VIF``, one row per feature. The row of
        the added intercept column (``const``) is filtered out, so the
        remaining rows keep their original positional index.
    """
    # Select the features and drop rows with NaN in one step. `.dropna()`
    # returns a new frame; the previous `dropna(inplace=True)` on a column
    # selection of the caller's frame can raise SettingWithCopyWarning.
    X = dataframe[feature_list].dropna()

    # Add an intercept column as the first column; it is part of the design
    # matrix used for the VIF computation but is not reported.
    X = add_constant(X ,prepend=True)

    # One VIF value per column of the design matrix.
    vif = pd.DataFrame()
    vif['variable'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values ,i) for i in range(X.shape[1])]

    # Report the features only, not the intercept.
    vif = vif[vif['variable'] != 'const']

    return vif

In [28]:
import warnings 

warnings.filterwarnings("ignore")

In [29]:
# VIF for the full candidate list (dist together with dist1..dist4).
compute_vif(mc_feat, df_vif)

Unnamed: 0,variable,VIF
1,resid_area,3.155353
2,room_num,2.019201
3,teachers,1.353041
4,poor_prop,3.151735
5,dist,inf
6,parks,6.208761
7,air_qual,8.736281
8,age,3.112176
9,dist1,inf
10,dist2,inf


In [30]:
# df_houses['parks'].corr(df_houses['air_qual'])

In [31]:
# Removing 'dist4' feature.
# Filtering (instead of list.remove) keeps the cell re-runnable: running it a
# second time is a no-op rather than a ValueError.
mc_feat = [feat for feat in mc_feat if feat != 'dist4']

In [32]:
# Recompute VIF for the current feature list.
compute_vif(mc_feat, df_vif)

Unnamed: 0,variable,VIF
1,resid_area,3.155353
2,room_num,2.019201
3,teachers,1.353041
4,poor_prop,3.151735
5,dist,1552.914157
6,parks,6.208761
7,air_qual,8.736281
8,age,3.112176
9,dist1,481.073775
10,dist2,508.997402


In [33]:
# Removing 'dist3' feature.
# Filtering (instead of list.remove) keeps the cell re-runnable: running it a
# second time is a no-op rather than a ValueError.
mc_feat = [feat for feat in mc_feat if feat != 'dist3']

In [34]:
# Recompute VIF for the current feature list.
compute_vif(mc_feat, df_vif)

Unnamed: 0,variable,VIF
1,resid_area,3.155316
2,room_num,2.016226
3,teachers,1.348241
4,poor_prop,3.143032
5,dist,1010.253937
6,parks,6.162243
7,air_qual,8.726375
8,age,3.111914
9,dist1,478.366103
10,dist2,508.075705


In [35]:
# Removing 'dist2' feature.
# Filtering (instead of list.remove) keeps the cell re-runnable: running it a
# second time is a no-op rather than a ValueError.
mc_feat = [feat for feat in mc_feat if feat != 'dist2']

In [36]:
# Recompute VIF for the current feature list.
compute_vif(mc_feat, df_vif)

Unnamed: 0,variable,VIF
1,resid_area,3.155261
2,room_num,2.013163
3,teachers,1.344719
4,poor_prop,3.143029
5,dist,481.747061
6,parks,6.161973
7,air_qual,8.709043
8,age,3.105518
9,dist1,478.116562


In [37]:
# Removing 'dist1' feature.
# Filtering (instead of list.remove) keeps the cell re-runnable: running it a
# second time is a no-op rather than a ValueError.
mc_feat = [feat for feat in mc_feat if feat != 'dist1']

In [38]:
# Recompute VIF for the current feature list.
compute_vif(mc_feat, df_vif)

Unnamed: 0,variable,VIF
1,resid_area,3.148902
2,room_num,2.00591
3,teachers,1.344648
4,poor_prop,3.14226
5,dist,3.247198
6,parks,6.136865
7,air_qual,8.682828
8,age,3.104732


In [39]:
# Removing 'parks' feature.
# Filtering (instead of list.remove) keeps the cell re-runnable: running it a
# second time is a no-op rather than a ValueError.
mc_feat = [feat for feat in mc_feat if feat != 'parks']

In [40]:
# Recompute VIF for the current feature list.
compute_vif(mc_feat, df_vif)

Unnamed: 0,variable,VIF
1,resid_area,3.146775
2,room_num,2.005444
3,teachers,1.344028
4,poor_prop,3.140143
5,dist,3.247177
6,air_qual,3.767779
7,age,3.104622


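#### Conclusion : we have to do feature engineering

- **dist1** , **dist2** , **dist3** , **dist4** are almost perfectly collinear with their average **dist** (VIF in the hundreds, or infinite), and **parks** has a VIF above 5 whose removal brings **air_qual** below 5, so we can drop these features.
- The **bus_ter** feature has only a single categorical value.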

## STEP 3 : Drop Irrelevant Features

- Dropping :  **dist1** , **dist2** , **dist3** , **dist4** , **parks** , **bus_ter** features

In [41]:
# Drop the four raw distance columns, parks and bus_ter; `df` itself is left unchanged.
df_export = df.drop(columns=['dist1', 'dist2', 'dist3', 'dist4', 'parks', 'bus_ter'])

In [42]:
# Quick look at the first two rows of the frame that will be exported.
df_export.head(2)

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,dist
0,24.0,-5.064036,32.31,0.538,6.575,65.2,24.7,1.60543,YES,5.48,11.192,River,23,4.0875
1,21.6,-3.600502,37.07,0.469,6.421,78.9,22.2,2.21266,NO,7.332,12.1728,Lake,42,4.9675


## STEP 4 : Export the DF as a .csv file.

In [43]:
# Write the engineered frame to disk as CSV.
# - The ".csv" extension is added so the file name matches its format.
#   NOTE: anything that reads the old extension-less name must be updated.
# - index=False: the index is only the default 0..n-1 row counter; writing it
#   would add an unnamed extra column that comes back as "Unnamed: 0" when the
#   file is read again.
df_export.to_csv("S2a_Part2_FE_VIF_trimming.csv", index=False)