In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# NOTE(review): absolute, user-specific Windows path - the notebook only runs on a
# machine with this exact directory layout; consider a path relative to a
# configurable data directory.
filepath_House_Price = r"C:\Users\aakas\PythonStuff\Projects\Cali_HP\dataset\housing.csv"

In [3]:
houses = pd.read_csv(filepath_House_Price , header = 0 )

In [4]:
houses.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## STEP 2 : Package importing 

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

In [6]:
def num_cat_col_list(DF):
    """Partition the column labels of ``DF`` by dtype.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_labels, string_labels)`` in the frame's column order.
        Columns that are neither numeric nor string-typed appear in
        neither list.
    """
    from pandas.api.types import is_numeric_dtype, is_string_dtype

    numeric_labels, string_labels = [], []

    for label in DF:
        series = DF[label]

        # The numeric check takes precedence over the string check.
        if is_numeric_dtype(series):
            numeric_labels.append(label)
            continue

        if is_string_dtype(series):
            string_labels.append(label)

    return (numeric_labels, string_labels)

In [7]:
num_list , cat_list = num_cat_col_list(houses)

print("Numerical column : ",num_list)
print("Categorical column : ",cat_list)

Numerical column :  ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
Categorical column :  ['ocean_proximity']


## STEP 3 : Missing Value Imputation

In [8]:
df = houses.copy()

### Making changes to num_feat and cat_feat list

In [9]:
# num_list

num_feat = num_list.copy()
num_feat.remove('median_house_value')

cat_feat = cat_list.copy()

### Train - Test Split

In [10]:
D_train , D_test = train_test_split(df ,test_size= 0.2 ,random_state = 100 )

In [11]:
D_train.shape , D_test.shape

((16512, 10), (4128, 10))

In [12]:
D_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 3278 to 5640
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16354 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


In [13]:
X_train = D_train.drop(labels= ['median_house_value'] , axis = 1)
y_train = D_train['median_house_value']

X_test = D_test.drop(labels= ['median_house_value'] , axis = 1)
y_test = D_test['median_house_value']

In [14]:
X_train.shape , y_train.shape , X_test.shape , y_test.shape

((16512, 9), (16512,), (4128, 9), (4128,))

### Pipeline

### Pipelines :  numerical and categorical features

In [15]:
# ohe = OneHotEncoder()

In [16]:
num_pipeline = Pipeline([('Imputer' , SimpleImputer(strategy='median')) ,
                        ])

In [17]:
# cat_pipeline = Pipeline([('Nominal Encoding' ,ohe)])

### Main-Pipeline

In [18]:
pass_col = ['ocean_proximity']

In [19]:
pipeline = ColumnTransformer([("Num pipeline" , num_pipeline , num_feat),
                              ("dont' change", "passthrough", pass_col)
                             ])

### Transforming

In [20]:
X_train_tr = pipeline.fit_transform(X_train)

In [21]:
X_test_tr = pipeline.transform(X_test)

## STEP 4 : Create DataFrames from the transformed 2D arrays

In [22]:
feat = pipeline.get_feature_names_out()
feat

array(['Num pipeline__longitude', 'Num pipeline__latitude',
       'Num pipeline__housing_median_age', 'Num pipeline__total_rooms',
       'Num pipeline__total_bedrooms', 'Num pipeline__population',
       'Num pipeline__households', 'Num pipeline__median_income',
       "dont' change__ocean_proximity"], dtype=object)

In [23]:
# ColumnTransformer prefixes every output feature with "<transformer name>__";
# keep only the part after the last "__", i.e. the original column name.
out_cols = [name.split("__")[-1] for name in pipeline.get_feature_names_out()]
print(out_cols)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity']


In [24]:
len(out_cols)

9

In [25]:
X_train_tr[0]

array([-122.7, 39.14, 13.0, 532.0, 111.0, 214.0, 62.0, 3.3929, 'INLAND'],
      dtype=object)

In [26]:
X_train_tr_df = pd.DataFrame(X_train_tr, columns= out_cols)

X_test_tr_df = pd.DataFrame(X_test_tr , columns = out_cols)

In [27]:
# Numeric columns that need converting back to a float dtype with pd.to_numeric
# (X_train_tr[0] above is an object-dtype array).
columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Apply the same conversion to the train and test frames; downcast="float"
# picks the smallest float dtype that pandas considers able to hold the values.
for frame in (X_train_tr_df, X_test_tr_df):
    for name in columns:
        frame[name] = pd.to_numeric(frame[name], downcast="float")
       

In [28]:
X_train_tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float32
 1   latitude            16512 non-null  float32
 2   housing_median_age  16512 non-null  float32
 3   total_rooms         16512 non-null  float32
 4   total_bedrooms      16512 non-null  float32
 5   population          16512 non-null  float32
 6   households          16512 non-null  float32
 7   median_income       16512 non-null  float32
 8   ocean_proximity     16512 non-null  object 
dtypes: float32(8), object(1)
memory usage: 645.1+ KB


In [29]:
X_test_tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4128 entries, 0 to 4127
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float32
 1   latitude            4128 non-null   float32
 2   housing_median_age  4128 non-null   float32
 3   total_rooms         4128 non-null   float32
 4   total_bedrooms      4128 non-null   float32
 5   population          4128 non-null   float32
 6   households          4128 non-null   float32
 7   median_income       4128 non-null   float32
 8   ocean_proximity     4128 non-null   object 
dtypes: float32(8), object(1)
memory usage: 161.4+ KB


In [30]:
X_train_tr_df.shape , y_train.shape

((16512, 9), (16512,))

In [31]:
D_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
3278,-122.7,39.14,13.0,532.0,111.0,214.0,62.0,3.3929,108300.0,INLAND
16630,-120.83,35.32,11.0,3252.0,701.0,1814.0,660.0,3.2226,183200.0,NEAR OCEAN
18748,-122.31,40.49,18.0,4026.0,718.0,1731.0,705.0,3.35,118400.0,INLAND
14961,-116.92,32.76,7.0,1659.0,237.0,862.0,242.0,5.2741,249400.0,<1H OCEAN
1740,-122.34,37.97,19.0,2237.0,580.0,1438.0,551.0,2.3382,120700.0,NEAR BAY


In [32]:
y_train.head()

3278     108300.0
16630    183200.0
18748    118400.0
14961    249400.0
1740     120700.0
Name: median_house_value, dtype: float64

In [33]:
X_train_tr_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.699997,39.139999,13.0,532.0,111.0,214.0,62.0,3.3929,INLAND
1,-120.830002,35.32,11.0,3252.0,701.0,1814.0,660.0,3.2226,NEAR OCEAN
2,-122.309998,40.490002,18.0,4026.0,718.0,1731.0,705.0,3.35,INLAND
3,-116.919998,32.759998,7.0,1659.0,237.0,862.0,242.0,5.2741,<1H OCEAN
4,-122.339996,37.970001,19.0,2237.0,580.0,1438.0,551.0,2.3382,NEAR BAY


In [34]:
y_train = y_train.reset_index(drop =  True)
y_train.head()

0    108300.0
1    183200.0
2    118400.0
3    249400.0
4    120700.0
Name: median_house_value, dtype: float64

In [35]:
# y_train_df = pd.DataFrame(y_train , columns=["median_house_value"])

In [36]:
# y_train_df.head()

In [37]:
df_train_trans = pd.concat([X_train_tr_df,y_train] ,axis = 1 )

In [38]:
df_train_trans.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.699997,39.139999,13.0,532.0,111.0,214.0,62.0,3.3929,INLAND,108300.0
1,-120.830002,35.32,11.0,3252.0,701.0,1814.0,660.0,3.2226,NEAR OCEAN,183200.0
2,-122.309998,40.490002,18.0,4026.0,718.0,1731.0,705.0,3.35,INLAND,118400.0
3,-116.919998,32.759998,7.0,1659.0,237.0,862.0,242.0,5.2741,<1H OCEAN,249400.0
4,-122.339996,37.970001,19.0,2237.0,580.0,1438.0,551.0,2.3382,NEAR BAY,120700.0


In [39]:
df_train_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float32
 1   latitude            16512 non-null  float32
 2   housing_median_age  16512 non-null  float32
 3   total_rooms         16512 non-null  float32
 4   total_bedrooms      16512 non-null  float32
 5   population          16512 non-null  float32
 6   households          16512 non-null  float32
 7   median_income       16512 non-null  float32
 8   ocean_proximity     16512 non-null  object 
 9   median_house_value  16512 non-null  float64
dtypes: float32(8), float64(1), object(1)
memory usage: 774.1+ KB


In [40]:
X_test_tr_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-118.099998,33.810001,36.0,1111.0,184.0,444.0,177.0,3.7031,<1H OCEAN
1,-122.269997,37.82,52.0,1630.0,456.0,1162.0,400.0,1.2475,NEAR BAY
2,-119.120003,35.369999,13.0,4527.0,713.0,2170.0,671.0,4.8266,INLAND
3,-123.150002,39.310001,19.0,1026.0,205.0,424.0,152.0,2.8833,INLAND
4,-122.519997,37.98,31.0,6555.0,1571.0,2962.0,1464.0,2.8903,NEAR BAY


In [41]:
y_test = y_test.reset_index(drop =  True)
y_test.head()

0    245300.0
1    104200.0
2    146200.0
3    154200.0
4    324200.0
Name: median_house_value, dtype: float64

In [42]:
df_test_trans = pd.concat([X_test_tr_df,y_test] ,axis = 1 )
df_test_trans.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-118.099998,33.810001,36.0,1111.0,184.0,444.0,177.0,3.7031,<1H OCEAN,245300.0
1,-122.269997,37.82,52.0,1630.0,456.0,1162.0,400.0,1.2475,NEAR BAY,104200.0
2,-119.120003,35.369999,13.0,4527.0,713.0,2170.0,671.0,4.8266,INLAND,146200.0
3,-123.150002,39.310001,19.0,1026.0,205.0,424.0,152.0,2.8833,INLAND,154200.0
4,-122.519997,37.98,31.0,6555.0,1571.0,2962.0,1464.0,2.8903,NEAR BAY,324200.0


In [43]:
df_test_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4128 entries, 0 to 4127
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float32
 1   latitude            4128 non-null   float32
 2   housing_median_age  4128 non-null   float32
 3   total_rooms         4128 non-null   float32
 4   total_bedrooms      4128 non-null   float32
 5   population          4128 non-null   float32
 6   households          4128 non-null   float32
 7   median_income       4128 non-null   float32
 8   ocean_proximity     4128 non-null   object 
 9   median_house_value  4128 non-null   float64
dtypes: float32(8), float64(1), object(1)
memory usage: 193.6+ KB


## STEP 5 : Outlier Treatment

In [44]:
# only on training dataset

In [45]:
df_train_trim = df_train_trans.copy()

In [46]:
df_train_trim.shape

(16512, 10)

In [47]:
# Z-score for Normally or almost normally distributed data
norcol = []
skewcol = ['total_rooms','total_bedrooms', 'population','households', 'median_income']

In [48]:
def trimming(DF, nor_col=(), skew_col=()):
    """Remove rows that hold outliers in the given columns.

    Columns are processed one after another, so the limits for each column
    are computed from the rows that survived the columns before it.

    Parameters
    ----------
    DF : pandas.DataFrame
        Input frame; it is not modified.
    nor_col : iterable of column labels, optional
        Columns treated as (approximately) normally distributed. Rows are
        kept only when the value lies strictly within mean +/- 3 standard
        deviations.
    skew_col : iterable of column labels, optional
        Columns treated as skewed. Rows are kept only when the value lies
        strictly within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the surviving rows; the original index
        labels are preserved. Rows whose value is NaN in a processed column
        are dropped, because the comparisons evaluate to False for NaN.
    """
    # Z-score rule for normally (or almost normally) distributed columns.
    for col in nor_col:
        mean_col = DF[col].mean()
        std_col = DF[col].std()

        lower_limit = mean_col - 3 * std_col
        upper_limit = mean_col + 3 * std_col

        DF = DF[(DF[col] > lower_limit) & (DF[col] < upper_limit)]

    # IQR rule for skewed columns.
    for col in skew_col:
        percentile25 = DF[col].quantile(0.25)
        percentile75 = DF[col].quantile(0.75)
        iqr = percentile75 - percentile25

        lower_limit = percentile25 - 1.5 * iqr
        upper_limit = percentile75 + 1.5 * iqr

        DF = DF[(DF[col] > lower_limit) & (DF[col] < upper_limit)]

    # Hand back an explicit copy: a boolean-filtered frame is flagged by pandas
    # with SettingWithCopyWarning as soon as the caller adds a column to it, and
    # with no columns requested the caller would otherwise receive the very
    # same object that was passed in.
    return DF.copy()

In [49]:
df_train_trim_tr = trimming(DF = df_train_trim , nor_col= norcol , skew_col= skewcol )

In [50]:
df_train_trim_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14051 entries, 0 to 16511
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           14051 non-null  float32
 1   latitude            14051 non-null  float32
 2   housing_median_age  14051 non-null  float32
 3   total_rooms         14051 non-null  float32
 4   total_bedrooms      14051 non-null  float32
 5   population          14051 non-null  float32
 6   households          14051 non-null  float32
 7   median_income       14051 non-null  float32
 8   ocean_proximity     14051 non-null  object 
 9   median_house_value  14051 non-null  float64
dtypes: float32(8), float64(1), object(1)
memory usage: 768.4+ KB


In [51]:
df_train_trim.shape

(16512, 10)

In [52]:
df_train_trim_tr.shape

(14051, 10)

## STEP 6 : Feature Engineering

In [53]:
# df_train_trim_tr

In [54]:
# Ratio features for the training set.
# `.assign` builds a new frame and rebinds the name instead of writing columns
# into a frame that may be a filtered slice of another one (the pattern pandas
# reports as SettingWithCopyWarning).
df_train_trim_tr = df_train_trim_tr.assign(
    rooms_per_household=lambda d: d["total_rooms"] / d["households"],
    bedrooms_per_room=lambda d: d["total_bedrooms"] / d["total_rooms"],
    population_per_household=lambda d: d["population"] / d["households"],
)

In [55]:
df_train_trim_tr.shape

(14051, 13)

In [56]:
# df_test_trans

In [57]:
# Same three ratio features as for the training set, added to the test frame
# in place: new column -> (numerator column, denominator column).
ratio_features = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}

for new_col, (numerator, denominator) in ratio_features.items():
    df_test_trans[new_col] = df_test_trans[numerator] / df_test_trans[denominator]

In [58]:
df_test_trans.shape

(4128, 13)

In [59]:
df_train_trim_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14051 entries, 0 to 16511
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 14051 non-null  float32
 1   latitude                  14051 non-null  float32
 2   housing_median_age        14051 non-null  float32
 3   total_rooms               14051 non-null  float32
 4   total_bedrooms            14051 non-null  float32
 5   population                14051 non-null  float32
 6   households                14051 non-null  float32
 7   median_income             14051 non-null  float32
 8   ocean_proximity           14051 non-null  object 
 9   median_house_value        14051 non-null  float64
 10  rooms_per_household       14051 non-null  float32
 11  bedrooms_per_room         14051 non-null  float32
 12  population_per_household  14051 non-null  float32
dtypes: float32(11), float64(1), object(1)
memory usage: 933.1+ KB

In [60]:
df_train_trim_tr.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'median_house_value', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [61]:
df_train_trim_tr.corrwith(other = df_train_trim_tr['median_house_value'] ,numeric_only= True).sort_values(ascending=False, 
                                                                         key=lambda x: abs(x))

median_house_value          1.000000
median_income               0.632326
population_per_household   -0.206809
bedrooms_per_room          -0.187524
total_rooms                 0.185251
latitude                   -0.157214
housing_median_age          0.128496
rooms_per_household         0.102412
households                  0.101350
total_bedrooms              0.078214
population                 -0.052800
longitude                  -0.038052
dtype: float64

In [62]:
mc_feat = ['population_per_household','bedrooms_per_room','rooms_per_household','total_rooms',
           'total_bedrooms','population', 'households']

In [63]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

#### Syntax : variance_inflation_factor(exog, exog_idx)

- **Explanation :**

    One recommendation is that if VIF is greater than 5, then the explanatory variable given by 

    'exog_idx' is highly collinear with the other explanatory variables, and 

    the parameter estimates will have large standard errors because of this.

In [64]:
df_train_tr = df_train_trim_tr.copy()

In [65]:
len(df_train_tr)

14051

In [66]:
# Defining custom function for calculating VIF

def compute_vif(feature_list, dataframe):
    """Compute the variance inflation factor (VIF) for each listed feature.

    Parameters
    ----------
    feature_list : list of column labels
        Columns of ``dataframe`` to include in the calculation.
    dataframe : pandas.DataFrame
        Source data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variable`` and ``VIF``, one row per feature. The row
        for the added ``const`` column is removed before returning.
    """
    # Select the features and drop incomplete rows in a single non-mutating step.
    # An in-place dropna on the column selection operates on a derived frame and
    # makes pandas emit SettingWithCopyWarning.
    X = dataframe[feature_list].dropna()

    # Add the constant column as the first column of the design matrix.
    X = add_constant(X, prepend=True)

    vif = pd.DataFrame(
        {
            "variable": X.columns,
            "VIF": [
                variance_inflation_factor(X.values, i) for i in range(X.shape[1])
            ],
        }
    )

    # Only the VIFs of the real features are of interest to the caller.
    return vif[vif["variable"] != "const"]

In [67]:
import warnings 

# NOTE(review): blanket filter - hides every warning category in all cells run
# after this one; presumably added for the VIF cells below. Consider scoping it
# with warnings.catch_warnings() or filtering a specific category instead.
warnings.filterwarnings("ignore")

In [68]:
compute_vif(feature_list = mc_feat ,dataframe = df_train_tr)

Unnamed: 0,variable,VIF
1,population_per_household,1.85427
2,bedrooms_per_room,2.886543
3,rooms_per_household,1.696808
4,total_rooms,12.747789
5,total_bedrooms,24.918016
6,population,6.77502
7,households,27.02979


In [69]:
# lets remove 'households' column

mc_feat.remove('households')

In [70]:
compute_vif(feature_list = mc_feat , dataframe  = df_train_tr)

Unnamed: 0,variable,VIF
1,population_per_household,1.669053
2,bedrooms_per_room,2.811717
3,rooms_per_household,1.295379
4,total_rooms,12.558651
5,total_bedrooms,12.737534
6,population,5.050763


In [71]:
# lets remove 'rooms_per_household' column

# mc_feat.remove('rooms_per_household')

In [82]:
# compute_vif(feature_list = mc_feat , dataframe  = df_train_tr)

In [73]:
# lets remove 'total_bedrooms' column

mc_feat.remove('total_bedrooms')

In [74]:
compute_vif(feature_list = mc_feat , dataframe  = df_train_tr)

Unnamed: 0,variable,VIF
1,population_per_household,1.503268
2,bedrooms_per_room,1.434047
3,rooms_per_household,1.295317
4,total_rooms,4.709111
5,population,4.362234


## STEP 7 : Drop Irrelevant Features

In [75]:
# drop_col = ['households' , 'rooms_per_household' , 'total_bedrooms']
drop_col = ['households' , 'total_bedrooms']

In [76]:
df_test_export = df_test_trans.drop(labels = drop_col ,axis = 1)

In [77]:
df_train_export = df_train_tr.drop(labels = drop_col ,axis = 1)                 

In [78]:
df_test_export.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,median_income,ocean_proximity,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
0,-118.099998,33.810001,36.0,1111.0,444.0,3.7031,<1H OCEAN,245300.0,6.276836,0.165617,2.508475
1,-122.269997,37.82,52.0,1630.0,1162.0,1.2475,NEAR BAY,104200.0,4.075,0.279755,2.905


In [79]:
df_train_export.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,median_income,ocean_proximity,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.699997,39.139999,13.0,532.0,214.0,3.3929,INLAND,108300.0,8.580646,0.208647,3.451613
1,-120.830002,35.32,11.0,3252.0,1814.0,3.2226,NEAR OCEAN,183200.0,4.927273,0.21556,2.748485


## STEP 8 : Export the DataFrames as .csv files

In [80]:
# Writes to the current working directory. With the default index=True the row
# index is stored as an extra, unnamed first column of the CSV.
df_test_export.to_csv("S2_Part2_test_DP_Trimming1_test_dataset.csv")

In [81]:
# Writes to the current working directory. With the default index=True the row
# index is stored as an extra, unnamed first column; after outlier trimming that
# index has gaps (14051 rows labelled 0 to 16511).
df_train_export.to_csv("S2_Part2_test_DP_Trimming1_train_dataset.csv")