Reading cleaned data

In [25]:
import pandas as pd

# Load the cleaned housing table and discard the 'Unnamed: 0' column
# (presumably the row index written out when the cleaned CSV was saved --
# confirm against the notebook that produced it).
df = (
    pd.read_csv('../data/processed/housing_cleaned.csv')
    .drop(columns=['Unnamed: 0'])
)
# Preview the last rows as a quick sanity check of the load.
df.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
20428,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND,78100
20429,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND,77100
20430,-121.22,39.43,17,2254,485.0,1007,433,1.7,INLAND,92300
20431,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND,84700
20432,-121.24,39.37,16,2785,616.0,1387,530,2.3886,INLAND,89400


Encoding Ocean Proximity

In [26]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical 'ocean_proximity' column: each category
# becomes its own 0.0/1.0 indicator column named 'ocean_proximity_<category>'.
encoder = OneHotEncoder()
# fit_transform's result is densified with .toarray() so it can be wrapped
# in a regular DataFrame.
encoded = encoder.fit_transform(df[['ocean_proximity']]).toarray()
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['ocean_proximity']),
    # Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a
    # fresh 0..n-1 index here would misalign (and NaN-pad) rows whenever df
    # does not carry the default RangeIndex.
    index=df.index,
)
# Swap the original text column for its indicator columns.
df = pd.concat([df.drop('ocean_proximity', axis=1), encoded_df], axis=1)
df.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
20428,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,0.0,1.0,0.0,0.0,0.0
20429,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,0.0,1.0,0.0,0.0,0.0
20430,-121.22,39.43,17,2254,485.0,1007,433,1.7,92300,0.0,1.0,0.0,0.0,0.0
20431,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,0.0,1.0,0.0,0.0,0.0
20432,-121.24,39.37,16,2785,616.0,1387,530,2.3886,89400,0.0,1.0,0.0,0.0,0.0


Adding Features

In [None]:
import numpy as np

# Ratio features: new column name -> (numerator column, denominator column).
ratio_features = {
    'rooms_per_bedroom': ('total_rooms', 'total_bedrooms'),
    'rooms_per_household': ('total_rooms', 'households'),
    'population_per_household': ('population', 'households'),
}
for feature_name, (numerator, denominator) in ratio_features.items():
    df[feature_name] = df[numerator] / df[denominator]

# Log-transformed house values, computed as log(1 + value) via np.log1p.
df['log_prices'] = np.log1p(df['median_house_value'])
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,rooms_per_bedroom,rooms_per_household,population_per_household,log_prices
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0.0,0.0,0.0,1.0,0.0,6.821705,6.984127,2.555556,13.022766
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0.0,0.0,0.0,1.0,0.0,6.418626,6.238137,2.109842,12.789687
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0.0,0.0,0.0,1.0,0.0,7.721053,8.288136,2.80226,12.771673
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0.0,0.0,0.0,1.0,0.0,5.421277,5.817352,2.547945,12.74052
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0.0,0.0,0.0,1.0,0.0,5.810714,6.281853,2.181467,12.743154


Saving dataset with final features

In [29]:
# Persist the engineered feature table. The row index is written as well
# (index=True is the to_csv default, spelled out here), so the file gains an
# unnamed leading column that pandas labels 'Unnamed: 0' when reading it back.
df.to_csv('../data/processed/housing_final_features.csv', index=True)