In [1]:
import os
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
hdf = pd.read_csv('housing.csv')

In [3]:
corr_M = hdf.corr()
corr_M['median_house_value']

longitude            -0.045967
latitude             -0.144160
housing_median_age    0.105623
total_rooms           0.134153
total_bedrooms        0.049686
population           -0.024650
households            0.065843
median_income         0.688075
median_house_value    1.000000
Name: median_house_value, dtype: float64

In [4]:
hdf.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
mediantb = hdf['total_bedrooms'].median()
hdf['total_bedrooms'].fillna(mediantb, inplace = True)

In [6]:
hdf['rooms_per_house'] = hdf['total_rooms'] / hdf['households']
hdf['bebrooms_per_room'] = hdf['total_bedrooms'] / hdf['total_rooms']
hdf['population_per_house_hold'] = hdf['population'] / hdf['households']

In [7]:
corr_M = hdf.corr()
corr_M['median_house_value']

longitude                   -0.045967
latitude                    -0.144160
housing_median_age           0.105623
total_rooms                  0.134153
total_bedrooms               0.049457
population                  -0.024650
households                   0.065843
median_income                0.688075
median_house_value           1.000000
rooms_per_house              0.151948
bebrooms_per_room           -0.233303
population_per_house_hold   -0.023737
Name: median_house_value, dtype: float64

In [8]:
#drop median_house_value >500000
new_hdf = hdf[hdf['median_house_value']<500000]

In [9]:
corr_M = new_hdf.corr()
corr_M['median_house_value']

longitude                   -0.045733
latitude                    -0.149257
housing_median_age           0.065139
total_rooms                  0.144988
total_bedrooms               0.074704
population                   0.013592
households                   0.095634
median_income                0.646719
median_house_value           1.000000
rooms_per_house              0.111581
bebrooms_per_room           -0.199733
population_per_house_hold   -0.021205
Name: median_house_value, dtype: float64

In [10]:
X = new_hdf.drop(['median_house_value'], axis = 1)
Y = new_hdf['median_house_value']

In [11]:
h_cat_en, h_categories = X['ocean_proximity'].factorize()

In [12]:
from sklearn.preprocessing import OneHotEncoder

oh = OneHotEncoder()
coded = oh.fit_transform(h_cat_en.reshape(-1,1))

proxy = pd.DataFrame(coded.toarray(), index = X.index, columns = ['1','2','3','4','5'])

X = pd.concat([X, proxy], axis = 1)
X = X.drop(['ocean_proximity'], axis = 1)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train,  Y_test =  train_test_split(X, Y, test_size = 0.2, random_state = 42)

Linear Regression

In [14]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, Y_train)

LinearRegression()

In [15]:
someData = X_test[:6]
someLabels = Y_test[:6]

In [16]:
# predict
preds = linreg.predict(someData)

In [17]:
#mean squared error
from sklearn.metrics import mean_squared_error
import math
mse = mean_squared_error(preds, someLabels)
rmse = math.sqrt(mse)
rmse

43983.90092161152

In [18]:
linreg.coef_

array([-2.48424201e+04, -2.27964470e+04,  9.11223268e+02, -1.87808193e+00,
        1.79846585e+01, -3.11508749e+01,  9.77714477e+01,  3.84220620e+04,
        2.14488924e+03,  1.23798954e+05,  3.41301729e+01, -3.32722155e+04,
       -2.51928864e+04, -6.23470955e+04, -2.08644991e+04,  1.41676697e+05])

Decision Tree Regressor

In [19]:
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()
tree.fit(X_train, Y_train)

DecisionTreeRegressor()

In [20]:
someData = X_test[:6]
someLabels = Y_test[:6]

In [21]:
# predict
preds = tree.predict (someData)

In [22]:
#mean squared error
mse = mean_squared_error(preds, someLabels)
rmse = math.sqrt(mse)
rmse

71948.7780762583

Random Forest Regressor

In [23]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
forest.fit(X_train, Y_train)

RandomForestRegressor()

In [24]:
someData = X_test[:6]
someLabels = Y_test[:6]

In [25]:
preds = forest.predict (someData)

In [26]:
#mean squared error
mse = mean_squared_error(preds, someLabels)
rmse = math.sqrt(mse)
rmse

50179.25825976307