In [22]:
#importing the necessary libraries
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import matplotlib.pyplot as plt
from numpy.linalg import inv

In [23]:
#univariate linear regression closed form
data = pd.read_csv('../datasets/housing/housing_multivariate.csv')

In [24]:
# doing following since data contains null/NAN value
data2 = data.fillna(method='ffill')

#can also use x = x[~numpy.isnan(x)] for a feature

data2.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
dtype: bool

In [25]:
#normalizing the data
#data2 = (data2 - data2.mean())/data2.std()
#data2.head()
mean  = data2.mean()
deviation = data2.std()

In [26]:
size = len(data2)
train_len = int(size * 0.8)
test_len = size - train_len

In [27]:
# Split the data into training/testing sets

#len(data[:train_len])
#len(data[:test_len])
train = np.array(data2[:train_len])
test = np.array(data2[:test_len])

In [28]:
data2.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200


In [29]:
# actual training set and also reshaping to (n X 1) arrays
x_train = np.array(train[:,:-1])
y_train = np.array(train[:,-1:])
x_test = np.array(test[:,:-1])
y_test = np.array(test[:,-1:])

In [30]:
X = x_train
Y = y_train
one = np.ones((len(X),1))
#...and add to feature matrix
X = np.concatenate((one, X), 1)
np.size(X,1)


9

In [31]:
# calculate coefficients using closed-form solution
coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(Y)
coeffs

array([[ -3.48346104e+06],
       [ -4.18941827e+04],
       [ -4.25982141e+04],
       [  1.17622072e+03],
       [ -7.55336093e+00],
       [  8.78825433e+01],
       [ -3.98205325e+01],
       [  7.69986597e+01],
       [  4.01331772e+04]])

In [32]:
# ridge regression
a = 10
I = np.size(X,1)
A = np.identity(I)
alpha = a * A
cost_regr = np.subtract((X.transpose().dot(X)), A)

coeffs_regr = inv(cost_regr).dot(X.transpose()).dot(Y)
coeffs_regr

array([[  1.06792045e+07],
       [  1.18050354e+05],
       [  9.15168882e+04],
       [  4.21573366e+03],
       [ -3.22949988e+01],
       [ -2.39753903e+01],
       [ -4.70816793e+01],
       [  3.76721553e+02],
       [  5.61388369e+04]])

In [33]:
# test set
X = x_test
Y = y_test
one = np.ones((len(X),1))
#...and add to feature matrix
X = np.concatenate((one, X),1)
np.shape(X)

(4128, 9)

In [34]:
y_pred = X.dot(coeffs)
y_pred_ridge = X.dot(coeffs_regr)

In [35]:
mse = mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)
print("Root mean square error : ", rmse)
mse2 = mean_squared_error(y_pred_ridge, y_test)
rmse_ridge = np.sqrt(mse2)
print("Root mean square error with ridge regression : ", rmse_ridge)

Root mean square error :  61597.6905631
Root mean square error with ridge regression :  121075.273105
