In [2]:
#importing the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import matplotlib.pyplot as plt
from numpy.linalg import inv

In [3]:
#univariate linear regression gradient descent
data = pd.read_csv('../datasets/housing/housing_univariate.csv')

In [4]:
# will divide in 8:2 ratio for train and test
#it is ok to call test for now-(consider same as validation)
size = len(data)
train_len = int(size * 0.8)
test_len = size - train_len

In [5]:
# Split the data into training/testing sets

#len(data[:train_len])
#len(data[:test_len])
train = np.array(data[:train_len])
test = np.array(data[:test_len])

# actual training set and also reshaping to (n X 1) arrays
x_train = np.array(train[:,0]).reshape(-1,1)
y_train = np.array(train[:,1]).reshape(-1,1)
x_test = np.array(test[:,0]).reshape(-1,1)
y_test = np.array(test[:,1]).reshape(-1,1)

In [6]:
#might seem redundant but for good readability
X = x_train
Y = y_train
one = np.ones((len(X),1))
#...and add to feature matrix
X = np.concatenate((one, X), 1)
#np.shape(X)

In [7]:
W = np.random.rand(np.size(X,1)).reshape(-1,1)
np.shape(W)

(2, 1)

In [8]:
#GRADIENT DESCENT

alpha = 0.01 #Step size
iterations = 1200 #No. of iterations
m = Y.size #No. of data points
np.random.seed(42) #Set the seed
W = np.random.rand(np.size(X,1)).reshape(-1,1) #Pick some random values to start with


#GRADIENT DESCENT
def gradient_descent(x, y, theta, iterations, alpha):
    past_costs = []
    past_thetas = [theta]
    for i in range(iterations):
        prediction = np.dot(x, theta)
        error = prediction - y
        cost = 1/(2*m) * np.dot(error.T, error)
        past_costs.append(cost)
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        past_thetas.append(theta)
        #print(past_thetas)
        
    return past_thetas, past_costs

#Pass the relevant variables to the function and get the new values back...
past_thetas, past_costs = gradient_descent(X, Y, W, iterations, alpha)
W = past_thetas[-1]

#Print the results...
print("Parameters after Gradient Descent: {:.2f}, {:.2f}".format(W[0][0], W[1][0]))

Parameters after Gradient Descent: 43139.60, 41627.03


In [9]:
# for testing
Xt = x_test
Yt = y_test
one = np.ones((len(Xt),1))
#...and add to feature matrix
Xt = np.concatenate((one, Xt), 1)

In [10]:
y_pred = np.dot(Xt, W)

In [11]:
mse = mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)
print("Root mean squared error : {:.5f}".format(rmse))

Root mean squared error : 71635.49189


In [12]:
#multivariate gradient descent
data3 = pd.read_csv('../datasets/housing/housing_multivariate.csv')

In [13]:
# doing following since data contains null/NAN value
data3 = data3.fillna(method='ffill')

#can also use x = x[~numpy.isnan(x)] for a feature

data3.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
dtype: bool

In [14]:
#normalizing the data
data2 = (data3 - data3.mean())/data3.std()
data2.head()
mean  = data3.mean()
deviation = data3.std()

In [15]:
# will divide in 8:2 ratio for train and test
#it is ok to call test for now-(consider same as validation)
size = len(data2)
train_len = int(size * 0.8)
test_len = size - train_len

In [16]:
# Split the data into training/testing sets

#len(data[:train_len])
#len(data[:test_len])
train = np.array(data2[:train_len])
test = np.array(data2[:test_len])

In [17]:
# actual training set and also reshaping to (n X 1) arrays
x_train = np.array(train[:,:-1])
y_train = np.array(train[:,-1:])
x_test = np.array(test[:,:-1])
y_test = np.array(test[:,-1:])

In [18]:
X = x_train
Y = y_train
one = np.ones((len(X),1))
#...and add to feature matrix
X = np.concatenate((one, X), 1)
Y.size

16512

In [31]:
#GRADIENT DESCENT

alpha = 0.01 #Step size
iterations = 2000 #No. of iterations
m = Y.size #No. of data points
#np.random.seed(42) #Set the seed
W = np.random.rand(np.size(X,1)).reshape(-1,1) #Pick some random values to start with


#GRADIENT DESCENT
def gradient_descent(x, y, theta, iterations, alpha):
    past_costs = []
    past_thetas = [theta]
    for i in range(iterations):
        prediction = np.dot(x, theta)
        error = prediction - y
        cost = 1/(2*m) * np.dot(error.T, error)
        past_costs.append(cost)
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        past_thetas.append(theta)
        #print(theta)
        #print(past_thetas[0])
        
    return past_thetas, past_costs

#Pass the relevant variables to the function and get the new values back...
past_thetas, past_costs = gradient_descent(X, Y, W, iterations, alpha)
W = past_thetas[-1]
np.shape(Y)
#Print the results...
#print("Parameters after Gradient Descent: {:.2f}, {:.2f}".format(W[0][0], W[1][0]))

(4128, 1)

In [32]:
# test set
X = x_test
Y = y_test
one = np.ones((len(X),1))
#...and add to feature matrix
X = np.concatenate((one, X),1)
np.shape(X)

(4128, 9)

In [33]:
y_pred = X.dot(W)

In [34]:
mse = mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)
print("Root mean square error : ", rmse)

Root mean square error :  0.499184475973


In [35]:
y_test2 = y_test*deviation['median_house_value'] + mean['median_house_value']
y_pred2 = y_pred*deviation['median_house_value'] + mean['median_house_value']

In [36]:
mse = mean_squared_error(y_pred2, y_test2)
rmse = np.sqrt(mse)
print("Root mean square error : ", rmse)

Root mean square error :  57603.7000398
