In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats

In [None]:
# Load the house-sales CSV into a DataFrame; the path is relative to the notebook.
sales_csv_path = "../../ML Data & Script/kc_house_data.csv"
sales = pd.read_csv(sales_csv_path)
# Show the first rows as a quick sanity check of the load.
sales.head()

In [None]:
def get_numpy_data(data, features, output):
    """Extract a design matrix and target vector from a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature columns and the output column.
    features : list of str
        Names of the feature columns to extract, in the order they should
        appear in the matrix.
    output : str
        Name of the target column.

    Returns
    -------
    tuple (X, y)
        X is a 2-D NumPy array whose first column is the constant 1
        (the intercept term) followed by the requested features; y is a
        NumPy array with the values of the output column.

    Notes
    -----
    The input DataFrame is left untouched: the 'constant' column is added
    to a copy, so slices produced by e.g. train_test_split do not raise
    SettingWithCopyWarning and the caller's columns do not change.
    """
    # `assign` returns a new DataFrame with the all-ones column added
    # (or overwritten, if a 'constant' column already exists).
    with_constant = data.assign(constant=1)
    # List concatenation builds a new list, so the caller's `features`
    # list is not modified; 'constant' goes first so it lines up with
    # the intercept weight.
    columns = ['constant'] + features
    # Selected columns as a NumPy matrix.
    X = with_constant[columns].values
    # Target column as a NumPy array.
    y = with_constant[output].values
    return (X, y)

In [None]:
# Print the first row's living-area square footage, then its price,
# so they can be compared with the matrix/vector built below.
for column in ('sqft_living', 'price'):
    print(sales[column][0])

In [None]:
# get_numpy_data expects a *list* of feature names, hence ['sqft_living'].
(X, y) = get_numpy_data(sales, ['sqft_living'], 'price')
# Row 0, all columns (constant + sqft_living), shown as a 1x2 matrix.
first_features = X[0, :].reshape(1, 2)
# The matching output value, shown as a 1x1 matrix.
first_output = y[0].reshape(1, 1)
print("first row, features", first_features)
print("first row, output", first_output)
print(first_features)

In [None]:
# shape of X and y
# X is the matrix returned by get_numpy_data: the constant column plus the features
print(X.shape)
# reshape(-1, 1) turns y into a single-column 2-D array (one row per observation);
# the later get_numpy_data calls rebuild y as a 1-D array again
y = y.reshape(-1,1)
print(y.shape)

In [None]:
# Predict the output for one observation with hand-picked example weights.
# Both weights are 1: one for the constant column, one for sqft_living.
my_weights = np.ones(2)
# First row of the design matrix: [constant, sqft_living].
first_data_point = X[0]
# Dot product = 1 * constant + 1 * sqft_living (1 * 1 + 1 * 1180 for this row).
y_pred = first_data_point.dot(my_weights)
print(y_pred)

In [None]:
# calculate prediction for the whole data
def predict_output(data_matrix, weights):
    """Return the linear-model predictions for every row of data_matrix.

    data_matrix holds the features as columns and weights is the matching
    NumPy array of coefficients; the result is np.dot(data_matrix, weights).
    """
    return np.dot(data_matrix, weights)

In [None]:
# Sanity-check predict_output on the whole matrix:
# an (n_rows, 2) matrix dotted with a length-2 weight vector gives n_rows predictions.
test_pred = predict_output(X, my_weights)
# With both weights equal to 1 the first two predictions should be 1181.0 and 2571.0.
for row in (0, 1):
    print(test_pred[row])

In [None]:
# compute derivative
def feature_derivative(errors, feature):
    """Return twice the dot product of the errors and one feature column.

    errors and feature are NumPy arrays of the same length (one entry per
    data point).
    """
    return 2 * np.dot(errors, feature)

In [None]:
# Sanity-check feature_derivative against a value that can be computed by hand.
(X, y) = get_numpy_data(sales, ['sqft_living'], 'price')
# All-zero weights make every prediction 0 ...
my_weights = np.zeros(2)
yhat = predict_output(X, my_weights)
# ... so the prediction errors (element-wise difference) are simply -y.
errors = yhat - y
# Column 0 of every row is the 'constant' feature.
feature = X[:, 0]
derivative = feature_derivative(errors, feature)
print(derivative)
# Because the constant column is all ones, the derivative should equal -2 * sum(y).
print(-2 * y.sum())

In [None]:
from math import sqrt
def regression_gradient_descent(X, y, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one column per weight.
    y : array-like
        Target values, one per row of X. It is flattened to 1-D so that a
        single-column (n, 1) array does not broadcast against the 1-D
        predictions.
    initial_weights : array-like
        Starting weights, one per column of X. They are copied, so the
        caller's array is not modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Iteration stops once the gradient magnitude drops below this value.
    max_iterations : int or None, optional
        Upper bound on the number of full passes. None (the default) keeps
        iterating until the tolerance is reached.

    Returns
    -------
    numpy.ndarray
        The final weights as a float array.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite (typically a
        step_size that is too large); without this check the loop would
        never terminate because the convergence test stays False.
    """
    # Force a float copy: with an integer array every update would be
    # truncated back to an integer when written into weights[i].
    weights = np.array(initial_weights, dtype=float)
    # Flatten the target so `yhat - y` is an element-wise difference.
    y = np.ravel(y)
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        # Predictions and errors are computed once per pass, so every
        # derivative below is evaluated at the same (pre-update) weights.
        yhat = predict_output(X, weights)
        errors = yhat - y
        # Running sum of squared derivatives, used for the convergence test.
        gradient_sum_squares = 0
        for i in range(len(weights)):
            # X[:, i] is the feature column associated with weights[i].
            derivative = feature_derivative(errors, X[:, i])
            gradient_sum_squares += derivative * derivative
            # Move the weight against its derivative.
            weights[i] -= step_size * derivative
        # Square root of the sum of squares = gradient magnitude.
        gradient_magnitude = sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient magnitude is not finite; gradient descent diverged "
                "(try a smaller step_size)"
            )
        if gradient_magnitude < tolerance:
            break
        iteration += 1
    return(weights)

In [None]:
# run gradient descent, simple linear regression
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    sales, sales['price'], test_size=0.2, random_state=0
)
# Design matrix / target for the one-feature model, built from the training rows.
(X, y) = get_numpy_data(X_train, ['sqft_living'], 'price')
# Gradient-descent settings: starting weights [intercept, slope], step size, stopping threshold.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
# Run gradient descent and show the fitted [intercept, slope].
simple_weights = regression_gradient_descent(
    X, y, initial_weights=initial_weights, step_size=step_size, tolerance=tolerance
)
print(simple_weights)

The weights found by gradient descent are very close to those obtained using sklearn:
* Intercept: -48257.063591028564
* Slope: 283.96855715512993

In [None]:
# test data, simple features
# Store the converted test split under new names instead of overwriting
# X_test / y_test: X_test stays a DataFrame, so this cell can be re-run
# without re-running the train/test split first.
(X_test_simple, y_test_simple) = get_numpy_data(X_test, ['sqft_living'], 'price')
# Predictions of the one-feature model on the held-out rows.
y_hat_model_one = predict_output(X_test_simple, simple_weights)

In [None]:
# predicted house price for the first house in the test split (model one)
y_hat_model_one[0]

In [None]:
# Multiple regression with two features.
# Same split as before (identical random_state), so both models share train/test rows.
X_train, X_test, y_train, y_test = train_test_split(
    sales, sales['price'], test_size=0.2, random_state=0
)
# sqft_living15 is the average square footage for the nearest 15 neighbors.
adv_features = ['sqft_living', 'sqft_living15']
# Replace the training DataFrame/Series with the NumPy design matrix and target.
(X_train, y_train) = get_numpy_data(X_train, adv_features, 'price')
# Gradient-descent settings: one starting weight per column (constant + two features).
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9
adv_weights = regression_gradient_descent(
    X_train, y_train, initial_weights=initial_weights, step_size=step_size, tolerance=tolerance
)
print(adv_weights)


In [None]:
# test on a test set
# NOTE: this rebinds X_test / y_test from the pandas objects returned by
# train_test_split to NumPy arrays, so re-running this cell on its own
# requires re-running the split cell first.
(X_test, y_test) = get_numpy_data(X_test, adv_features, 'price')
# predictions of the two-feature model on the held-out rows
y_hat_model_two = predict_output(X_test, adv_weights)

In [None]:
# model two's predicted price for the first house in the test split
y_hat_model_two[0]

In [None]:
# actual 'price' value of the first row in the test split, for comparison with the predictions
y_test[0]

In [None]:
# Residual (actual minus predicted) for the first test house under each model.
actual_first = y_test[0]
# model one
print(actual_first - y_hat_model_one[0])
# model two
print(actual_first - y_hat_model_two[0])

In [None]:
# Residual sum of squares (RSS) on the test split:
# printed first for model one, then for model two.
for predictions in (y_hat_model_one, y_hat_model_two):
    residuals = predictions - y_test
    print((residuals ** 2).sum())

model_one performed better for the first house in the test set, but overall model_two has the lower RSS.