In [1]:
import graphlab
sales = graphlab.SFrame('kc_house_data.gl/')
import numpy as np
import math

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1483882787.log


This non-commercial license of GraphLab Create for academic use is assigned to workingjhy@gmail.com and will expire on July 31, 2017.


In [2]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # this is how you add a constant column to an SFrame
    # add the column 'constant' to the front of the features list so that we can extract it along with the others:
    features = ['constant'] + features # this is how you combine two lists
    # select the columns of data_SFrame given by the features list into the SFrame features_sframe (now including constant):
    features_sframe = data_sframe[features]
    # the following line will convert the features_SFrame into a numpy matrix:
    feature_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the output to the SArray output_sarray
    output_sarray = data_sframe[output]
    # the following will convert the SArray into a numpy array by first converting it to a list
    output_array = output_sarray.to_numpy()
    return(feature_matrix, output_array)

In [3]:
def predict_output(feature_matrix, weights):
    # assume feature_matrix is a numpy matrix containing the features as columns and weights is a corresponding numpy array
    # create the predictions vector by using np.dot()
    predictions = np.dot(feature_matrix,weights)
    return(predictions)

In [4]:
def feature_derivative(errors, feature):
    # Assume that errors and feature are both numpy arrays of the same length (number of data points)
    # compute twice the dot product of these vectors as 'derivative' and return the value
    derivative = 2*np.dot(errors,feature)
    return(derivative)

In [8]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False 
    weights = np.array(initial_weights)
    while not converged:
        predictions = predict_output(feature_matrix,weights)

        errs = predictions - output
        gradient_sum_squares = 0 

        for i in range(len(weights)): # loop over each weight
            wi_dev = feature_derivative(errs,feature_matrix[:,i])
            gradient_sum_squares +=np.dot(wi_dev,wi_dev)
            weights[i]-= step_size*wi_dev

        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [9]:
train_data,test_data = sales.random_split(.8,seed=0)
# let's test out the gradient descent
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
my_weight = regression_gradient_descent(simple_feature_matrix,output,initial_weights,step_size,tolerance)
print my_weight
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
preout = predict_output(test_simple_feature_matrix,my_weight)
print preout
RSS1= math.sqrt(((test_output-preout)*(test_output-preout)).sum())
print RSS1

[-46999.88716555    281.91211912]
[ 356134.44317093  784640.86422788  435069.83652353 ...,  663418.65300782
  604217.10799338  240550.4743332 ]
16595181.4571


In [10]:
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9
my_weight2 = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)
print my_weight2


simple_features = ['sqft_living','sqft_living15']
my_output = 'price'
(simple_feature_matrix1, output1) = get_numpy_data(test_data, simple_features, my_output)
preout2 = predict_output(simple_feature_matrix1,my_weight2)
print len(preout2)
print preout2

[ -9.99999688e+04   2.45072603e+02   6.52795277e+01]
4229
[ 366651.41203656  762662.39786164  386312.09499712 ...,  682087.39928241
  585579.27865729  216559.20396617]
