In [104]:
import graphlab

In [105]:
# Load the house-sales SFrame from the 'kc_house_data.gl/' path.
HOUSE_DATA_PATH = 'kc_house_data.gl/'
sales = graphlab.SFrame(HOUSE_DATA_PATH)

In [106]:
import numpy as np
import math

In [107]:
def get_numpy_data(data_sframe, features, output):
    """Pull selected columns out of a frame as numpy arrays.

    A column named 'constant', filled with 1, is written onto
    ``data_sframe`` itself (the caller's frame is modified) and is placed
    first among the selected columns so it can serve as the intercept term.

    Parameters
    ----------
    data_sframe : frame supporting column get/set and ``to_numpy()``
        Source table (an SFrame in this notebook).
    features : list of str
        Feature column names; the list passed in is not modified.
    output : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)`` -- the 'constant' and feature
        columns converted with ``to_numpy()``, and the target column
        converted the same way.
    """
    # Assigning a scalar fills the whole (new or overwritten) column with 1.
    data_sframe['constant'] = 1
    # Concatenation builds a new list with the intercept column first.
    columns = ['constant'] + features
    design = data_sframe[columns].to_numpy()
    target = data_sframe[output].to_numpy()
    return design, target

In [108]:
def predict_output(feature_matrix, weights):
    """Return the model predictions ``np.dot(feature_matrix, weights)``.

    Parameters
    ----------
    feature_matrix : array-like
        One row per observation, one column per feature.
    weights : array-like
        One coefficient per column of ``feature_matrix``.

    Returns
    -------
    The dot product of the two arguments (one prediction per row).
    """
    return np.dot(feature_matrix, weights)

In [109]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    With ``errors = predictions - output`` this is the partial derivative
    of the residual sum of squares with respect to the weight that
    multiplies ``feature``.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [110]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent on the RSS.

    Each pass computes the prediction errors once, then moves every weight
    against its partial derivative. The loop stops after the first pass
    whose gradient magnitude (measured before that pass's update) is below
    ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy array, one row per observation, one column per weight
    output : numpy array of observed targets, one per row of ``feature_matrix``
    initial_weights : array-like of numbers
        Starting weights. They are copied, so the caller's object is never
        modified, and converted to float so integer input cannot truncate
        the updates.
    step_size : float
        Multiplier applied to each partial derivative.
    tolerance : float
        Gradient-magnitude threshold below which the fit counts as converged.
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (the default) keeps
        the previous behaviour of looping until convergence; set it to
        guard against a diverging step size, which otherwise never ends.

    Returns
    -------
    numpy array of float
        The fitted weights.
    """
    # Float copy: with an integer array, ``weights[i] -= ...`` would silently
    # drop the fractional part of every update.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        predictions = predict_output(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0.0
        # Errors are fixed for the whole pass, so updating the weights one
        # at a time is the same as a simultaneous update.
        for i in range(len(weights)):
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative * derivative
            weights[i] -= step_size * derivative
        iterations += 1

        if math.sqrt(gradient_sum_squares) < tolerance:
            break
    return weights

In [111]:
# Split `sales` into two frames (fraction 0.8, fixed seed so the split is repeatable).
train_data, test_data = sales.random_split(0.8, seed=0)

In [112]:
# Model 1: predict 'price' from 'sqft_living' (plus the constant column).
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for model 1.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [113]:
# Fit model 1 on the training arrays and show the resulting weights.
my_weight = regression_gradient_descent(
    simple_feature_matrix, output, initial_weights, step_size, tolerance)
print(my_weight)

[-46999.88716555    281.91211912]


In [114]:
# Same feature/target extraction as for training, applied to the test split.
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data, simple_features, my_output)

In [115]:
# Model-1 predictions for every row of the test split.
preout = predict_output(test_simple_feature_matrix, my_weight)
print(preout)

[ 356134.44317093  784640.86422788  435069.83652353 ...,  663418.65300782
  604217.10799338  240550.4743332 ]


In [116]:
# Residual sum of squares of model 1 on the test split.
# `Rss1` holds the RSS itself (no square root), matching its name.
residuals1 = test_output - preout
Rss1 = (residuals1 * residuals1).sum()
# The figure displayed below is the square root of the RSS, not the RSS.
print(math.sqrt(Rss1))

16595181.4571


In [117]:
# Model 2: predict 'price' from 'sqft_living' and 'sqft_living15' (plus the constant column).
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output2 = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for model 2.
initial_weights2 = np.array([-100000, 1.0, 1.0])
step_size2 = 4e-12
tolerance2 = 1e9

In [118]:
# Fit model 2 against `output2`, the target extracted together with
# `feature_matrix`, instead of reusing `output` from the model-1 cell.
my_weight2 = regression_gradient_descent(
    feature_matrix, output2, initial_weights2, step_size2, tolerance2)

In [119]:
# Show the fitted model-2 weights (constant, sqft_living, sqft_living15).
print(my_weight2)

[ -9.99999688e+04   2.45072603e+02   6.52795277e+01]


In [120]:
# Model-2 predictions on the test split.
# Uses `model_features` directly: reassigning `simple_features` to the
# two-feature list here would silently change the model-1 cells on re-run.
simple_feature_matrix1, output1 = get_numpy_data(test_data, model_features, my_output)
preout2 = predict_output(simple_feature_matrix1, my_weight2)
print(len(preout2))
print(preout2)

4229
[ 366651.41203656  762662.39786164  386312.09499712 ...,  682087.39928241
  585579.27865729  216559.20396617]


In [121]:
# Test-split arrays for model 2 and the corresponding predictions.
test_simple_feature_matrix2, test_output2 = get_numpy_data(
    test_data, model_features, my_output)
preout3 = predict_output(test_simple_feature_matrix2, my_weight2)
print(preout3)

[ 366651.41203656  762662.39786164  386312.09499712 ...,  682087.39928241
  585579.27865729  216559.20396617]


In [122]:
# Residual sum of squares of model 2 on the test split.
# `RSS2` holds the RSS itself (no square root), matching its name.
residuals2 = test_output2 - preout3
RSS2 = (residuals2 * residuals2).sum()
# The figure displayed below is the square root of the RSS, not the RSS.
print(math.sqrt(RSS2))

16439691.1913
