In [1]:
import graphlab
import numpy as np
import pandas as pd

In [10]:
def get_numpy_data(data_sframe, features, output):
    """Build the design matrix and target vector for a linear regression.

    Parameters
    ----------
    data_sframe : DataFrame-like (this notebook passes pandas DataFrames)
        Table holding the feature columns and the target column.  A column
        named 'constant' filled with 1 is written into it in place, so the
        caller's frame gains (or has overwritten) that column.
    features : sequence of str
        Names of the feature columns, in the order they should appear in the
        matrix.  The sequence itself is not modified.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.ndarray
        ``features_matrix`` has one row per record; column 0 is the constant 1
        (the intercept term) and the remaining columns follow ``features``.
        ``output_array`` holds the values of the ``output`` column.
    """
    # Intercept term: a column of ones, stored on the caller's frame.
    data_sframe['constant'] = 1
    # Build a new list so the caller's `features` is left untouched;
    # list() also lets a tuple of names be passed.
    features = ['constant'] + list(features)
    # Select the requested columns, 'constant' first.
    features_df = pd.DataFrame(data_sframe, columns=features)
    # `.values` works on every pandas release; `.as_matrix()` was deprecated
    # in pandas 0.23 and removed in pandas 1.0.
    features_matrix = features_df.values
    # Honour the caller-selected target column instead of a hard-coded 'price'.
    output_array = pd.Series(data_sframe[output]).values
    return (features_matrix, output_array)

In [11]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions ``np.dot(feature_matrix, weights)``.

    For a 2-D ``feature_matrix`` and a 1-D ``weights`` this yields one value
    per row: the dot product of that row with ``weights``.
    """
    return np.dot(feature_matrix, weights)

In [12]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    When ``errors`` is ``predictions - output`` this is the partial derivative
    of the residual sum of squares with respect to the weight of ``feature``.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [71]:
import time
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Each pass computes the predictions and errors once from the current
    weights, then moves every weight against its partial derivative.  The
    loop stops after the first pass whose gradient magnitude (measured before
    that pass's update) is below ``tolerance``.

    Relies on the notebook helpers ``predict_outcome`` and
    ``feature_derivative``.

    Parameters
    ----------
    feature_matrix : 2-D numpy.ndarray
        One row per observation, one column per weight.
    output : 1-D array
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : sequence of numbers
        Starting weights.  They are copied, so the caller's object is never
        modified.
    step_size : float
        Multiplier applied to each derivative when updating its weight.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of passes.  ``None`` (the default) keeps
        the original behaviour of looping until convergence.

    Returns
    -------
    numpy.ndarray
        The float weights after the last pass.  If ``max_iterations`` was
        reached first these are the best weights so far, not converged ones.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN.  From that point every later
        prediction would be NaN as well, so the loop could never terminate.
    """
    # Force a float copy: with an integer array the in-place updates below
    # would be truncated to integers and the descent could stall forever.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    converged = False
    while not converged and (max_iterations is None or iterations < max_iterations):
        predictions = predict_outcome(feature_matrix, weights)
        # Conventional sign (predictions - output) so that feature_derivative
        # returns the true RSS derivative and the update subtracts it.  This
        # is numerically identical to adding the derivative of
        # (output - predictions).
        errors = predictions - output
        gradient_sum_squares = 0.0
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column paired with weights[i].
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative ** 2
            weights[i] -= step_size * derivative
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if np.isnan(gradient_magnitude):
            raise FloatingPointError(
                "gradient became NaN: gradient descent diverged or the inputs "
                "contain NaN (try a smaller step_size)"
            )
        converged = gradient_magnitude < tolerance
        iterations += 1
    return weights

In [14]:
# Column dtypes applied while parsing both CSV files.
dtype_dict = {
    'id': str,
    'date': str,
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_lot': int,
    'floors': str,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'sqft_above': int,
    'sqft_basement': int,
    'yr_built': int,
    'yr_renovated': int,
    'zipcode': str,
    'lat': float,
    'long': float,
    'sqft_living15': float,
    'sqft_lot15': float,
}
# Training and test splits are read with the same dtype mapping.
house_data_train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
house_data_test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [15]:
# Model 1: a single feature, sqft_living, with 'price' as the target name.
my_output = 'price'
simple_features = ['sqft_living']
simple_feature_matrix, output = get_numpy_data(house_data_train, simple_features, my_output)

# Gradient-descent settings; the weights line up with the matrix columns
# (constant first, then sqft_living).
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [41]:
# Fit model 1 on the training matrix with the settings defined for it.
simple_weights = regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [42]:
# Show the fitted model 1 weights: the constant column's weight first, then sqft_living's.
simple_weights

array([-46999.88716555,    281.91211918])

In [43]:
# Same columns as the model 1 training matrix, built from the test split.
test_simple_feature_matrix, test_output = get_numpy_data(
    house_data_test, simple_features, my_output
)

In [44]:
# Model 1 predictions for every row of the test matrix.
predictions_test = predict_outcome(
    test_simple_feature_matrix,
    simple_weights,
)

In [46]:
# Show model 1's predictions on the test split.
predictions_test

array([ 356134.443255  ,  784640.86440132,  435069.83662406, ...,
        663418.65315598,  604217.10812919,  240550.47439317])

In [50]:
# Residual sum of squares of model 1 on the test split.
RSS_model1 = np.sum(np.square(test_output - predictions_test))

In [51]:
# Show model 1's test-set residual sum of squares.
RSS_model1

275400044902128.31

In [53]:
# Model 2: two features, sqft_living and sqft_living15, with 'price' as the target name.
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']
feature_matrix, output = get_numpy_data(house_data_train, model_features, my_output)

# One starting weight per matrix column: constant, sqft_living, sqft_living15.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [72]:
# Fit model 2 on the training matrix with the settings defined for it.
model2_weights = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [73]:
# Show the fitted model 2 weights, ordered as constant, sqft_living, sqft_living15.
model2_weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

In [74]:
# Same columns as the model 2 training matrix, built from the test split.
test_feature_matrix, test_output2 = get_numpy_data(
    house_data_test, model_features, my_output
)

In [75]:
# Model 2 predictions for every row of the test matrix.
predictions_test2 = predict_outcome(
    test_feature_matrix,
    model2_weights,
)

In [76]:
# Show model 2's predictions on the test split.
predictions_test2

array([ 366651.41162949,  762662.39850726,  386312.09557541, ...,
        682087.39916306,  585579.27901327,  216559.20391786])

In [77]:
# Peek at the first test rows; the frame now carries the 'constant' column
# that get_numpy_data writes into it.
house_data_test.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,constant
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,1430,0,1927,0,98028,47.7558,-122.229,1780.0,12697.0,1
1,9297300055,20150124T000000,650000.0,4.0,3.0,2950.0,5000,2.0,0,3,...,1980,970,1979,0,98126,47.5714,-122.375,2140.0,4000.0,1
2,1202000200,20141103T000000,233000.0,3.0,2.0,1710.0,4697,1.5,0,0,...,1710,0,1941,0,98002,47.3048,-122.218,1030.0,4705.0,1
3,8562750320,20141110T000000,580500.0,3.0,2.5,2320.0,3980,2.0,0,0,...,2320,0,2003,0,98027,47.5391,-122.07,2580.0,3980.0,1
4,7589200193,20141110T000000,535000.0,3.0,1.0,1090.0,3000,1.5,0,0,...,1090,0,1929,0,98117,47.6889,-122.375,1570.0,5080.0,1


In [78]:
# Residual sum of squares of model 2 on the test split.
RSS_model2 = np.sum(np.square(test_output2 - predictions_test2))

In [79]:
# Show model 2's test-set residual sum of squares.
RSS_model2

270263443629803.56

In [80]:
# Show model 1's test-set RSS again (presumably for comparison with model 2's value).
RSS_model1

275400044902128.31