In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [37]:
# Explicit dtype for every CSV column, passed to pd.read_csv so that all
# three data files are parsed identically.
dtype_dict = {
    # columns read as strings
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # columns read as floats
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # columns read as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [38]:
# Load the complete house-sales CSV using the explicit column dtypes in dtype_dict.
# NOTE(review): df_data is not referenced by any later cell visible here — confirm it is still needed.
df_data = pd.read_csv("./data/kc_house_data.csv", dtype=dtype_dict)

In [39]:
# Load the training CSV with the same dtypes and preview its first rows.
df_train = pd.read_csv("./data/kc_house_train_data.csv", dtype=dtype_dict)
df_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [40]:
# Load the test CSV with the same dtypes and preview its first rows.
df_test = pd.read_csv("./data/kc_house_test_data.csv", dtype=dtype_dict)
df_test.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780.0,12697.0
1,9297300055,20150124T000000,650000.0,4.0,3.0,2950.0,5000,2.0,0,3,...,9,1980,970,1979,0,98126,47.5714,-122.375,2140.0,4000.0
2,1202000200,20141103T000000,233000.0,3.0,2.0,1710.0,4697,1.5,0,0,...,6,1710,0,1941,0,98002,47.3048,-122.218,1030.0,4705.0
3,8562750320,20141110T000000,580500.0,3.0,2.5,2320.0,3980,2.0,0,0,...,8,2320,0,2003,0,98027,47.5391,-122.07,2580.0,3980.0
4,7589200193,20141110T000000,535000.0,3.0,1.0,1090.0,3000,1.5,0,0,...,8,1090,0,1929,0,98117,47.6889,-122.375,1570.0,5080.0


In [41]:
def get_numpy_data(df, features, output):
    """Build the feature matrix and target vector for a linear regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified by this function.
    features : list of str
        Names of the columns of ``df`` to use as features, in the desired order.
    output : str
        Name of the target column of ``df``.

    Returns
    -------
    (feature_matrix, output_array) : tuple of numpy.ndarray
        ``feature_matrix`` has one row per row of ``df``; its first column is
        the constant 1 (the intercept term), followed by the requested feature
        columns. ``output_array`` holds the values of ``df[output]``.
    """
    # Work on a copy of the selected columns so the caller's DataFrame does not
    # silently gain a 'constant' column as a side effect of this call.
    df_features = df[list(features)].copy()
    # Put the constant column first so that weights[0] is the intercept.
    df_features.insert(0, 'constant', 1)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
    # numpy representation and is available in old and new pandas alike.
    feature_matrix = df_features.values
    output_array = df[output].values
    return (feature_matrix, output_array)

In [42]:
# Sanity check: build the matrix for a single feature and inspect the first data point.
(example_features, example_output) = get_numpy_data(df_train, ['sqft_living'], 'price') # the [] around 'sqft_living' makes it a list
print(example_features[0,:]) # first row of the matrix; ':' selects all columns (constant, sqft_living)
print(example_output[0]) # and the corresponding output

[  1.00000000e+00   1.18000000e+03]
221900.0


## Predicting output given the regression weights

In [43]:
# With weights [1, 1] the prediction for one row is 1*constant + 1*sqft_living.
my_weights = np.array([1., 1.]) # the example weights
my_features = example_features[0, :] # we'll use the first data point
predicted_value = np.dot(my_features, my_weights)
print(predicted_value)

1181.0


In [44]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions for every row of feature_matrix.

    feature_matrix is a numpy array with one column per feature and weights is
    a numpy array with one entry per column; the result is their dot product.
    """
    return np.dot(feature_matrix, weights)

In [45]:
# Predict for all rows of example_features at once and spot-check the first two values.
test_predictions = predict_output(example_features, my_weights)
print(test_predictions[0]) # should be 1181.0
print(test_predictions[1]) # should be 2571.0

1181.0
2571.0


## Computing the Derivative

In [46]:
def feature_derivative(errors, feature):
    """Return twice the dot product of errors and feature.

    Both arguments are numpy arrays with one entry per data point; the result
    is the derivative with respect to the weight of the given feature column.
    """
    dot_product = np.dot(errors, feature)
    return 2 * dot_product

In [47]:
# Check feature_derivative against a case with a known answer: with all-zero
# weights every prediction is 0, so errors == -output, and the derivative with
# respect to the constant column is -2 * sum(output).
(example_features, example_output) = get_numpy_data(df_train, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = predict_output(example_features, my_weights) 
# two numpy arrays of the same length are subtracted element-wise with '-':
errors = test_predictions - example_output # prediction errors in this case is just the -example_output
feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the ":" indicates "all rows"
derivative = feature_derivative(errors, feature)
print(derivative)
print(-np.sum(example_output)*2) # should be the same as derivative

-18752698920.0
-18752698920.0


## Gradient Descent

In [48]:
from math import sqrt

In [49]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array with one row per data point and one column per weight.
    output : numpy.ndarray
        Observed target value for every data point.
    initial_weights : array-like
        Starting weights, one per column of ``feature_matrix``. The input is
        copied, so the caller's object is never modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Iteration stops once the gradient magnitude (square root of the sum of
        squared derivatives) is below this value.
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (the default) keeps the
        original behaviour of iterating until the tolerance is reached.

    Returns
    -------
    numpy.ndarray
        The weights after the last update. The update of a pass is applied
        before its gradient magnitude is compared with ``tolerance``.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite, i.e. the iteration
        has diverged and could never satisfy the tolerance.
    """
    # Force a float copy: an integer initial_weights (e.g. [0, 0]) would
    # otherwise give an integer array and every update written back into it
    # would be truncated to a whole number.
    weights = np.array(initial_weights, dtype=float)
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        iteration += 1
        # Errors are computed once per pass, so all weights in the loop below
        # are updated from the same predictions.
        predictions = predict_output(feature_matrix, weights)
        errors = predictions - output
        gradient_sum_squares = 0  # accumulates the squared derivatives
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i]
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative * derivative
            weights[i] -= step_size * derivative
        gradient_magnitude = sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            # NaN/inf never compares below the tolerance; without this guard
            # the loop would spin forever once the weights blow up.
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size")
        if gradient_magnitude < tolerance:
            break
    return weights

### Running the gradient descent

In [50]:
# let's test out the gradient descent
# Set up the single-feature model: price as a function of sqft_living (plus intercept).
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(df_train, simple_features, my_output)
# One starting weight per matrix column: [intercept, sqft_living].
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [51]:
# Fit the single-feature model on the training data.
my_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [52]:
# Build the same feature matrix and output vector from the test data.
(test_simple_feature_matrix, test_output) = get_numpy_data(df_test, simple_features, my_output)

In [53]:
# Test-set predictions from the single-feature weights.
predictions_test = predict_output(test_simple_feature_matrix, my_weights)

In [54]:
# Predicted price for the first test row (single-feature model).
predictions_test[0]

356134.44325500238

In [55]:
def get_residual_sum_of_squares(predictions, output):
    """Return the residual sum of squares between predictions and output.

    Both arguments are arrays of equal length; the result is the sum over all
    data points of (prediction - output) squared.
    """
    errors = predictions - output
    squared_errors = errors * errors
    return squared_errors.sum()

In [56]:
# RSS of the single-feature model on the test data.
get_residual_sum_of_squares(predictions_test, test_output)

275400044902128.31

In [57]:
# Set up the two-feature model: price as a function of sqft_living and sqft_living15.
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average square footage of the nearest 15 neighbors.
my_output = 'price'
(feature_matrix, output) = get_numpy_data(df_train, model_features, my_output)
# One starting weight per matrix column: [intercept, sqft_living, sqft_living15].
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [60]:
# Fit the two-feature model on the training data.
multi_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [61]:
# Learned weights, in the order [intercept, sqft_living, sqft_living15].
multi_weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

In [62]:
# Build the two-feature matrix from the test data (test_output is re-extracted from the same 'price' column).
(test_multi_feature_matrix, test_output) = get_numpy_data(df_test, model_features, my_output)

In [63]:
# Test-set predictions from the two-feature weights.
predictions_multi = predict_output(test_multi_feature_matrix, multi_weights)

In [64]:
# Predicted price for the first test row (two-feature model).
predictions_multi[0]

366651.41162949387

In [65]:
# Actual price of the first test row, for comparison with the predictions.
test_output[0]

310000.0

In [66]:
# RSS of the two-feature model on the test data.
get_residual_sum_of_squares(predictions_multi, test_output)

270263443629803.56

In [67]:
# Weights of the single-feature model fitted earlier, in the order [intercept, sqft_living].
my_weights

array([-46999.88716555,    281.91211918])