In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model

## Steps 1 & 2
1. Read data
2. Split it into training and testing data (loaded here from pre-split CSV files)

In [13]:
# Read the three CSV files in one pass: the full table, then the training
# subset, then the testing subset (same order as the names on the left).
data, train_data, test_data = map(
    pd.read_csv,
    (
        'data/kc_house_data.csv',
        'data/kc_house_train_data.csv',
        'data/kc_house_test_data.csv',
    ),
)

## Step 3
- Convert the data into a NumPy feature matrix (with a leading constant column) and an output array

In [14]:
def get_numpy_data(df, features, output):
    """Build a feature matrix (with intercept column) and an output array.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is left unmodified.
    features : list of str
        Names of the columns to use as features.
    output : str
        Name of the column to use as the regression target.

    Returns
    -------
    features_matrix : numpy.ndarray
        One row per row of ``df``. Column 0 is the constant 1, followed by
        the requested feature columns in the order given.
    output_array : numpy.ndarray
        Values of the ``output`` column.
    """
    # assign() returns a new frame, so the helper 'constant' column is not
    # written into the caller's DataFrame as a side effect.
    with_constant = df.assign(constant=1)
    columns = ['constant'] + list(features)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
    # ndarray and is available on both old and current pandas versions.
    features_matrix = with_constant.loc[:, columns].values
    output_array = with_constant.loc[:, output].values
    return features_matrix, output_array

## Step 4
- Complete prediction function.

In [139]:
def predict_outcome(feature_matrix, weights):
    """Return the dot product of ``feature_matrix`` and ``weights``.

    With a 2-D feature matrix and a 1-D weight vector this is one linear
    prediction per row of the matrix.
    """
    return np.dot(feature_matrix, weights)

## Step 5
- Implementing function feature_derivative()

In [140]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    This is the partial derivative of the residual sum of squares with
    respect to the weight belonging to ``feature``.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

## Step 6
- Implementing function regression_gradient_descent()

In [141]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=10000, verbose=True):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array; column ``i`` is the feature associated with ``weights[i]``.
    output : array-like
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights. They are copied, so the caller's object is not
        modified.
    step_size : float
        Multiplier applied to the gradient on every update.
    tolerance : float
        Iteration stops once the gradient magnitude falls below this value.
    max_iterations : int, optional
        Upper bound on the number of updates (default 10000).
    verbose : bool, optional
        When true (default), print the weights and gradient magnitude after
        every update.

    Returns
    -------
    numpy.ndarray
        The weights after the final update, as a float array.
    """
    # Force a float copy: with integer initial weights an integer array would
    # silently truncate every update when it is stored back.
    weights = np.array(initial_weights, dtype=float)

    for iteration in range(1, max_iterations + 1):
        # Predictions and errors are computed once per iteration, so every
        # weight is updated from the same (pre-update) residuals.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        # One partial derivative per weight, using the matching feature column.
        gradient = np.array(
            [feature_derivative(errors, feature_matrix[:, i]) for i in range(len(weights))],
            dtype=float,
        )
        weights -= step_size * gradient

        gradient_magnitude = np.sqrt(np.sum(gradient ** 2))
        if verbose:
            # The label is the 1-based number of the update just performed.
            print('Round {0}: weights={1}, gradient_magnitude={2}'.format(iteration, weights, gradient_magnitude))
        if gradient_magnitude < tolerance:
            break
    return weights

## Step 7 ~ 9
- features: `sqft_living`
- output: `price`
- initial weights: -47000, 1 (intercept and `sqft_living`, respectively)
- step_size = 7e-12
- tolerance = 2.5e7

In [108]:
# Model 1: a single feature (plus the constant column added by get_numpy_data).
simple_features = ['sqft_living']
my_output= 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)
# One starting weight per matrix column: constant column first, then sqft_living.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

# Fit on the training data; the bare name on the last line displays the result.
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
simple_weights

Round 2: weights=[-46999.85779866    354.86068692], gradient_magnitude=50551530784973.43
Round 3: weights=[-46999.894732      262.96853716], gradient_magnitude=13127451026296.436
Round 4: weights=[-46999.88514683    286.83150782], gradient_magnitude=3408996083241.053
Round 5: weights=[-46999.88764179    280.63466326], gradient_magnitude=885263580285.0396
Round 6: weights=[-46999.88699974    282.24388799], gradient_magnitude=229889265767.897
Round 7: weights=[-46999.88717231    281.82599721], gradient_magnitude=59698688272.160286
Round 8: weights=[-46999.88713334    281.93451699], gradient_magnitude=15502826425.34162
Round 9: weights=[-46999.88714931    281.90633608], gradient_magnitude=4025844402.343505
Round 10: weights=[-46999.88715101    281.91365423], gradient_magnitude=1045449748.3835757
Round 11: weights=[-46999.88715641    281.91175382], gradient_magnitude=271487891.95332104
Round 12: weights=[-46999.88716085    281.91224733], gradient_magnitude=70504114.84341839
Round 13: weigh

array([-46999.88716555,    281.91211918])

## Step 10 & 11
Quiz Question: What is the predicted price for the 1st house in the Test data set for model 1 (round to nearest dollar)?

In [116]:
# Build the model-1 matrix from the test data, predict with the fitted
# weights, and display the prediction for the first test row.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)
simple_test_predictions = predict_outcome(test_simple_feature_matrix, simple_weights)
simple_test_predictions[0]

356134.44325500238

## Step 12
Compute the RSS (residual sum of squares) on the test data.

In [117]:
def get_residual_sum_of_squares(output, predictions):
    """Return the residual sum of squares between ``output`` and ``predictions``.

    Parameters
    ----------
    output : array-like
        Observed values.
    predictions : array-like
        Predicted values, same length as ``output``.

    Returns
    -------
    numpy.float64
        Sum of the squared differences ``output - predictions``.
    """
    # asarray lets plain Python lists work too; a bare `list - list` raises.
    residuals = np.asarray(output, dtype=float) - np.asarray(predictions, dtype=float)
    # np.sum reduces inside NumPy instead of iterating element by element
    # through the Python-level builtin sum().
    RSS = np.sum(residuals ** 2)
    return RSS

In [119]:
# RSS of model 1's predictions against the test outputs; displayed below.
simple_rss = get_residual_sum_of_squares(test_output, simple_test_predictions)
simple_rss

275400044902128.78

## Step 13 ~ 15
Quiz Question: What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?

In [142]:
# Model 2: two features (plus the constant column added by get_numpy_data).
# Note that `output`, `initial_weights`, `step_size` and `tolerance` are
# rebound here, replacing the values used for model 1.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features,my_output)
# One starting weight per matrix column: constant, sqft_living, sqft_living15.
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [143]:
# Fit model 2 on the training data and display the resulting weights.
model_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)
model_weights

Round 2: weights=[-99999.91164747    217.89658257    196.92903734], gradient_magnitude=73072020556001.0
Round 3: weights=[-99999.94015235    153.06856223    133.5056493 ], gradient_magnitude=22673220967534.742
Round 4: weights=[-99999.93238686    175.56189878    150.58566314], gradient_magnitude=7060794583490.458
Round 5: weights=[-99999.93584555    170.91513246    142.75832408], gradient_magnitude=2275682396587.625
Round 6: weights=[-99999.93579816    174.63083061    142.71624195], gradient_magnitude=928984110636.1498
Round 7: weights=[-99999.93681401    175.69887261    140.31809341], gradient_magnitude=656307431862.7946
Round 8: weights=[-99999.93747645    177.53542414    138.70790836], gradient_magnitude=610615358663.3833
Round 9: weights=[-99999.93822543    179.08218151    136.90918105], gradient_magnitude=593078772020.329
Round 10: weights=[-99999.93892505    180.66860722    135.22345813], gradient_magnitude=578705926685.869
Round 11: weights=[-99999.93961798    182.19370698    13

Round 187: weights=[ -9.99999681e+04   2.44263659e+02   6.61579455e+01], gradient_magnitude=8252855639.31265
Round 188: weights=[ -9.99999682e+04   2.44285491e+02   6.61342386e+01], gradient_magnitude=8057045943.374624
Round 189: weights=[ -9.99999682e+04   2.44306805e+02   6.61110941e+01], gradient_magnitude=7865882086.378992
Round 190: weights=[ -9.99999682e+04   2.44327613e+02   6.60884987e+01], gradient_magnitude=7679253839.776345
Round 191: weights=[ -9.99999682e+04   2.44347928e+02   6.60664395e+01], gradient_magnitude=7497053590.330721
Round 192: weights=[ -9.99999682e+04   2.44367760e+02   6.60449036e+01], gradient_magnitude=7319176278.071498
Round 193: weights=[ -9.99999682e+04   2.44387122e+02   6.60238787e+01], gradient_magnitude=7145519335.711815
Round 194: weights=[ -9.99999682e+04   2.44406025e+02   6.60033527e+01], gradient_magnitude=6975982629.506193
Round 195: weights=[ -9.99999682e+04   2.44424479e+02   6.59833136e+01], gradient_magnitude=6810468401.5117445
Round 196:

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

In [144]:
# Build the model-2 matrix from the test data, predict with the fitted
# weights, and display the prediction for the first test row.
test_model_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)
model_test_predictions = predict_outcome(test_model_feature_matrix, model_weights)
model_test_predictions[0]

366651.41162949387

In [145]:
# Observed `price` of the first test row, for comparison with the predictions.
test_output[0]

310000.0

## Step 17
Quiz Question: Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?

## Step 18 & 19
Quiz Question: Which model (1 or 2) has lowest RSS on all of the TEST data?

In [146]:
# RSS of model 2's predictions against the test outputs; displayed below.
model_rss = get_residual_sum_of_squares(test_output, model_test_predictions)
model_rss

270263443629803.31

In [147]:
# True when model 2 has the lower RSS on the test data.
model_rss < simple_rss

True