# Regression Week 2: Multiple Regression (Gradient Descent)

In [3]:
import graphlab
import numpy as np
from math import sqrt

In [4]:
# Load the house-sales data into an SFrame from the 'kc_house_data.gl/' directory
# (a relative path, so it is resolved against the notebook's working directory).
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] 1449070257 : INFO:     (initialize_globals_from_environment:282): Setting configuration variable GRAPHLAB_FILEIO_ALTERNATIVE_SSL_CERT_FILE to C:\Users\antonio.rincon\AppData\Local\Continuum\Miniconda\envs\machine-learning\lib\site-packages\certifi\cacert.pem
1449070257 : INFO:     (initialize_globals_from_environment:282): Setting configuration variable GRAPHLAB_FILEIO_ALTERNATIVE_SSL_CERT_DIR to 
This non-commercial license of GraphLab Create is assigned to antonio.rincon@gmail.com and will expire on October 07, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-52392 - Server binary: C:\Users\antonio.rincon\AppData\Local\Continuum\Miniconda\envs\machine-learning\lib\site-packages\graphlab\unity_server.exe - Server log: C:\Users\ANTONI~1.RIN\AppData\Local\Temp\graphlab_server_1449070257.log.0
[INFO] GraphLab Server Version: 1.7.1


In [5]:
def get_numpy_data(data_sframe, features, output):
    """Extract a feature matrix (with intercept column) and a target array.

    Parameters
    ----------
    data_sframe : table-like object (an SFrame in this notebook) supporting
        column assignment, selection by a list of column names, and
        ``to_numpy()`` on the selected columns.
    features : list of str
        Names of the feature columns. The list itself is not modified.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array) : tuple
        ``feature_matrix`` holds the 'constant' column first, followed by the
        requested features in the given order; ``output_array`` holds the
        target column. Both come from ``to_numpy()`` (GraphLab Create >= 1.7).

    Side effects
    ------------
    A column named 'constant' filled with 1 is added to (or overwritten in)
    ``data_sframe`` itself, so the caller's frame is changed.
    """
    # The intercept term is modelled as a feature that is always 1.
    data_sframe['constant'] = 1

    # Build a new list so the caller's `features` list stays untouched.
    columns = ['constant'] + features

    feature_matrix = data_sframe[columns].to_numpy()
    target = data_sframe[output].to_numpy()
    return feature_matrix, target

In [6]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions ``np.dot(feature_matrix, weights)``.

    With a 2-D ``feature_matrix`` (one row per observation) and a 1-D
    ``weights`` vector this yields one predicted value per row.
    """
    return np.dot(feature_matrix, weights)

In [7]:
def feature_derivative(errors, feature):
    """Return twice the dot product of a feature column with the errors.

    In this notebook ``errors`` is ``predictions - output``, so the result is
    the derivative of the residual sum of squares with respect to the weight
    attached to ``feature``.
    """
    return 2 * np.dot(feature, errors)

In [41]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=400):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Parameters
    ----------
    feature_matrix : 2-D numpy array
        One row per observation, one column per weight (indexed as
        ``feature_matrix[:, i]``).
    output : array-like
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights. They are copied (and converted to float), so the
        caller's object is never modified.
    step_size : float
        Multiplier applied to each derivative in the update.
    tolerance : float
        Iteration stops once the gradient magnitude drops below this value.
    max_iterations : int, optional
        Upper bound on the number of passes (default 400, the limit that was
        previously hard-coded). At least one pass is always performed.

    Returns
    -------
    numpy array of float
        The weights after the last update.

    Notes
    -----
    A message is printed when the iteration cap is what ends the loop. If the
    gradient magnitude becomes NaN the tolerance test can never succeed, so
    the loop then runs until the cap.
    """
    # Force a float copy: with an integer dtype the in-place updates below
    # would be truncated to whole numbers and learning could silently stall.
    weights = np.array(initial_weights, dtype=float)

    iteration = 0
    while True:
        # Predictions and errors use the weights from the start of this pass,
        # so every coordinate below is updated from the same gradient.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0.0
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i].
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative * derivative
            weights[i] -= step_size * derivative

        if sqrt(gradient_sum_squares) < tolerance:
            break

        iteration += 1
        if iteration >= max_iterations:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("stop on iteration %d" % max_iterations)
            break

    return weights

In [31]:
# Split the sales data into training and test sets; the fixed seed keeps the split reproducible.
train_data, test_data = sales.random_split(0.8, seed=0)

In [32]:
# Simple model: price explained by sqft_living plus an intercept.
my_output = 'price'
simple_features = ['sqft_living']

# Gradient-descent settings for the simple model
# (two weights: the intercept first, then sqft_living).
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

# Training feature matrix (with the constant column prepended) and target vector.
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

In [33]:
# Fit the simple model on the training data.
simple_weights = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

In [34]:
# Show the fitted weights: the intercept ('constant') first, then the sqft_living coefficient.
simple_weights

array([-46999.88716555,    281.91211912])

In [36]:
# Build the test-set feature matrix and target for the simple model,
# then predict test prices with the weights learned on the training set.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)
predictions = predict_outcome(test_simple_feature_matrix, simple_weights)
predictions

array([ 356134.44317093,  784640.86422788,  435069.83652353, ...,
        663418.65300782,  604217.10799338,  240550.4743332 ])

In [37]:
# Residual sum of squares of the simple model on the test set.
# NOTE: `predictions` is reassigned further down for the two-feature model,
# so this cell must run while it still holds the simple-model predictions.
# The residuals are computed once and reduced with np.sum, instead of
# iterating the array element by element with Python's builtin sum().
simple_residuals = predictions - test_output
simple_RSS = np.sum(simple_residuals * simple_residuals)

In [39]:
# Two-feature model: price explained by sqft_living and sqft_living15 plus an intercept.
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']

# Gradient-descent settings for the two-feature model
# (three weights: the intercept first, then one per feature).
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

# Training feature matrix (with the constant column prepended) and target vector.
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

In [42]:
# Fit the two-feature model on the training data.
model_weights = regression_gradient_descent(
    feature_matrix=feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

In [44]:
# Build the test-set feature matrix and target for the two-feature model,
# then predict test prices with the weights learned on the training set.
# This rebinds `predictions`, replacing the simple-model predictions.
test_model_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)
predictions = predict_outcome(test_model_feature_matrix, model_weights)
predictions

array([ 366651.41203656,  762662.39786164,  386312.09499712, ...,
        682087.39928241,  585579.27865729,  216559.20396617])

In [47]:
# Show the actual test-set prices for comparison with the predictions above.
test_output

array([ 310000.,  650000.,  233000., ...,  610685.,  400000.,  402101.])

In [48]:
# Residual sum of squares of the two-feature model on the test set.
# At this point `predictions` holds the two-feature model's predictions.
# The residuals are computed once and reduced with np.sum, instead of
# iterating the array element by element with Python's builtin sum().
model_residuals = predictions - test_output
model_RSS = np.sum(model_residuals * model_residuals)

In [49]:
# Compare the test-set RSS of the simple and the two-feature model.
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print(simple_RSS)
print(model_RSS)

2.75400047593e+14
2.70263446465e+14
