# Regression: Simple Linear Regression

In [1]:
# Fire up Turi Create
import turicreate

In [2]:
# Load house sales data
# The dataset contains house sales in King County, the region where the city of Seattle, WA is located.
# The CSV is read into an SFrame; the column types are inferred from the first lines of the file (see the log printed below).
sales = turicreate.SFrame('kc_house_data.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,float,int,float,int,int,float,int,int,int,int,int,int,int,int,int,float,float,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [4]:
# Display the sales SFrame (a bare expression at the end of a cell is rendered as a preview of its rows)
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view
7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0
6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0
5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0
2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0
1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0
7237550310,20140512T000000,1225000.0,4,4.5,5420,101930,1.0,0,0
1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0
2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0
2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0
3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0

condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15
3,7,1180,0,1955,0,98178,47.5112,-122.257,1340
3,7,2170,400,1951,1991,98125,47.721,-122.319,1690
3,6,770,0,1933,0,98028,47.7379,-122.233,2720
5,7,1050,910,1965,0,98136,47.5208,-122.393,1360
3,8,1680,0,1987,0,98074,47.6168,-122.045,1800
3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760
3,7,1715,0,1995,0,98003,47.3097,-122.327,2238
3,7,1060,0,1963,0,98198,47.4095,-122.315,1650
3,7,1050,730,1960,0,98146,47.5123,-122.337,1780
3,7,1890,0,2003,0,98038,47.3684,-122.031,2390

sqft_lot15
5650
7639
8062
5000
7503
101930
6819
9711
8113
7570


In [6]:
# Split data into training and testing sets
# .8 is the split fraction handed to random_split (presumably ~80% of the rows end up in train_data -- confirm in the Turi Create docs);
# seed=0 pins the random seed so the same split is produced on every run.
train_data,test_data = sales.random_split(.8,seed=0)

In [9]:
## Useful SFrame summary functions
# Let's compute the mean of the house prices in King County in two different ways.
prices = sales['price'] # extract the price column of the sales SFrame -- this is now an SArray

# Recall that the arithmetic average (the mean) is the sum of the prices divided by the total number of houses:
sum_prices = prices.sum()
num_houses = len(prices) # when prices is an SArray, len() returns its length
avg_price_1 = sum_prices/num_houses
avg_price_2 = prices.mean() # if you just want the average, the .mean() function computes it directly
print ("average price via method 1: " + str(avg_price_1))
print ("average price via method 2: " + str(avg_price_2))
### As we see, we get the same answer both ways (up to floating-point rounding in the last digits)

average price via method 1: 540088.1417665294
average price via method 2: 540088.1417665299


In [12]:
# If we want to multiply every price by 0.5, it's as simple as:
half_prices = 0.5*prices
# Let's compute the sum of squares of price. We can also multiply two SArrays of the same length elementwise with *
prices_squared = prices*prices
sum_prices_squared = prices_squared.sum() # prices_squared is an SArray of the squares and we want to add them up.
print ("the sum of price squared is: " + str(sum_prices_squared))
### Aside: The Python notation x.xxe+yy means x.xx * 10^(yy), e.g. 100 = 10^2 = 1*10^2 = 1e2

the sum of price squared is: 9217325138472070.0


In [14]:
## Build a generic simple linear regression function 
# define simple linear regression function
def simple_lin_reg(input_feature, output, data=None):
    """Fit a one-feature linear regression and report its intercept and slope.

    Parameters
    ----------
    input_feature : str
        Name of the single column used as the predictor.
    output : str
        Name of the target column.
    data : SFrame, optional
        Data to fit on. Defaults to the notebook-level ``train_data`` so that
        existing two-argument calls behave exactly as before.

    Returns
    -------
    tuple
        ``('intercept:', intercept, 'slope:', slope)``.
    """
    if data is None:
        data = train_data
    reg_model = turicreate.linear_regression.create(
        data,
        target=output,
        features=[input_feature],
        # No separate validation set is used; the model is fit on `data` only.
        validation_set=None,
        verbose=False,
    )
    # The coefficients table is read positionally: row 0 is taken as the
    # intercept and row 1 as the coefficient of `input_feature`, which matches
    # the outputs recorded in this notebook.
    coefficient_values = reg_model.coefficients['value']
    intercept = coefficient_values[0]
    slope = coefficient_values[1]
    return ('intercept:', intercept, 'slope:', slope)

In [15]:
# Call the simple linear regression function: fit 'price' as a linear function of 'sqft_living' on the training data
simple_lin_reg('sqft_living','price')

('intercept:', -47114.02316816081, 'slope:', 281.95785122862833)

In [16]:
# Save slope and intercept
# They are taken from the fitted model rather than pasted from the printed
# output, so they cannot silently go stale if the data or the split changes.
# simple_lin_reg returns ('intercept:', intercept, 'slope:', slope).
sqft_living_fit = simple_lin_reg('sqft_living', 'price')
squarefeet_intercept = sqft_living_fit[1]
# NOTE: the spelling `squarfeet_slope` (sic) is kept because later cells refer to this name.
squarfeet_slope = sqft_living_fit[3]
## Now that we know it works let's build a regression model for predicting price based on sqft_living.
## Remember that we train on train_data!

In [23]:
## Predicting Values
# write new function for predictions
def get_regression_predictions(input_feature, slope, intercept, data=None):
    """Evaluate the fitted line ``slope * x + intercept`` on one column.

    Parameters
    ----------
    input_feature : str
        Name of the column holding the x values.
    slope : float
        Slope of the fitted line.
    intercept : float
        Intercept of the fitted line.
    data : mapping of column name -> array-like, optional
        Table to read ``input_feature`` from (e.g. an SFrame). Defaults to the
        notebook-level ``train_data`` so existing three-argument calls are
        unchanged; pass ``test_data`` (or any other table) to predict elsewhere.

    Returns
    -------
    The elementwise predictions, in whatever array type ``data[input_feature]``
    produces under ``*`` and ``+``.
    """
    if data is None:
        data = train_data
    predicted_output = slope * data[input_feature] + intercept
    return predicted_output

In [31]:
# Run the prediction function for the sqft_living model and store the result
# as a new 'squarfeet_predict' column of train_data (this adds a column to train_data)
train_data['squarfeet_predict'] = get_regression_predictions('sqft_living',squarfeet_slope,squarefeet_intercept)

In [32]:
# Residual sum of squares (RSS) on the training data:
# the sum of squared differences between the actual price and the predicted price
sum((train_data['price'] - train_data['squarfeet_predict'])**2)

1201918354191709.5

In [33]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope, data=None):
    """Return the residual sum of squares (RSS) of a fitted line.

    RSS is the sum over all rows of ``(actual - predicted) ** 2`` where
    ``predicted = slope * x + intercept``.

    Parameters
    ----------
    input_feature : str
        Name of the column holding the x values.
    output : str
        Name of the column holding the observed y values.
    intercept : float
        Intercept of the fitted line.
    slope : float
        Slope of the fitted line.
    data : mapping of column name -> array-like, optional
        Table to evaluate on (e.g. an SFrame). Defaults to the notebook-level
        ``train_data`` so existing four-argument calls are unchanged.

    Returns
    -------
    The RSS as a single number.
    """
    if data is None:
        data = train_data
    predicted = slope * data[input_feature] + intercept
    residuals = data[output] - predicted
    # Built-in sum() iterates the squared residuals, so any iterable of numbers works here.
    RSS = sum(residuals ** 2)
    return RSS

In [35]:
# Compute the residual sum of squares for the sqft_living model via the helper function
## Note: this should be the same value as the RSS computed directly in the previous cell
get_residual_sum_of_squares('sqft_living', 'price', squarefeet_intercept, squarfeet_slope)

1201918354191709.5

In [43]:
# Predict the squarefeet given price
# Basically this is a function for computing the inverse from a simple linear regression
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: return the x for which slope * x + intercept == output.

    output    -- the observed/target y value
    intercept -- intercept of the fitted line
    slope     -- slope of the fitted line
    """
    return (output - intercept) / slope

In [45]:
# Run the inverse function: estimate the sqft_living that the fitted line associates with a price of 800000
inverse_regression_predictions(800000,squarefeet_intercept,squarfeet_slope )

3004.3994855148403

In [49]:
### New Model: estimate prices from bedrooms
### Test your Linear Regression Algorithm
# Estimate the slope and intercept for predicting 'price' based on 'bedrooms'
# (same helper as for sqft_living, only the input feature changes)
simple_lin_reg('bedrooms','price')

('intercept:', 109476.84175773867, 'slope:', 127587.86544796114)

In [50]:
# Save these new parameter estimates
# They are taken from the fitted model rather than pasted from the printed
# output, so they cannot silently go stale if the data or the split changes.
# simple_lin_reg returns ('intercept:', intercept, 'slope:', slope).
bedrooms_fit = simple_lin_reg('bedrooms', 'price')
bed_intercept = bedrooms_fit[1]
bed_slope = bedrooms_fit[3]

In [51]:
# Get predictions for the bedrooms model and store them as a new 'bedroom_predict' column of train_data
train_data['bedroom_predict'] = get_regression_predictions('bedrooms',bed_slope,bed_intercept)

In [53]:
# Get the RSS for the bedrooms model (computed on the training data)
get_residual_sum_of_squares('bedrooms', 'price', bed_intercept, bed_slope)

2143244498178887.5

In [54]:
# Create a function that computes RSS for the 'test' dataset
def get_residual_sum_of_squares_test(input_feature, output, intercept, slope, data=None):
    """Return the residual sum of squares (RSS) of a fitted line on held-out data.

    RSS is the sum over all rows of ``(actual - predicted) ** 2`` where
    ``predicted = slope * x + intercept``.

    Parameters
    ----------
    input_feature : str
        Name of the column holding the x values.
    output : str
        Name of the column holding the observed y values.
    intercept : float
        Intercept of the fitted line.
    slope : float
        Slope of the fitted line.
    data : mapping of column name -> array-like, optional
        Table to evaluate on (e.g. an SFrame). Defaults to the notebook-level
        ``test_data`` so existing four-argument calls are unchanged.

    Returns
    -------
    The RSS as a single number.
    """
    if data is None:
        data = test_data
    predicted = slope * data[input_feature] + intercept
    residuals = data[output] - predicted
    # Built-in sum() iterates the squared residuals, so any iterable of numbers works here.
    RSS = sum(residuals ** 2)
    return RSS

In [55]:
# Compute RSS on the TEST data for the bedrooms model (parameters were fit on the training data):
get_residual_sum_of_squares_test('bedrooms', 'price', bed_intercept, bed_slope)

493364347210739.1

In [56]:
# Compute RSS on the TEST data for the sqft_living model (parameters were fit on the training data):
get_residual_sum_of_squares_test('sqft_living', 'price', squarefeet_intercept, squarfeet_slope)

275402885496450.78