In [11]:
import graphlab

In [12]:
# Load the house sales data set stored in the local 'kc_house_data.gl/' directory
# into an SFrame; every later cell reads from `sales`.
sales = graphlab.SFrame('kc_house_data.gl/')

In [13]:
# Show the SFrame as the cell output to inspect its columns and first rows.
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [10]:
# Randomly split the rows into train_data and test_data; the fixed seed keeps
# the split repeatable between runs.
# NOTE(review): presumably .8 is the fraction of rows sent to train_data --
# confirm against the SFrame.random_split documentation.
train_data,test_data=sales.random_split(.8,seed=0)

In [14]:
# Generic Linear Regression Model

In [27]:
#Approach 1: Set gradient = 0
#Formula: slope = (Sum(Xi*Yi) - Sum(Xi)*Sum(Yi)/N) / (Sum(Xi^2) - Sum(Xi)*Sum(Xi)/N), where Xi = input feature values, Yi = output values, N = number of observations
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by closed-form least squares.

    The slope is obtained by setting the gradient of the squared error to zero:

        slope     = (sum(x*y) - sum(x)*sum(y)/N) / (sum(x*x) - sum(x)*sum(x)/N)
        intercept = mean(y) - slope * mean(x)

    Parameters
    ----------
    input_feature : array-like
        Input values x. Must support ``len()``, ``.sum()``, ``.mean()`` and
        element-wise ``*`` (the notebook passes SArray columns).
    output : array-like
        Observed values y, with the same requirements as ``input_feature``.

    Returns
    -------
    (intercept, slope) : tuple of float

    Raises
    ------
    ZeroDivisionError
        If the input is empty, or if every value of ``input_feature`` is the
        same (zero variance), in which case the slope is undefined.
    """
    num_points = len(input_feature)
    if num_points == 0:
        raise ZeroDivisionError("cannot fit a regression line to empty input")

    # float() forces true division below. Under Python 2, integer-valued inputs
    # produce integer sums, and `/` would silently floor the slope.
    sum_x = float(input_feature.sum())
    sum_y = float(output.sum())
    sum_xy = float((output * input_feature).sum())
    sum_xx = float((input_feature * input_feature).sum())

    numerator = sum_xy - (sum_y * sum_x) / num_points
    denominator = sum_xx - (sum_x * sum_x) / num_points
    if denominator == 0:
        raise ZeroDivisionError(
            "input_feature has zero variance; the slope is undefined")

    slope = numerator / denominator
    # intercept = mean(y) - slope * mean(x)
    intercept = float(output.mean()) - slope * float(input_feature.mean())

    return (intercept, slope)


In [16]:
Test the Simple Linear Regression

540088.1419053345

In [47]:
# Sanity check: build five points that lie exactly on the line y = 1 + 1*x,
# so the fitted intercept and slope should both come out as 1.
test_input = graphlab.SArray(range(5))
test_output = graphlab.SArray(1 + 1 * test_input)
(t_intercept, t_slope) = simple_linear_regression(test_input,test_output)
print(t_intercept)
print(t_slope)

1.0
1


In [48]:
# Fit price as a linear function of sqft_living using only the training rows.
intercept,slope = simple_linear_regression(train_data['sqft_living'],train_data['price']) 

In [49]:
# Show the intercept and slope of the fitted sqft_living model.
print (intercept)
print (slope)

-47116.0765749
281.958838568


# Predicting values

In [50]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line at ``input_feature``.

    Returns ``intercept + slope * input_feature``. The result is whatever the
    arithmetic yields for the given operands: a single number for a scalar
    input, or an element-wise result for inputs that support it.
    """
    return intercept + (slope * input_feature)

In [39]:
# Predict the price of a house from its living area with the fitted sqft_living model.
new_house_sqft = 2650
new_house_price = get_regression_predictions(new_house_sqft, intercept, slope)
# Build the label from new_house_sqft so the message cannot drift from the value used.
print("Estimated price of the house with %s sq ft is :" % new_house_sqft, new_house_price)

('Estimated price of the house with 2650 sq ft is :', 700074.8456294573)


# Residual Sum of Squares

Evaluate our regression model using the Residual Sum of Squares (RSS). RSS is the sum of the squared differences between the true output and the predicted output.

In [41]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a fitted line.

    Each residual is ``output - (intercept + slope * input_feature)``; the
    residuals are squared element-wise and totalled with ``.sum()``.
    """
    errors = output - (intercept + (slope * input_feature))
    return (errors * errors).sum()

In [51]:
# Sanity check with test_input and test_output: those points were fitted by
# t_intercept and t_slope, so the printed RSS should be 0.
# print(...) with parentheses matches the other cells and also runs on Python 3.
print(get_residual_sum_of_squares(test_input, test_output, t_intercept, t_slope))

0.0


In [52]:
# Calculate the RSS of the sqft_living model (intercept, slope) on the
# training rows and print it together with a label.
RSS_prices = get_residual_sum_of_squares(train_data['sqft_living'],train_data['price'],intercept,slope)
print ("Residual Sum of Squares of predicting prices using sqft is :",RSS_prices)

('Residual Sum of Squares of predicting prices using sqft is :', 1201918356321967.5)


# Predict Square feet of a new house using Price for buyers

In [56]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: find the input at which the model predicts ``output``.

    Solves ``output = intercept + slope * x`` for ``x``, i.e.
    ``x = (output - intercept) / slope``. In this notebook ``output`` is a
    price and the result is the estimated square footage.

    Parameters
    ----------
    output : number or array-like
        Target value(s) of the model output.
    intercept, slope : number
        Parameters of the fitted line. ``slope`` must be non-zero; with plain
        Python numbers a zero slope raises ZeroDivisionError.

    Returns
    -------
    The estimated input value(s).
    """
    # float(slope) forces true division: under Python 2 an all-integer call
    # such as (5, 0, 2) would otherwise floor to 2 instead of 2.5.
    return (output - intercept) / float(slope)

In [61]:
# Estimate the living area the fitted sqft_living model associates with a given price.
my_price = 800000
estimated_sqft = inverse_regression_predictions(my_price, intercept, slope)
# Build the label from my_price so the message cannot drift from the value used.
print("Estimated square feet for the price %s :" % my_price, estimated_sqft)

('Estimated square feet for the price 800000 :', 3004.3962476159477)


# New Prediction model: Use the number of bedrooms as the feature instead of Sqft_living

In [67]:
# Bedrooms model: fit price against the number of bedrooms on the training
# data, then calculate the Residual Sum of Squares of that fit on the test data.

BR_intercept,BR_slope = simple_linear_regression(train_data['bedrooms'],train_data['price'])
print (BR_intercept,BR_slope)
RSS_prices_BR = get_residual_sum_of_squares(test_data['bedrooms'],test_data['price'],BR_intercept,BR_slope)
print ("Residual Sum of Squares of predicting prices using bedroom is :",RSS_prices_BR) 

(109473.18046928657, 127588.95217458377)
('Residual Sum of Squares of predicting prices using bedroom is :', 493364582868288.0)


In [68]:
# Square-feet model: fit price against sqft_living on the training data, then
# calculate the Residual Sum of Squares of that fit on the test data.
# The results get their own sqft_* names so they do not overwrite the
# bedrooms model stored in BR_intercept, BR_slope and RSS_prices_BR.

sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])
print (sqft_intercept, sqft_slope)
RSS_prices_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)
print ("Residual Sum of Squares of predicting prices using sqft is :", RSS_prices_sqft)

(-47116.07657494082, 281.9588385676974)
('Residual Sum of Squares of predicting prices using sqft is :', 275402936247141.3)


# Summary: On the test data, the model that uses sqft_living has a lower RSS (about 2.75e14) than the model that uses bedrooms (about 4.93e14), so sqft_living is the better single predictor of price.