In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model

## Steps 1 & 2
1. Read data
2. Split it into training and testing data (the split is provided here as two separate CSV files)

In [5]:
# Load the training and testing CSV files (paths are relative to the
# notebook's working directory).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

## Step 3
- Calculating slope and intercept with simple regression

In [38]:
def simple_linear_regression(input_feature, output):
    """Fit a scikit-learn ``LinearRegression`` and return its parameters.

    Parameters
    ----------
    input_feature : array-like
        Training inputs, passed unchanged to ``LinearRegression.fit`` as X.
    output : array-like
        Training targets, passed unchanged to ``LinearRegression.fit`` as y.

    Returns
    -------
    tuple
        ``(intercept_, coef_)`` of the fitted estimator, exactly as
        scikit-learn exposes them (arrays when ``output`` is 2-D, as the
        printed results elsewhere in this notebook show).
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    fitted = linear_model.LinearRegression().fit(input_feature, output)
    return fitted.intercept_, fitted.coef_

## Step 4
- Fit the model on the training data to predict ‘price’ given ‘sqft_living’

In [40]:
# Pull the feature and target columns out of the training frame and turn each
# into an (n, 1) column array before fitting.
sqft_living = train_data['sqft_living'].values.reshape(-1, 1)
price = train_data['price'].values.reshape(-1, 1)

# Fit the one-feature model and show its parameters.
intercept, slope = simple_linear_regression(input_feature=sqft_living, output=price)
print(intercept, slope)

[-47116.07907289] [[ 281.95883963]]


## Step 5
- Implementing the prediction function

In [42]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + input_feature * slope``.

    Parameters
    ----------
    input_feature : scalar or array
        Feature value(s) to predict for.
    intercept, slope : scalar or array
        Line parameters; arrays combine with ``input_feature`` under the
        usual NumPy broadcasting rules.

    Returns
    -------
    The predicted output, with whatever type/shape the arithmetic yields.
    """
    scaled_feature = input_feature * slope
    return intercept + scaled_feature

## Step 6
Quiz Question: Using your slope and intercept from (4), what is the predicted price for a house with 2650 sqft?

In [43]:
# Quiz answer: predicted price for a 2650 sqft house under the fitted line.
print(get_regression_predictions(input_feature=2650, intercept=intercept, slope=slope))

[[ 700074.84594751]]


## Step 7
- Implementing the get_residual_sum_of_squares function

In [50]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares (RSS) of a simple linear model.

    Predictions are computed as ``intercept + input_feature * slope``; the
    squared differences from ``output`` are then summed along the first axis.

    Parameters
    ----------
    input_feature : array-like
        Feature values. Lists and tuples are accepted and converted to arrays.
    output : array-like
        Observed target values, shaped like ``input_feature``.
    intercept, slope : scalar or array
        Parameters of the fitted line.

    Returns
    -------
    RSS : NumPy scalar or ndarray
        Sum over axis 0: a scalar for 1-D inputs, an array of shape ``(1,)``
        for ``(n, 1)`` column inputs.

    Notes
    -----
    Operands combine under NumPy broadcasting, so mixing an ``(n,)`` vector
    with an ``(n, 1)`` column produces an ``(n, n)`` residual matrix instead
    of an error. Keep the shapes of ``input_feature`` and ``output`` equal.
    """
    # Coerce so that plain Python sequences behave like arrays; without this,
    # ``list * slope`` would repeat the list instead of scaling it.
    input_feature = np.asarray(input_feature)
    output = np.asarray(output)

    predicted_output = intercept + (input_feature * slope)
    residuals = output - predicted_output

    # np.sum reduces in compiled code; the builtin sum() walked the array one
    # row at a time in Python. axis=0 keeps the result shape sum() produced.
    RSS = np.sum(residuals ** 2, axis=0)
    return RSS

## Step 8
Quiz Question: What is the RSS for the simple linear regression using square feet to predict prices on TRAINING data?

In [51]:
# Quiz answer: RSS of the sqft_living model on the training data.
print(get_residual_sum_of_squares(
    input_feature=sqft_living,
    output=price,
    intercept=intercept,
    slope=slope,
))

[  1.20191835e+15]


## Step 9
- Implementing the inverse_regression_predictions function

In [52]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: solve ``output = intercept + x * slope`` for x.

    Parameters
    ----------
    output : scalar or array
        Target value(s) whose corresponding input is wanted.
    intercept, slope : scalar or array
        Parameters of the fitted line.

    Returns
    -------
    The estimated input, ``(output - intercept) / slope``.
    """
    offset_from_intercept = output - intercept
    return offset_from_intercept / slope

## Step 10
Quiz Question: According to this function and the regression slope and intercept from (4), what is the estimated square feet for a house costing $800,000?

In [53]:
# Quiz answer: estimated living area for a house priced at $800,000.
print(inverse_regression_predictions(output=800000, intercept=intercept, slope=slope))

[[ 3004.39624515]]


## Step 11
Use ‘bedrooms’ (a count of the number of bedrooms in the house) to estimate prices.

In [55]:
# Second model: same target, but with the bedroom count as the single feature.
# The column is reshaped to an (n, 1) array before fitting.
bedrooms = train_data['bedrooms'].values.reshape(-1, 1)
bd_intercept, bd_slope = simple_linear_regression(input_feature=bedrooms, output=price)
print(bd_intercept, bd_slope)

[ 109473.17762296] [[ 127588.95293399]]


## Step 12
Now that we have 2 different models, compute the RSS from BOTH models on TEST data.

In [72]:
# Extract the test-set columns as (n, 1) column arrays, mirroring the layout
# used for the training data.
test_price = test_data['price'].values.reshape(-1, 1)
test_sqft_living = test_data['sqft_living'].values.reshape(-1, 1)
test_bedrooms = test_data['bedrooms'].values.reshape(-1, 1)

# Report the test RSS of each single-feature model for comparison.
print('Using bedrooms: {0}\nUsing sqft_living: {1}'.format(
    get_residual_sum_of_squares(
        input_feature=test_bedrooms, output=test_price,
        intercept=bd_intercept, slope=bd_slope),
    get_residual_sum_of_squares(
        input_feature=test_sqft_living, output=test_price,
        intercept=intercept, slope=slope),
))

Using bedrooms: [  4.93364586e+14]
Using sqft_living: [  2.75402934e+14]


## Step 13
Quiz Question: Which model (square feet or bedrooms) has the lowest RSS on TEST data? Think about why this might be the case.