In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model

## Steps 1 & 2
1. Read data
2. Split it into training and testing data (here the split is already provided as separate train/test CSV files)

In [2]:
# Load the pre-split training and testing sets from their CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/kc_house_train_data.csv', 'data/kc_house_test_data.csv')
)

## Step 3
- Feature transformation

In [3]:
def add_derived_features(frame):
    """Add the four engineered feature columns to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must already contain the columns ``bedrooms``, ``bathrooms``,
        ``sqft_living``, ``lat`` and ``long``.

    Returns
    -------
    None
        ``frame`` is mutated; the new columns are ``bedrooms_squared``,
        ``bed_bath_rooms``, ``log_sqft_living`` and ``lat_plus_long``.
    """
    # Assignment order is kept deliberately: it determines column order,
    # which matters to any label-range slice taken over these columns.
    frame['bedrooms_squared'] = frame['bedrooms'] ** 2
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    frame['lat_plus_long'] = frame['lat'] + frame['long']


# One definition applied to both splits, so the train and test
# transformations cannot drift apart.
for split_frame in (train_data, test_data):
    add_derived_features(split_frame)

## Step 4
- Calculate the mean of each of the 4 new variables on the TEST data.

In [4]:
# Mean of every column from 'bedrooms_squared' through 'lat_plus_long'
# (label slice, both ends inclusive) on the test split, shown to 2 decimals.
new_feature_means = test_data.loc[:, 'bedrooms_squared':'lat_plus_long'].mean()
new_feature_means.round(2)

bedrooms_squared    12.45
bed_bath_rooms       7.50
log_sqft_living      7.55
lat_plus_long      -74.65
dtype: float64

## Step 5
- Fit 3 different linear regression models on the TRAINING data, each one adding features to the previous model

In [5]:
# Target shared by all three models: the sale price in the training split.
output = train_data['price']

# Each model's column set extends the previous one.
model_1_columns = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_columns = model_1_columns + ['bed_bath_rooms']
model_3_columns = model_2_columns + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

feature_1 = train_data[model_1_columns]
feature_2 = train_data[model_2_columns]
feature_3 = train_data[model_3_columns]

# LinearRegression.fit returns the fitted estimator, so construct-and-fit
# can be written as a single expression per model.
model_1 = linear_model.LinearRegression().fit(feature_1, output)
model_2 = linear_model.LinearRegression().fit(feature_2, output)
model_3 = linear_model.LinearRegression().fit(feature_3, output)

# Bare last expression so the cell still displays the fitted estimator.
model_3

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Step 6
Quiz Question:  What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?

In [6]:
# Coefficients are listed in the column order model_1 was fitted with:
# sqft_living, bedrooms, bathrooms, lat, long -- 'bathrooms' is the third entry.
model_1.coef_

array([  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
         6.58619264e+05,  -3.09374351e+05])

## Step 7
Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?

In [7]:
# Coefficients are listed in the column order model_2 was fitted with:
# sqft_living, bedrooms, bathrooms, lat, long, bed_bath_rooms --
# 'bathrooms' is the third entry.
model_2.coef_

array([  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
         6.54844630e+05,  -2.94298969e+05,   2.55796520e+04])

## Steps 9 & 10
- Now, using your three estimated models, compute the RSS (Residual Sum of Squares) on the Training data.

Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data?

In [14]:
def get_residual_sum_of_squares(input_feature, output, model):
    """Return the residual sum of squares (RSS) of ``model`` on a data set.

    Parameters
    ----------
    input_feature : array-like
        Feature values; passed unchanged to ``model.predict``.
    output : array-like
        Observed target values, one per row of ``input_feature``.
    model : object
        Any object exposing a ``predict(input_feature)`` method.

    Returns
    -------
    numpy scalar (or array for 2-D targets)
        Sum over the rows of ``(output - prediction) ** 2``.
    """
    # Convert both sides to plain arrays so the subtraction is purely
    # positional, whatever container ``output`` arrives in.
    residuals = np.asarray(output) - np.asarray(model.predict(input_feature))
    # Vectorised reduction; the builtin sum() would walk the values one by one
    # in Python. axis=0 sums over rows, matching what iterating rows would do.
    return np.sum(residuals ** 2, axis=0)

# Training RSS for models 1, 2 and 3, in that order.
for train_features, fitted_model in zip(
    (feature_1, feature_2, feature_3),
    (model_1, model_2, model_3),
):
    print(get_residual_sum_of_squares(train_features, output, fitted_model))

9.6787996305e+14
9.58419635074e+14
9.0343645505e+14


## Steps 11 & 12
- Now, using your three estimated models, compute the RSS on the Testing data.

Quiz Question: Which model (1, 2, or 3) had the lowest RSS on TESTING data?

In [15]:
# NOTE(review): this cell rebinds `output` and `feature_1..3` from the
# training split to the TEST split. Re-running the training-RSS cell after
# this one would therefore report test numbers; re-run the model-fitting
# cell first to restore the training bindings.
output = test_data['price']

test_columns_1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
test_columns_2 = test_columns_1 + ['bed_bath_rooms']
test_columns_3 = test_columns_2 + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

feature_1 = test_data[test_columns_1]
feature_2 = test_data[test_columns_2]
feature_3 = test_data[test_columns_3]

# Test RSS for models 1, 2 and 3, in that order.
for test_features, fitted_model in zip(
    (feature_1, feature_2, feature_3),
    (model_1, model_2, model_3),
):
    print(get_residual_sum_of_squares(test_features, output, fitted_model))

2.25500469795e+14
2.23377462976e+14
2.59236319207e+14
