# Week 2: Multiple Regression

In [1]:
import pandas as pd
import numpy as np

## Load training and testing data

In [2]:
# Read the training and test CSVs from the working directory in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./kc_house_train_data.csv', './kc_house_test_data.csv')
)

In [4]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [6]:
# List the column labels available in the training frame.
df_train.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

## Learning a multiple regression model

In [3]:
from sklearn.linear_model import LinearRegression

In [7]:
# Fit a first linear model of price on three raw columns. `fit` returns the
# estimator itself, so the chained call binds the fitted model; the trailing
# bare name displays it as the cell output.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = LinearRegression().fit(df_train[example_features], df_train['price'])
example_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
# Pull the fitted parameters out of the example model.
intercept, coef = example_model.intercept_, example_model.coef_

In [9]:
# Show the intercept taken from example_model.
intercept

87912.86581496481

In [13]:
# One-column table of coefficients, labelled by the feature each belongs to.
pd.DataFrame({'coef': coef}, index=example_features)

Unnamed: 0,coef
sqft_living,315.406691
bedrooms,-65081.887116
bathrooms,6942.165986


## Make predictions

In [14]:
# Predict prices for the training rows, using the same feature columns the model was fitted on.
example_predictions = example_model.predict(df_train[example_features])

In [15]:
# Show the prediction for the first training row.
example_predictions[0]

271789.26537996985

## Compute RSS

In [16]:
def get_residual_sum_of_squares(model, X_pred, y_true, features=['sqft_living', 'bedrooms', 'bathrooms']):
    """Return the residual sum of squares (RSS) of a model on the given data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``predict`` method.
    X_pred : table-like
        Data that can be indexed with a list of column labels (``X_pred[features]``).
    y_true : array-like
        Observed target values, one per row of ``X_pred``.
    features : list of str
        Column labels passed to ``model.predict``.

    Returns
    -------
    The sum over all rows of ``(prediction - y_true) ** 2``.
    """
    residuals = model.predict(X_pred[features]) - y_true
    return np.square(residuals).sum()

In [17]:
# RSS of the example model on the test frame.
get_residual_sum_of_squares(example_model, df_test, df_test['price'], features=example_features)

273761940583133.75

## Create some new features

In [18]:
from math import log

In [19]:
# Squared bedroom count, added to both frames. A vectorized power on the whole
# column replaces the per-row Python lambda; the values are the same.
df_train['bedrooms_squared'] = df_train['bedrooms'] ** 2
df_test['bedrooms_squared'] = df_test['bedrooms'] ** 2

In [20]:
# Interaction term: bedrooms multiplied by bathrooms.
df_train['bed_bath_rooms'] = df_train['bedrooms'] * df_train['bathrooms']
# Natural log of living area, computed with NumPy's element-wise log on the
# whole column instead of calling math.log once per row.
# NOTE: for non-positive inputs np.log yields -inf/NaN with a RuntimeWarning,
# whereas math.log would raise ValueError.
df_train['log_sqft_living'] = np.log(df_train['sqft_living'])
# Sum of the latitude and longitude columns.
df_train['lat_plus_long'] = df_train['lat'] + df_train['long']


In [21]:
# Interaction term: bedrooms multiplied by bathrooms.
df_test['bed_bath_rooms'] = df_test['bedrooms'] * df_test['bathrooms']
# Natural log of living area, computed with NumPy's element-wise log on the
# whole column instead of calling math.log once per row.
# NOTE: for non-positive inputs np.log yields -inf/NaN with a RuntimeWarning,
# whereas math.log would raise ValueError.
df_test['log_sqft_living'] = np.log(df_test['sqft_living'])
# Sum of the latitude and longitude columns.
df_test['lat_plus_long'] = df_test['lat'] + df_test['long']

### Quiz: What is the mean (arithmetic average) value of your 4 new features on TEST data?

In [22]:
# Column-wise mean of the four engineered features on the test frame.
df_test[['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']].mean()

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653334
dtype: float64

## Learning Multiple Models

In [23]:
# Nested feature sets: each list extends the previous one with extra columns.
model1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model2_features = [*model1_features, 'bed_bath_rooms']
model3_features = [*model2_features, 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [24]:
# Fit model 1 on its feature set; `fit` returns the estimator, and the bare
# name on the last line displays it.
model1 = LinearRegression().fit(df_train[model1_features], df_train['price'])
model1

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [25]:
# Fit model 2 on its feature set; `fit` returns the estimator, and the bare
# name on the last line displays it.
model2 = LinearRegression().fit(df_train[model2_features], df_train['price'])
model2

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [26]:
# Fit model 3 on its feature set; `fit` returns the estimator, and the bare
# name on the last line displays it.
model3 = LinearRegression().fit(df_train[model3_features], df_train['price'])
model3

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [34]:
# Compare intercept and coefficients across the three models, one row per model.
# Each model's values are labelled with that model's own feature names, so the
# table is aligned by name rather than by list position (positional alignment is
# only correct while every feature list is a prefix of model3_features).
# Features a model does not use appear as NaN.
pd.DataFrame(
    {
        label: pd.Series([model.intercept_, *model.coef_], index=['intercept', *features])
        for label, model, features in (
            ('model1', model1, model1_features),
            ('model2', model2, model2_features),
            ('model3', model3, model3_features),
        )
    }
).T.reindex(columns=['intercept'] + model3_features)

Unnamed: 0,intercept,sqft_living,bedrooms,bathrooms,lat,long,bed_bath_rooms,bedrooms_squared,log_sqft_living,lat_plus_long
model1,-69075730.0,312.258646,-59586.533154,15706.742083,658619.263931,-309374.351268,,,,
model2,-66867970.0,306.610053,-113446.36807,-71461.308293,654844.629503,-294298.969138,25579.652001,,,
model3,-62036080.0,529.42282,34514.229578,67060.781319,534085.610867,-406750.710861,-8570.504395,-6788.58667,-561831.484076,127334.900006


## RSS on training data

In [35]:
# RSS of each model on the training frame, in the order model1, model2, model3.
[
    get_residual_sum_of_squares(model, df_train, df_train['price'], features)
    for model, features in (
        (model1, model1_features),
        (model2, model2_features),
        (model3, model3_features),
    )
]

[967879963049546.4, 958419635074068.2, 903436455050477.4]

## RSS on testing data

In [36]:
# RSS of each model on the test frame, in the order model1, model2, model3.
[
    get_residual_sum_of_squares(model, df_test, df_test['price'], features)
    for model, features in (
        (model1, model1_features),
        (model2, model2_features),
        (model3, model3_features),
    )
]

[225500469795490.2, 223377462976466.72, 259236319207178.5]