# Part 5 - Multiple Linear Regression

In the previous notebook we trained a Simple Linear Regressor with a single feature (`sqft`).  
In this notebook we aim to extend the capability of our model by using multiple features for our independent variable `X`.  
Our equation is the same however our `X` is now a matrix and the equation no longer represents a line but rather a hyperplane in N-dimensional space where **N** is the number of features in `X`.
  
$$
y = mX + b
$$ 
  


In [2]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline

## Let's load the data and remind ourselves of the contents

In [3]:
df = pd.read_csv('./data/sf/data_clean_engineered.csv')
df.head()

Unnamed: 0,bath,bed,sqft,price,property_type_apartment,property_type_auction,property_type_coming,property_type_condo,property_type_coop,property_type_house,...,postal_code_94121,postal_code_94122,postal_code_94123,postal_code_94124,postal_code_94127,postal_code_94131,postal_code_94132,postal_code_94133,postal_code_94134,postal_code_94501
0,2.0,3.0,1520.0,1995000.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,566.0,625000.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1.0,914.0,1196000.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.5,1.0,1022.0,935000.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,2.0,1912.0,2750000.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Prepare our training and validation data

In [4]:
df.columns

Index(['bath', 'bed', 'sqft', 'price', 'property_type_apartment',
       'property_type_auction', 'property_type_coming', 'property_type_condo',
       'property_type_coop', 'property_type_house', 'property_type_lot',
       'property_type_new', 'postal_code_94102', 'postal_code_94103',
       'postal_code_94104', 'postal_code_94105', 'postal_code_94107',
       'postal_code_94108', 'postal_code_94109', 'postal_code_94110',
       'postal_code_94111', 'postal_code_94112', 'postal_code_94114',
       'postal_code_94115', 'postal_code_94116', 'postal_code_94117',
       'postal_code_94118', 'postal_code_94121', 'postal_code_94122',
       'postal_code_94123', 'postal_code_94124', 'postal_code_94127',
       'postal_code_94131', 'postal_code_94132', 'postal_code_94133',
       'postal_code_94134', 'postal_code_94501'],
      dtype='object')

In [5]:
features = [feature for feature in df.columns if feature != 'price']
X = df[features]
y = df['price']
X_np = X.values
y_np = y.values.reshape((len(df), 1))

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_np, y_np, test_size=0.30, random_state=123) # split 70% train, 30% validation

In [7]:
regressor = LinearRegression(normalize=True)

In [8]:
model = regressor.fit(X_train, y_train)

In [9]:
def evaluate_model(model, X, y):
    y_pred = model.predict(X) # predict y values from input X
    mse = mean_squared_error(y_true=y, y_pred=y_pred)
    print("Mean Squared Error: {}".format(mse))
    print("Accuracy: {}%".format(model.score(X, y)*100.0))
evaluate_model(model, X_val, y_val)

Mean Squared Error: 306575104434.63226
Accuracy: 66.27434714321974%


In [11]:
# try brand new data
# actual_price = '$583,000'
# sqft = 800
# bed = 1
# bath = 1
# property_type = 'condo'
# postal_code = '94124'
actual_price = '$1,695,000'
sqft = 1509
bed = 2
bath = 2
property_type = 'condo'
postal_code = '94158'
new_data = {'sqft': sqft,
            'bed': bed,
            'bath': bath,
            'property_type_{}'.format(property_type): 1,
            'postal_code_{}'.format(postal_code): 1
           }
new_df = pd.get_dummies(pd.DataFrame(data=[new_data], columns=X.columns).fillna(0))
new_df
print(X.shape)
predicted_price = model.predict(new_df)
print("predicted price: ${}M".format(predicted_price[0]/1e6))
print("actual price: {}".format(actual_price))

(514, 36)
predicted price: $[-3.14689522e+11]M
actual price: $1,695,000


## Retrain on entire dataset and save model to disk

In [16]:
import pickle
model = regressor.fit(X, y)
with open('./models/sf/multiple_linear.pkl', 'wb') as f:
    pickle.dump(model, f)