In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [37]:
# Load the Saratoga house-price table. Column 0 (Price) is the regression
# target; the remaining columns are the predictors.
orig_data = pd.read_csv('saratoga-house-prices.csv')
# `as_matrix()` was removed in pandas 1.0; `to_numpy()` is the supported way
# to get the underlying NumPy array from a DataFrame or Series.
X = orig_data.iloc[:, 1:].to_numpy()
Y = orig_data.iloc[:, 0].to_numpy()
print(X.shape)
print(Y.shape)
orig_data.head(5)

(1063, 6)
(1063,)


Unnamed: 0,Price,Size,Baths,Bedrooms,Fireplace,Acres,Age
0,142.212,1.982,1.0,3,0,2.0,133
1,134.865,1.676,1.5,3,1,0.38,14
2,118.007,1.694,2.0,3,1,0.96,15
3,138.297,1.8,1.0,2,1,0.48,49
4,129.47,2.088,1.0,3,1,1.84,29


In this dataset, we have the prices of houses (in dollars per sq. ft) along with some properties of each house, such as area, number of bedrooms, age of the house, etc. The goal is to create a model for the price as a function of those properties.

In [45]:
# Fix the seed so the 50/50 split (and every RMSE derived from it) is the
# same on each Restart & Run All.
SPLIT_SEED = 0
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.50, random_state=SPLIT_SEED)
print(X_train.shape, X_test.shape)

(531, 6) (532, 6)


In [46]:
def get_rmse(degree):
    """Fit a polynomial-feature linear regression and report its RMSE.

    The model is fitted on the notebook-level ``X_train``/``Y_train`` and
    scored on both that training split and ``X_test``/``Y_test``.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    tuple of float
        ``(training_rmse, test_rmse)``.
    """
    # interaction_only=True restricts the expansion to products of distinct
    # features (no pure powers of a single feature). The expansion's default
    # bias column plays the role of the intercept, hence fit_intercept=False.
    model = Pipeline([('poly', PolynomialFeatures(degree=degree, interaction_only=True)),
                      ('linear', LinearRegression(fit_intercept=False))])
    model.fit(X_train, Y_train)
    Y_train_pred = model.predict(X_train)
    Y_test_pred = model.predict(X_test)
    # sklearn metrics take (y_true, y_pred) in that order; keeping to the
    # documented order stays correct if the metric is ever swapped for an
    # asymmetric one.
    training_rmse = math.sqrt(mean_squared_error(Y_train, Y_train_pred))
    test_rmse = math.sqrt(mean_squared_error(Y_test, Y_test_pred))
    return (training_rmse, test_rmse)

In [47]:
# Baseline: training and test RMSE for the degree-1 fit.
rmse_pair = get_rmse(degree=1)
tr, te = rmse_pair
print(tr, te)

52.861410736676966 52.293790731908686


We have chosen the **Polynomial Regression** model, which is a generalization of the linear model (a degree-1 polynomial is a linear function). The function `get_rmse()` returns a tuple containing the training and test RMSE values for a polynomial of degree `degree`.

Your exercise is to calculate the errors for each degree in the range [1, 5] (store that range in the variable `degrees`) and collect the training and test errors in the lists `training_error` and `test_error`, respectively. Then uncomment the code below and run it to see a graph of how the training error and the test error relate to each other.

In [48]:
# Exercise scaffold: uncomment once `degrees`, `training_error` and
# `test_error` have been defined.
#plt.plot(degrees, training_error, label="Training Error")
#plt.plot(degrees, test_error, label="Test Error")
#plt.legend(loc='upper left')
#plt.show()