## Load packages and data

In [2]:
# Loading the packages to be used
from __future__ import print_function  # Python 2 and 3
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [3]:
# Allow up to 400 columns when a wide frame is displayed; the row limit stays at the pandas default.
pd.options.display.max_columns = 400

In [4]:
# Load the already-encoded CSVs used for linear regression. (The alternative is to
# dummify / one-hot encode the raw data in Python instead of importing encoded files.)
housetrain_linear = pd.read_csv('../Data/housetrain_linear.csv')
housetest_linear = pd.read_csv('../Data/housetest_linear.csv')
privtrain_linear = pd.read_csv('../Data/privtrain_linear.csv')
privtest_linear = pd.read_csv('../Data/privtest_linear.csv')
# Backward-compatible alias: other cells refer to this frame by the misspelled name.
privteset_linear = privtest_linear

In [5]:
# Report (rows, columns) for each loaded frame, in load order.
for frame in (housetrain_linear, housetest_linear, privtrain_linear, privteset_linear):
    print(frame.shape)

(1460, 230)
(1459, 230)
(1168, 228)
(292, 228)


## Define predictor and response objects

In [4]:
# Build the predictor frames and the Box-Cox-transformed responses ("T" = transformed).
# stats.boxcox is given the 1-D SalePrice column (not a one-column DataFrame): called
# without lmbda it estimates lambda itself and returns (transformed ndarray, lambda).
X_private_train = privtrain_linear.drop('SalePrice', axis = 1)
yT_private_train, private_lambda = stats.boxcox(privtrain_linear['SalePrice'])
# To use a fixed lambda instead (0.22 was found in R), pass it explicitly; boxcox then
# returns only the transformed array, so private_lambda must be set by hand:
#yT_private_train = stats.boxcox(privtrain_linear['SalePrice'], lmbda = 0.22); private_lambda = 0.22

# The private test frame is available under the name `privteset_linear`.
X_private_test = privteset_linear.drop('SalePrice', axis = 1)

X_houses_train = housetrain_linear.drop('SalePrice', axis = 1)
yT_houses_train, houses_lambda = stats.boxcox(housetrain_linear['SalePrice'])

X_houses_test = housetest_linear.drop('SalePrice', axis = 1)

In [5]:
def unBoxCox(column, lambda_):
    """Invert a Box-Cox transform.

    The forward transform is ``(y**lambda_ - 1) / lambda_`` for ``lambda_ != 0``
    and ``log(y)`` for ``lambda_ == 0``; this function maps transformed values
    back to the original scale.

    Parameters
    ----------
    column : array-like supporting NumPy arithmetic (ndarray, Series, scalar)
        Box-Cox-transformed values.
    lambda_ : float or int
        The lambda used for the forward transform.

    Returns
    -------
    Values on the original scale, with the same shape as ``column``.
    """
    # lambda_ == 0 is the log transform, so the inverse is exp (the power
    # formula below would divide by zero).
    if lambda_ == 0:
        return np.exp(column)
    # 1.0 / lambda_ forces true division even for an integer lambda under Python 2.
    unboxcoxed_column = np.power((column * lambda_) + 1, 1.0 / lambda_)
    return unboxcoxed_column

## Defining model object and fitting to private training set

In [6]:
# Create an unfitted linear regression estimator and bind it to `ols`;
# the cells below call its fit() and predict() methods.
from sklearn import linear_model
ols = linear_model.LinearRegression()

(Recall from lecture code) 
Some attributes and methods we will use for linear regression:

- `coef_`: Estimated coefficients ($\hat{\beta}_1$,...) for the linear regression problem.
- `intercept_`: Independent term ($\hat{\beta}_0$) in the linear model.
- `fit(X, y)`: Fit linear model.
- `predict(X)`: Predict using the linear model.
- `score(X, y)`: Returns the coefficient of determination $R^2$ of the prediction.

In [35]:
# If data isn't already pre-split:
#try:  # train_test_split was moved in 0.18.0
#    from sklearn.model_selection import train_test_split
#except:  # Following import works through 0.19 but outputs a warning in 0.18
#    from sklearn.cross_validation import train_test_split
#X_private_train, X_private_test, yT_private_train, yT_private_test = train_test_split(X_houses_train, yT_houses_train, test_size=0.2, random_state=11)



In [7]:
# Train the linear model on the private training predictors against the Box-Cox-transformed response
ols.fit(X=X_private_train, y=yT_private_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
# In-sample predictions; these are still on the Box-Cox-transformed scale
predictedT = ols.predict(X=X_private_train)

In [9]:
# Undo the Box-Cox transform so the training predictions are SalePrice values again
predicted = unBoxCox(column=predictedT, lambda_=private_lambda)

In [10]:
# Training error on the log scale: root-mean-squared difference between log(actual SalePrice)
# and log(back-transformed prediction). Only this RMSE is reported here (no R^2).
log_actual = np.log(privtrain_linear['SalePrice'].values)
log_predicted = np.log(predicted.reshape(-1,))  # flatten so both operands are 1-D
print("RMSE: {0}".format(np.sqrt(np.mean((log_actual - log_predicted) ** 2))))

RMSE: 0.09955213126465351


## Testing model on private test set

In [11]:
# Held-out predictions for the private test predictors, on the Box-Cox-transformed scale
test_predictedT = ols.predict(X=X_private_test)

In [12]:
# Undo the Box-Cox transform (same lambda as the training response) to get predicted SalePrices
test_predicted = unBoxCox(column=test_predictedT, lambda_=private_lambda)

In [13]:
# Held-out error on the log scale: root-mean-squared difference between log(actual SalePrice)
# and log(back-transformed prediction). The predictions are flattened with .reshape(-1,) so
# they line up with the 1-D array of actual values.
test_log_actual = np.log(privteset_linear['SalePrice'].values)
test_log_predicted = np.log(test_predicted.reshape(-1,))
print("RMSE: {0}".format(np.sqrt(np.mean((test_log_actual - test_log_predicted) ** 2))))

RMSE: 0.1413714656657429


## Fitting on the full house training set

In [14]:
# Refit a fresh linear model on the houses training data and report its in-sample
# log-scale RMSE. Note: this rebinds `ols`, `predictedT` and `predicted`.
ols = linear_model.LinearRegression().fit(X_houses_train, yT_houses_train)  # fit() returns the estimator
predictedT = ols.predict(X_houses_train)
predicted = unBoxCox(predictedT, houses_lambda)
houses_log_residuals = (
    np.log(housetrain_linear['SalePrice'].values) - np.log(predicted.reshape(-1,))
)
print("RMSE: {0}".format(np.sqrt(np.mean(houses_log_residuals ** 2))))

RMSE: 0.10379687561900319
