## Load packages and data

In [165]:
# Loading the packages to be used
from __future__ import print_function  # Python 2 and 3
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [210]:
# Import data; for linear regression, either import encoded datasets or dummify and then oneHotEncode in python.
# DATA_DIR is the single place to edit when the CSV files live somewhere else. Keep the trailing
# slash: file names are appended to it by plain string concatenation.
DATA_DIR = '/Users/kathrynbryant/Dropbox/NYCDSA/blackFlamingo/Data/'

# Files with the 'encoded_' prefix
houses_train = pd.read_csv(DATA_DIR + 'encoded_houses_train.csv')
houses_test = pd.read_csv(DATA_DIR + 'encoded_houses_test.csv')
encoded_private_train = pd.read_csv(DATA_DIR + 'encoded_private_train.csv')
encoded_private_test = pd.read_csv(DATA_DIR + 'encoded_private_test.csv')

# Files without the 'encoded_' prefix
private_train = pd.read_csv(DATA_DIR + 'private_train.csv')
private_test = pd.read_csv(DATA_DIR + 'private_test.csv')

## Clean data for pandas (if needed)

In [19]:
# Save the 'Id' column
#houses_train_ID = houses_train['Id']
#houses_test_ID = houses_test['Id']
#private_train_ID = private_train['Id']
#private_test_ID = private_test['Id']

# Now drop the 'Id' column since we cannot use it as a feature to train our model.
#houses_train.drop("Id", axis = 1, inplace = True)
#houses_test.drop("Id", axis = 1, inplace = True)
#private_train.drop("Id", axis = 1, inplace = True)
#private_test.drop("Id", axis = 1, inplace = True)

## Define predictor and response objects

In [243]:
# Define data frames of predictors and the BoxCox-transformed response variables.
# The "T" in a name indicates a transformed response; stats.boxcox returns an ndarray
# when lmbda is given.

# BoxCox exponent applied to SalePrice in this cell. Note: this lambda was found in R.
BOXCOX_LAMBDA = 0.22

# Predictors are every column except the response.
X_private_train = encoded_private_train.drop('SalePrice', axis = 1)

# Lambda as estimated by Python, kept so it can be compared with (or used instead of) the R value.
# stats.boxcox only accepts 1-D data, so pass the SalePrice Series rather than a one-column
# DataFrame; element [1] of the returned tuple is the fitted lambda.
private_lambda = stats.boxcox(private_train['SalePrice'])[1]

# Transformed response for the private training set, using the fixed lambda above.
# To use the Python-found value instead, pass lmbda = private_lambda.
yT_private_train = stats.boxcox(encoded_private_train['SalePrice'], lmbda = BOXCOX_LAMBDA)

X_private_test = encoded_private_test.drop('SalePrice', axis = 1)

X_houses_train = houses_train.drop('SalePrice', axis = 1)
yT_houses_train = stats.boxcox(houses_train['SalePrice'], lmbda = BOXCOX_LAMBDA)

## Defining model object and fitting to private training set

In [244]:
#### Build the ordinary-least-squares estimator `ols`
# fit_intercept = True is spelled out for readability; it is the estimator's default,
# so the model still includes an intercept term exactly as before.
from sklearn import linear_model
ols = linear_model.LinearRegression(fit_intercept = True)

(Recall from lecture code) 
Some attributes and methods we will use for linear regression:

- `coef_`: Estimated coefficients ($\hat{\beta}_1$,...) for the linear regression problem.
- `intercept_`: Independent term ($\hat{\beta}_0$) in the linear model.
- `fit(X, y)`: Fit linear model.
- `predict(X)`: Predict using the linear model.
- `score(X, y)`: Returns the coefficient of determination $R^2$ of the prediction.

In [237]:
# If data isn't already pre-split:
#try:  # train_test_split was moved in 0.18.0
#    from sklearn.model_selection import train_test_split
#except:  # Following import works through 0.19 but outputs a warning in 0.18
#    from sklearn.cross_validation import train_test_split
#X_private_train, X_private_test, yT_private_train, yT_private_test = train_test_split(X_houses_train, yT_houses_train, test_size=0.2, random_state=11)



In [245]:
# Train the OLS model: private-train predictors against the BoxCox-transformed SalePrice.
# The fitted estimator is the cell's last expression, so the notebook displays its repr.
ols.fit(X = X_private_train, y = yT_private_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [246]:
# In-sample predictions on the BoxCox scale (same rows the model was fitted on)
predictedT = ols.predict(X = X_private_train)

In [247]:
# Inspect the BoxCox-scale training predictions (NumPy abbreviates the printed array)
predictedT

array([ 59.82547362,  56.77666655,  61.74734862, ...,  76.43972091,
        54.83332822,  59.32353102])

In [248]:
# Undo the BoxCox transform to get predicted SalePrices:
#   y = (lambda * yT + 1) ** (1 / lambda)
# To use the python-found lambda instead, set boxcox_lambda = private_lambda.
boxcox_lambda = 0.22
predicted = np.power(predictedT * boxcox_lambda + 1, 1 / boxcox_lambda)

In [249]:
# Training RMSE on the log scale: compare log(actual SalePrice) with log(un-BoxCoxed prediction).
# Both operands are 1-D arrays of the same length, so they subtract element by element.
log_actual = np.log(encoded_private_train['SalePrice'].values)
log_predicted = np.log(predicted)
train_rmse = np.sqrt(np.mean((log_actual - log_predicted) ** 2))
print("RMSE: {0}".format(train_rmse))

RMSE: 0.08528780403197979


## Testing model on private test set

In [250]:
# Predict BoxCoxed response outputs for private test.
# The fitted coefficients are tied to the training column layout, so put the test predictors
# into exactly that layout first: same columns, same order, with any column the test set
# lacks filled with 0. This is a no-op when the two frames already share identical columns.
# NOTE(review): whether a column mismatch is what makes the test predictions explode is
# not confirmed; compare X_private_test.columns with X_private_train.columns.
X_private_test_aligned = X_private_test.reindex(columns = X_private_train.columns, fill_value = 0)
test_predictedT = ols.predict(X_private_test_aligned)

In [257]:
# Show the training-set predictions again (same variable as displayed earlier)
predictedT

array([ 59.82547362,  56.77666655,  61.74734862, ...,  76.43972091,
        54.83332822,  59.32353102])

In [258]:
# Inspect the BoxCox-scale predictions for the private test set.
# NOTE(review): the recorded output shows magnitudes of roughly 1e10 to 1e13 with mixed signs,
# while the training predictions displayed above are roughly 55 to 76. These test predictions
# look invalid; investigate before trusting any metric computed from them.
test_predictedT

array([  1.92271606e+13,  -3.45902668e+12,  -6.80319881e+11,
         9.10965126e+12,   2.66662764e+12,  -2.03594596e+12,
        -2.20468984e+12,  -3.72171007e+13,   4.55427040e+12,
        -2.43498030e+11,  -8.06737477e+11,   4.36891369e+12,
        -5.44458794e+12,  -2.51835162e+12,  -1.70097379e+12,
         5.51440318e+11,  -9.55220005e+10,   5.82586864e+12,
        -1.77529354e+13,   5.23311651e+12,   2.44177630e+12,
         1.21111481e+13,  -6.10416260e+12,  -1.79311568e+13,
        -1.27842670e+12,  -1.00501678e+12,  -1.55930605e+12,
        -7.57591411e+12,   5.66014302e+11,   1.57869127e+13,
        -9.99266475e+12,   1.18764209e+13,   1.56641631e+13,
         2.69313960e+12,  -3.77079965e+12,  -1.47517999e+12,
         1.22344095e+13,   4.59270973e+13,   8.36380417e+12,
         1.20909881e+13,  -3.64923565e+12,   6.55611662e+11,
        -5.21646774e+12,   5.07229560e+12,   1.22524805e+12,
        -7.22704239e+10,   4.87753988e+12,   2.71338903e+12,
         9.96629594e+12,

In [130]:
# Undo the BoxCox transform to get predicted SalePrices for the private test set:
#   y = (lambda * yT + 1) ** (1 / lambda)
# A negative base with this fractional exponent makes np.power return NaN,
# so any prediction with lambda * yT + 1 < 0 becomes NaN here.
boxcox_lambda = 0.22
test_predicted = np.power(test_predictedT * boxcox_lambda + 1, 1 / boxcox_lambda)

  


In [125]:
# Test RMSE on the log scale: compare log(actual SalePrice) with log(un-BoxCoxed prediction).
# Keep the actual values 1-D. Reshaping them to (n, 1) against the (n,) predictions would
# broadcast the subtraction to an (n, n) matrix and average over every actual/prediction
# pair instead of the n matching ones.
# NOTE(review): the predictions come from encoded_private_test rows while the actual values
# are read from private_test; confirm both files list the same houses in the same order.
log_actual_test = np.log(private_test['SalePrice'].values)
log_predicted_test = np.log(test_predicted)
print("RMSE: {0}".format(np.sqrt(np.mean((log_actual_test - log_predicted_test) ** 2))))

RMSE: nan
