## Load packages and data

In [31]:
# Loading the packages to be used
from __future__ import print_function  # Python 2 and 3
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [32]:
# Import data; for linear regression, either import encoded datasets or dummify and then oneHotEncode in python
houses_train = pd.read_csv('/Users/kathrynbryant/Dropbox/NYCDSA/blackFlamingo/Data/encoded_houses_train.csv')
houses_test = pd.read_csv('/Users/kathrynbryant/Dropbox/NYCDSA/blackFlamingo/Data/encoded_houses_test.csv')
encoded_private_train = pd.read_csv('/Users/kathrynbryant/Dropbox/NYCDSA/blackFlamingo/Data/encoded_private_train.csv')
encoded_private_test = pd.read_csv('/Users/kathrynbryant/Dropbox/NYCDSA/blackFlamingo/Data/encoded_private_test.csv')

## Define predictor and response objects

In [33]:
# Define data frame of predictors and BoxCox response variable
X_private_train = encoded_private_train.drop('SalePrice', axis = 1)
#yT_private_train, private_lambda = stats.boxcox(encoded_private_train[['SalePrice']]) # T indicates transformation; ndarray 
yT_private_train = stats.boxcox(encoded_private_train['SalePrice'], lmbda = 0.22)
# Note: The above lambda was found in R; to let Python find it, use the commented-out line above

X_private_test = encoded_private_test.drop('SalePrice', axis = 1)

X_houses_train = houses_train.drop('SalePrice', axis = 1)
yT_houses_train = stats.boxcox(houses_train['SalePrice'], lmbda = 0.22) # T indicates transformation; ndarray output

## Defining model object and fitting to private training set

In [34]:
#### Define a linear regression object ols
from sklearn import linear_model
ols = linear_model.LinearRegression()

(Recall from lecture code) 
Some attributes and methods we will use for linear regression:

- `coef_`: Estimated coefficients ($\hat{\beta}_1$,...) for the linear regression problem.
- `intercept_`: Independent term ($\hat{\beta}_0$) in the linear model.
- `fit(X, y)`: Fit linear model.
- `predict(X)`: Predict using the linear model
- `score(X, y)`: Returns the coefficient of determination $R^2$ of the prediction.

In [35]:
# If data isn't already pre-split:
#try:  # train_test_split was moved in 0.18.0
#    from sklearn.model_selection import train_test_split
#except:  # Following import works through 0.19 but outputs a warning in 0.18
#    from sklearn.cross_validation import train_test_split
#X_private_train, X_private_test, yT_private_train, yT_private_test = train_test_split(X_houses_train, yT_houses_train, test_size=0.2, random_state=11)



In [36]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=200)
#pca.fit(X_private_train)

In [37]:
# Fit a linear model to our private_train data using BoxCoxed response
#ols.fit(pca.transform(X_private_train), yT_private_train)
ols.fit(X_private_train, yT_private_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [25]:
# Predict BoxCoxed response outputs
#predictedT = ols.predict(pca.transform(X_private_train))
predictedT = ols.predict(X_private_train)

In [26]:
predictedT

array([ 59.8564746 ,  56.31624283,  62.13400496, ...,  76.31586053,
        55.21181148,  60.26475354])

In [27]:
# Inverse BoxCox previous prediction to get predicted SalePrices
#predicted = np.power((predictedT * private_lambda) + 1, 1 / private_lambda) 
predicted = np.power((predictedT * 0.22) + 1, 1/(0.22)) # Use line above to use python-found lambda value

In [28]:
# Compute RMSE and R^2 using log of un-BoxCoxed predictions; must use .reshape(-1,1) to match dimensions of object
# containing actual y-values with object containing predicted y-values
print("RMSE: {0}".format(np.sqrt(np.mean((np.log(encoded_private_train['SalePrice'].values) - np.log(predicted)) ** 2))))

RMSE: 0.07754057269756197


## Testing model on private test set

In [29]:
# Predict BoxCoxed response outputs for private test
#test_predictedT = ols.predict(pca.transform(X_private_test))
test_predictedT = ols.predict(X_private_test)

In [30]:
test_predictedT

array([ -1.06208589e+13,   3.82687370e+12,   6.60602420e+12,
         1.30198229e+13,   9.66227340e+11,   3.57795320e+12,
        -7.54319940e+10,  -3.72058869e+13,   6.17225527e+11,
         5.97502827e+12,   1.41381799e+12,   2.78647434e+12,
        -6.11047640e+12,   1.43099247e+11,  -8.92298491e+08,
        -8.91561055e+12,  -3.65074617e+11,   1.14166334e+12,
        -1.88388090e+13,   8.55785497e+12,   4.25001905e+12,
         5.09547203e+12,   1.32096170e+12,  -4.70682753e+12,
         3.16856131e+12,   1.55497234e+11,   3.01123293e+12,
         3.34752950e+09,  -1.07985193e+12,   1.04282750e+13,
         1.12330863e+12,   1.05638246e+11,   5.23283580e+12,
         7.21112704e+11,   6.74026846e+12,   4.74470991e+12,
         2.20834237e+13,  -2.92711823e+13,   3.38552506e+12,
         4.63566265e+12,  -8.67114284e+12,  -2.68453407e+12,
        -1.16501912e+13,  -1.06355911e+13,  -2.23868589e+13,
        -7.71493590e+11,   2.08721445e+12,   1.58415912e+11,
         1.34876513e+12,

In [14]:
# Inverse BoxCox previous prediction to get predicted SalePrices
test_predicted = np.power((test_predictedT * 0.22) + 1, 1/(0.22))

  


In [18]:
# Compute RMSE and R^2 using log of un-BoxCoxed predictions; must use .reshape(-1,1) to match dimensions of object
# containing actual y-values with object containing predicted y-values
print("RMSE: {0}".format(np.sqrt(np.mean((np.log(encoded_private_test['SalePrice'].values.reshape(-1,1)) - np.log(test_predicted)) ** 2))))

RMSE: nan


  This is separate from the ipykernel package so we can avoid doing imports until
