In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats

In [23]:
# Read the house-sales CSV into a DataFrame and preview its first rows.
DATA_PATH = "../../ML Data & Script/kc_house_data.csv"
sales = pd.read_csv(DATA_PATH)
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [24]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

target = sales['price']
X_train, X_test, y_train, y_test = train_test_split(
    sales,
    target,
    test_size=0.2,
    random_state=0,
)

In [25]:
# Baseline linear regression on three raw columns, fitted with scikit-learn.
from sklearn import metrics
from sklearn.linear_model import LinearRegression

features = ['sqft_living', 'bedrooms', 'bathrooms']

model_one = LinearRegression()
model_one.fit(X_train[features], y_train)
print("cofficents: ", model_one.coef_)
print("intercept: ", model_one.intercept_)

# R^2 scored on the same rows the model was fitted on.
print("RSquared(on training data)", model_one.score(X_train[features], y_train))

# MSE is RSS / n, so multiplying by the number of rows recovers the RSS.
train_mse = metrics.mean_squared_error(y_train, model_one.predict(X_train[features]))
print("RSS/SSE: ", y_train.shape[0] * train_mse)

cofficents:  [   313.17055038 -56754.66651422   6887.71910816]
intercept:  67512.01513813145
RSquared(on training data) 0.5090198210347912
RSS/SSE:  1177454678249841.2


In [26]:
# Fit the same three-feature regression with statsmodels to get the full
# inference table (standard errors, t statistics, p-values).
import statsmodels.formula.api as smf
import statsmodels.api as sm

# An intercept is not included by default and should be added by the user.
# NOTE: this rebinds X_train to just the constant column plus `features`.
X_train = sm.add_constant(X_train[features])

# The array-based OLS class belongs to statsmodels.api; statsmodels.formula.api
# is the formula interface (lowercase `ols`) and recent releases do not expose
# an `OLS` attribute there, so `smf.OLS` fails with AttributeError.
model_two = sm.OLS(y_train, X_train).fit()
print(model_two.summary())
print("\n pvalues \n")
print(model_two.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.509
Model:                            OLS   Adj. R-squared:                  0.509
Method:                 Least Squares   F-statistic:                     5974.
Date:                Mon, 17 Dec 2018   Prob (F-statistic):               0.00
Time:                        02:47:22   Log-Likelihood:            -2.4018e+05
No. Observations:               17290   AIC:                         4.804e+05
Df Residuals:                   17286   BIC:                         4.804e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const        6.751e+04   7739.284      8.723      

In [27]:
# Predict prices for the training rows with the scikit-learn model
# and show the first predicted value.
example_predictions = model_one.predict(
    X_train[features]
)
print(example_predictions[0])

395813.4988028938


In [28]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares of ``model`` on ``data``.

    Parameters
    ----------
    model : object exposing ``predict(data)``.
    data : inputs handed unchanged to ``model.predict``.
    outcome : observed values; the predictions are subtracted from it
        element-wise, so it must support ``-``, ``*`` and ``.sum()`` on
        the result.

    Returns
    -------
    The sum of the squared differences between ``outcome`` and the
    model's predictions.
    """
    residuals = outcome - model.predict(data)
    return (residuals * residuals).sum()

In [29]:
# Residual sum of squares on the held-out rows, computed two ways.
test_features = X_test[features]
rss_example_test = get_residual_sum_of_squares(model_one, test_features, y_test)
print("RSS(mannual): ", rss_example_test)

# Cross-check with scikit-learn: MSE = RSS / n, hence RSS = MSE * n.
test_mse = metrics.mean_squared_error(y_test, model_one.predict(test_features))
print("RSS(sklearn): ", y_test.shape[0] * test_mse)

# R^2 on the held-out rows.
print("Rsquared on test data", model_one.score(test_features, y_test))

RSS(mannual):  259213572106085.38
RSS(sklearn):  259213572106085.38
Rsquared on test data 0.49580096548774166


In [30]:
# Engineer four extra columns on the full frame, then redo the split so the
# train and test frames both contain them.
sales['bedrooms_squared'] = sales['bedrooms'].mul(sales['bedrooms'])
sales['bed_bath_rooms'] = sales['bedrooms'].mul(sales['bathrooms'])
sales['log_sqft_living'] = np.log(sales['sqft_living'])
sales['lat_plus_long'] = sales['lat'].add(sales['long'])

# Same test_size and random_state as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    sales,
    target,
    test_size=0.2,
    random_state=0,
)

In [31]:
# Mean (arithmetic average) of each of the 4 new features on the TEST data
# (round to 2 digits when reporting).
for new_feature in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'):
    print(X_test[new_feature].mean())

12.210501966227158
7.447721489706223
7.550239467794731
-74.65426069858911


* Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this feature will mostly affect houses with many bedrooms.
* bedrooms times bathrooms gives what's called an "interaction" feature. It is large when *both* of them are large.
* Taking the log of square feet has the effect of bringing large values closer together and spreading out small values.
* Adding latitude to longitude is totally nonsensical, but we will do it anyway (you'll see why).

In [32]:
# Learning multiple models: three nested feature sets, each extending the previous one.
import statsmodels.api as sm
import statsmodels.formula.api as smf

features1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
features2 = [*features1, 'bed_bath_rooms']
features3 = [*features2, 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

# Attach the constant column once so the statsmodels fits can select it as 'const'.
X_train = sm.add_constant(X_train)

In [33]:
# statsmodels fit of the features1 specification (plus the 'const' column).
# Use sm.OLS: the array-based OLS class belongs to statsmodels.api, while
# statsmodels.formula.api is the formula interface (lowercase `ols`) and
# recent releases do not expose `OLS` there.
model_three = sm.OLS(y_train, X_train[features1 + ['const']]).fit()
print(model_three.summary())
print("\n pvalues \n")
print(model_three.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.591
Model:                            OLS   Adj. R-squared:                  0.591
Method:                 Least Squares   F-statistic:                     5004.
Date:                Mon, 17 Dec 2018   Prob (F-statistic):               0.00
Time:                        02:47:23   Log-Likelihood:            -2.3859e+05
No. Observations:               17290   AIC:                         4.772e+05
Df Residuals:                   17284   BIC:                         4.772e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
sqft_living   312.9420      3.199     97.821      

In [34]:
# scikit-learn fit on the features1 columns; rebinds model_three to the sklearn estimator.
train_features1 = X_train[features1]
model_three = LinearRegression().fit(train_features1, y_train)
print("cofficents: ", model_three.coef_)
print("intercept: ", model_three.intercept_)
# R^2 on the training rows.
print("RSquared(on training data)", model_three.score(train_features1, y_train))

cofficents:  [ 3.12942010e+02 -5.30962691e+04  1.47770428e+04  6.53983343e+05
 -3.25707336e+05]
intercept:  -70870846.23890634
RSquared(on training data) 0.591420549946847


In [35]:
# model_four: statsmodels fit of the features2 specification (plus the 'const' column).
# Use sm.OLS: the array-based OLS class belongs to statsmodels.api, while
# statsmodels.formula.api is the formula interface (lowercase `ols`) and
# recent releases do not expose `OLS` there.
model_four = sm.OLS(y_train, X_train[features2 + ['const']]).fit()
print(model_four.summary())
print("\n pvalues \n")
print(model_four.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.595
Model:                            OLS   Adj. R-squared:                  0.595
Method:                 Least Squares   F-statistic:                     4235.
Date:                Mon, 17 Dec 2018   Prob (F-statistic):               0.00
Time:                        02:47:23   Log-Likelihood:            -2.3851e+05
No. Observations:               17290   AIC:                         4.770e+05
Df Residuals:                   17283   BIC:                         4.771e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
sqft_living      306.8196      3.221     95.

In [36]:
# scikit-learn fit on the features2 columns; rebinds model_four to the sklearn estimator.
train_features2 = X_train[features2]
model_four = LinearRegression().fit(train_features2, y_train)
print("cofficents: ", model_four.coef_)
print("intercept: ", model_four.intercept_)
# R^2 on the training rows.
print("RSquared(on training data)", model_four.score(train_features2, y_train))

cofficents:  [ 3.06819573e+02 -1.04604718e+05 -7.01815289e+04  6.50590952e+05
 -3.09965751e+05  2.49441497e+04]
intercept:  -68606820.39569089
RSquared(on training data) 0.5951919223498363


In [37]:
# model_five: statsmodels fit of the features3 specification (plus the 'const' column).
# Use sm.OLS: the array-based OLS class belongs to statsmodels.api, while
# statsmodels.formula.api is the formula interface (lowercase `ols`) and
# recent releases do not expose `OLS` there.
model_five = sm.OLS(y_train, X_train[features3 + ['const']]).fit()
print(model_five.summary())
print("\n pvalues \n")
print(model_five.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.619
Model:                            OLS   Adj. R-squared:                  0.619
Method:                 Least Squares   F-statistic:                     3510.
Date:                Mon, 17 Dec 2018   Prob (F-statistic):               0.00
Time:                        02:47:23   Log-Likelihood:            -2.3798e+05
No. Observations:               17290   AIC:                         4.760e+05
Df Residuals:                   17281   BIC:                         4.761e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
sqft_living        537.8081      7.799  

In [38]:
# scikit-learn fit on the features3 columns; rebinds model_five to the sklearn estimator.
train_features3 = X_train[features3]
model_five = LinearRegression().fit(train_features3, y_train)
print("cofficents: ", model_five.coef_)
print("intercept: ", model_five.intercept_)
# R^2 on the training rows.
print("RSquared(on training data)", model_five.score(train_features3, y_train))

cofficents:  [ 5.37808086e+02  2.78047910e+03  1.01363766e+05  5.30798406e+05
 -4.09655435e+05 -1.81822552e+04  7.24579939e+02 -5.71030023e+05
  1.21142971e+05]
intercept:  -62628450.27224184
RSquared(on training data) 0.6190207246118422


In [39]:
# Training-set RSS for the three models (MSE * n recovers the RSS).
n_train = y_train.shape[0]
rss_model_three = n_train * metrics.mean_squared_error(y_train, model_three.predict(X_train[features1]))
rss_model_four = n_train * metrics.mean_squared_error(y_train, model_four.predict(X_train[features2]))
rss_model_five = n_train * metrics.mean_squared_error(y_train, model_five.predict(X_train[features3]))

for rss_value in (rss_model_three, rss_model_four, rss_model_five):
    print(rss_value)


979843597588329.8
970799199729578.9
913653644974959.5


In [42]:
# Training-set R^2 for the three models.
rsquared_model_three = model_three.score(X_train[features1], y_train)
rsquared_model_four = model_four.score(X_train[features2], y_train)
rsquared_model_five = model_five.score(X_train[features3], y_train)

for rsquared_value in (rsquared_model_three, rsquared_model_four, rsquared_model_five):
    print(rsquared_value)

0.591420549946847
0.5951919223498363
0.6190207246118422


In [40]:
# Test-set RSS for the three models (MSE * n recovers the RSS).
n_test = y_test.shape[0]
rss_model_three = n_test * metrics.mean_squared_error(y_test, model_three.predict(X_test[features1]))
rss_model_four = n_test * metrics.mean_squared_error(y_test, model_four.predict(X_test[features2]))
rss_model_five = n_test * metrics.mean_squared_error(y_test, model_five.predict(X_test[features3]))

for rss_value in (rss_model_three, rss_model_four, rss_model_five):
    print(rss_value)

213487129319106.8
210778544168945.2
203972051917617.3


In [43]:
# Test-set R^2 for the three models.
rsquared_model_three = model_three.score(X_test[features1], y_test)
rsquared_model_four = model_four.score(X_test[features2], y_test)
rsquared_model_five = model_five.score(X_test[features3], y_test)

for rsquared_value in (rsquared_model_three, rsquared_model_four, rsquared_model_five):
    print(rsquared_value)

0.5847439483630328
0.5900124457125729
0.6032518250924851


#### Training Data

* RSS(lowest to highest): model_five, model_four, model_three
* R2(highest to lowest) : model_five, model_four, model_three

#### Testing Data   

* RSS(lowest to highest): model_five, model_four, model_three
* R2(highest to lowest) : model_five, model_four, model_three