In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats
from sklearn.model_selection import train_test_split


In [2]:
# Read the kc_house_data CSV into a DataFrame, then preview its first five rows.
sales = pd.read_csv(filepath_or_buffer="../../ML Data & Script/kc_house_data.csv")
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Split data into training and testing (80% / 20%, fixed seed for repeatability).
# NOTE: the whole `sales` frame is split, so X_train / X_test still contain the
# `price` column; models below must always be fitted on an explicit feature list.
target = sales['price']
X_train, X_test, y_train, y_test = train_test_split(sales, target, test_size=0.2, random_state=10)

features = ['sqft_living', 'bedrooms', 'bathrooms']
# use sklearn
from sklearn.linear_model import LinearRegression
model_one = LinearRegression()
model_one.fit(X_train[features], y_train)
print("cofficents: ", model_one.coef_)
print("intercept: ", model_one.intercept_)
# calculate r2
print("RSquared(on training data)", model_one.score(X_train[features], y_train))

from sklearn import metrics
# calculate RSS (MSE is RSS/n, so RSS = n * MSE).
# mean_squared_error is documented as (y_true, y_pred): observed values go first.
print("RSS/SSE: ", y_train.shape[0] * metrics.mean_squared_error(y_train, model_one.predict(X_train[features])))

cofficents:  [   306.15090562 -57658.90103459   7913.53847651]
intercept:  81100.95967753674
RSquared(on training data) 0.5032799225389821
RSS/SSE:  1133191715090008.2


In [4]:
# use statsmodels
import statsmodels.api as sm
# An intercept is not included by default and should be added by the user.
# X_train is rebound here to just the constant column plus the `features` columns.
X_train = sm.add_constant(X_train[features])
# OLS comes from statsmodels.api, which is the array/DataFrame interface.
# statsmodels.formula.api is for the lowercase formula-string constructors (e.g. `ols`);
# `smf.OLS` raises AttributeError on newer statsmodels releases that no longer re-export it.
model_two = sm.OLS(y_train, X_train).fit()
print(model_two.summary())
print("\n pvalues \n")
print(model_two.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.503
Model:                            OLS   Adj. R-squared:                  0.503
Method:                 Least Squares   F-statistic:                     5838.
Date:                Sat, 22 Dec 2018   Prob (F-statistic):               0.00
Time:                        11:53:42   Log-Likelihood:            -2.3985e+05
No. Observations:               17290   AIC:                         4.797e+05
Df Residuals:                   17286   BIC:                         4.797e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const         8.11e+04   7641.093     10.614      

In [5]:
# Predict a price for every training row with the sklearn model, then show the first prediction.
example_predictions = model_one.predict(X_train.loc[:, features])
print(example_predictions[0])

258417.038313439


In [6]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares (RSS) of ``model`` on ``data``.

    Parameters
    ----------
    model : object exposing ``predict(data)``.
    data : inputs forwarded unchanged to ``model.predict``.
    outcome : observed values; must support ``-`` against the predictions,
        and the difference must support ``*`` and ``.sum()``.

    Returns
    -------
    The sum of the squared differences between ``outcome`` and the predictions.
    """
    residuals = outcome - model.predict(data)
    squared_residuals = residuals * residuals
    return squared_residuals.sum()

In [7]:
# RSS on test data, computed with the hand-written helper
rss_example_test = get_residual_sum_of_squares(model_one, X_test[features], y_test)
print("RSS on test data(mannual): ", rss_example_test)
# MSE = RSS/n so RSS = MSE * n.
# mean_squared_error is documented as (y_true, y_pred): observed values go first.
print("RSS on test data(sklearn): ", y_test.shape[0] * metrics.mean_squared_error(y_test, model_one.predict(X_test[features])))
# r2 on test data
print("Rsquared on test data", model_one.score(X_test[features], y_test))

RSS on test data(mannual):  303290122873171.5
RSS on test data(sklearn):  303290122873171.5
Rsquared on test data 0.519780761889068


In [8]:
# Derive four extra columns on `sales` (in place), then redo the 80/20 split with
# the same random_state so the new columns are present in X_train / X_test.
sales['bedrooms_squared'] = sales['bedrooms'].mul(sales['bedrooms'])
sales['bed_bath_rooms'] = sales['bedrooms'].mul(sales['bathrooms'])
sales['log_sqft_living'] = np.log(sales['sqft_living'])
sales['lat_plus_long'] = sales['lat'].add(sales['long'])

X_train, X_test, y_train, y_test = train_test_split(
    sales, target, test_size=0.2, random_state=10
)

In [9]:
# Mean (arithmetic average) of each of the 4 new features on the TEST split,
# printed one value per line (round to 2 digits when reporting).
print(
    *(
        X_test[column_name].mean()
        for column_name in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long')
    ),
    sep="\n",
)

12.133703446680546
7.463162155910248
7.5486804489573025
-74.65885033541488


* Squaring bedrooms increases the separation between houses with few bedrooms (e.g. 1) and houses with many bedrooms (e.g. 4), since 1^2 = 1 but 4^2 = 16. Consequently, this feature will mostly affect houses with many bedrooms.
* bedrooms times bathrooms gives what's called an "interaction" feature. It is large when *both* of them are large.
* Taking the log of square feet has the effect of bringing large values closer together and spreading out small values.
* Adding latitude to longitude is totally nonsensical, but we will do it anyway (you'll see why).

In [10]:
# Learning multiple models: three feature sets, each one extending the previous.
import statsmodels.formula.api as smf
import statsmodels.api as sm

features1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
features2 = [*features1, 'bed_bath_rooms']
features3 = [*features2, 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']
# Add the constant column to the training frame; the statsmodels cells select it as 'const'.
X_train = sm.add_constant(X_train)

In [11]:
# statsmodels: OLS on features1 plus the explicit intercept column.
# OLS comes from statsmodels.api (array/DataFrame interface); `smf.OLS` raises
# AttributeError on newer statsmodels releases that no longer re-export it.
model_three = sm.OLS(y_train, X_train[features1 + ['const']]).fit()
print(model_three.summary())
print("\n pvalues \n")
print(model_three.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.586
Model:                            OLS   Adj. R-squared:                  0.586
Method:                 Least Squares   F-statistic:                     4897.
Date:                Sat, 22 Dec 2018   Prob (F-statistic):               0.00
Time:                        11:53:43   Log-Likelihood:            -2.3827e+05
No. Observations:               17290   AIC:                         4.765e+05
Df Residuals:                   17284   BIC:                         4.766e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
sqft_living   304.2173      3.173     95.877      

In [12]:
# scikit_learn: fit the features1 model with LinearRegression (rebinds model_three)
model_three = LinearRegression().fit(X_train[features1], y_train)
print("cofficents: ", model_three.coef_)
print("intercept: ", model_three.intercept_)
# R^2 measured on the same training rows
print("RSquared(on training data)", model_three.score(X_train[features1], y_train))

cofficents:  [ 3.04217336e+02 -5.37618792e+04  1.60797232e+04  6.51882003e+05
 -3.04222659e+05]
intercept:  -68129245.36049479
RSquared(on training data) 0.5861763880871873


In [13]:
# model_four: OLS on features2 plus the explicit intercept column.
# OLS comes from statsmodels.api (array/DataFrame interface); `smf.OLS` raises
# AttributeError on newer statsmodels releases that no longer re-export it.
model_four = sm.OLS(y_train, X_train[features2 + ['const']]).fit()
print(model_four.summary())
print("\n pvalues \n")
print(model_four.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.588
Model:                            OLS   Adj. R-squared:                  0.588
Method:                 Least Squares   F-statistic:                     4110.
Date:                Sat, 22 Dec 2018   Prob (F-statistic):               0.00
Time:                        11:53:43   Log-Likelihood:            -2.3823e+05
No. Observations:               17290   AIC:                         4.765e+05
Df Residuals:                   17283   BIC:                         4.765e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
sqft_living      300.3238      3.199     93.

In [14]:
# scikit_learn: fit the features2 model with LinearRegression (rebinds model_four)
model_four = LinearRegression().fit(X_train[features2], y_train)
print("cofficents: ", model_four.coef_)
print("intercept: ", model_four.intercept_)
# R^2 measured on the same training rows
print("RSquared(on training data)", model_four.score(X_train[features2], y_train))

cofficents:  [ 3.00323837e+02 -8.83214580e+04 -4.12541090e+04  6.49855298e+05
 -2.94483595e+05  1.67752839e+04]
intercept:  -66722570.29684726
RSquared(on training data) 0.5879394915900829


In [15]:
# model_five: OLS on features3 plus the explicit intercept column.
# OLS comes from statsmodels.api (array/DataFrame interface); `smf.OLS` raises
# AttributeError on newer statsmodels releases that no longer re-export it.
model_five = sm.OLS(y_train, X_train[features3 + ['const']]).fit()
print(model_five.summary())
print("\n pvalues \n")
print(model_five.pvalues)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.608
Method:                 Least Squares   F-statistic:                     3360.
Date:                Sat, 22 Dec 2018   Prob (F-statistic):               0.00
Time:                        11:53:43   Log-Likelihood:            -2.3778e+05
No. Observations:               17290   AIC:                         4.756e+05
Df Residuals:                   17281   BIC:                         4.757e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
sqft_living        514.7614      7.874  

In [16]:
# scikit_learn: fit the features3 model with LinearRegression (rebinds model_five)
model_five = LinearRegression().fit(X_train[features3], y_train)
print("cofficents: ", model_five.coef_)
print("intercept: ", model_five.intercept_)
# R^2 measured on the same training rows
print("RSquared(on training data)", model_five.score(X_train[features3], y_train))

cofficents:  [ 5.14761424e+02  4.24414365e+03  1.09425026e+05  5.26320812e+05
 -4.00727762e+05 -2.12216563e+04  8.33084495e+02 -5.24370453e+05
  1.25593050e+05]
intercept:  -61298463.39175346
RSquared(on training data) 0.6086692576682473


In [17]:
# Comparing the models on training data (RSS = n * MSE).
# mean_squared_error is documented as (y_true, y_pred): observed values go first.
rss_model_three = y_train.shape[0] * metrics.mean_squared_error(y_train, model_three.predict(X_train[features1]))
rss_model_four = y_train.shape[0] * metrics.mean_squared_error(y_train, model_four.predict(X_train[features2]))
rss_model_five = y_train.shape[0] * metrics.mean_squared_error(y_train, model_five.predict(X_train[features3]))

print(rss_model_three)
print(rss_model_four)
print(rss_model_five)


944075969155936.2
940053715228654.5
892761889829522.1


In [18]:
# Comparing the models on training data (R2): score each fitted model on its own
# feature set, in the order model_three, model_four, model_five.
rsquared_model_three, rsquared_model_four, rsquared_model_five = (
    fitted_model.score(X_train[feature_names], y_train)
    for fitted_model, feature_names in (
        (model_three, features1),
        (model_four, features2),
        (model_five, features3),
    )
)
print(rsquared_model_three, rsquared_model_four, rsquared_model_five, sep="\n")

0.5861763880871873
0.5879394915900829
0.6086692576682473


In [19]:
# Comparing the models on test data (RSS = n * MSE).
# The rss_model_* names are rebound here from their training values to test values.
# mean_squared_error is documented as (y_true, y_pred): observed values go first.
rss_model_three = y_test.shape[0] * metrics.mean_squared_error(y_test, model_three.predict(X_test[features1]))
rss_model_four = y_test.shape[0] * metrics.mean_squared_error(y_test, model_four.predict(X_test[features2]))
rss_model_five = y_test.shape[0] * metrics.mean_squared_error(y_test, model_five.predict(X_test[features3]))

print(rss_model_three)
print(rss_model_four)
print(rss_model_five)

249121687561489.2
242840827232481.2
225783764096783.75


In [20]:
# Comparing the models on test data (R2): score each fitted model on its own
# feature set, in the order model_three, model_four, model_five.
rsquared_model_three, rsquared_model_four, rsquared_model_five = (
    fitted_model.score(X_test[feature_names], y_test)
    for fitted_model, feature_names in (
        (model_three, features1),
        (model_four, features2),
        (model_five, features3),
    )
)
print(rsquared_model_three, rsquared_model_four, rsquared_model_five, sep="\n")

0.6055492151727091
0.6154941152350784
0.6425016873440456


#### Training Data

* RSS(lowest to highest): model_five, model_four, model_three
* R2(highest to lowest) : model_five, model_four, model_three

#### Testing Data   

* RSS(lowest to highest): model_five, model_four, model_three
* R2(highest to lowest) : model_five, model_four, model_three