In [43]:
# moderndive provides the regression helpers used below
# (get_regression_table(), get_regression_summaries(), get_regression_points()).
# Install it only when it is missing, so re-running the notebook does not
# download and reinstall the package every time.
if (!requireNamespace("moderndive", quietly = TRUE)) {
  install.packages("moderndive")
}
library(moderndive)

# Print numbers in fixed notation instead of scientific notation.
# Setting scipen back to 0 restores R's default behaviour.
options(scipen = 999)
# Print up to 4 significant digits.
options(digits = 4)

# Load the house sales data from the working directory.
house <- read.csv(file = 'kc_house_data.csv')
# Preview the first rows. The id and date columns can be dropped if they are not needed.
head(house)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.51,-122.3,1340,5650
6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.72,-122.3,1690,7639
5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.74,-122.2,2720,8062
2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.52,-122.4,1360,5000
1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.62,-122.0,1800,7503
7237550310,20140512T000000,1225000,4,4.5,5420,101930,1,0,0,...,11,3890,1530,2001,0,98053,47.66,-122.0,4760,101930


In [58]:
# Inspect the structure of the data frame: the type and first values of every column.
str(object = house)

'data.frame':	21613 obs. of  21 variables:
 $ id           : num  7129300520 6414100192 5631500400 2487200875 1954400510 ...
 $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
 $ price        : num  221900 538000 180000 604000 510000 ...
 $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
 $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
 $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
 $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
 $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
 $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ view         : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ condition    : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 3 3 3 3 3 3 ...
 $ grade        : Factor w/ 12 levels "1","3","4","5",..: 6 6 5 6 7 10 6 6 6 6 ...
 $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
 $ sqft_basement: int  0 400 0 91

In [89]:
# Drop the columns that are not used in the model by keeping every column
# whose name is not listed here.
house <- house[setdiff(names(house), c(
  "id",
  "date",
  "lat",            # The data is just for King County, no big impact
  "long",           # In the same area, no big impact
  "sqft_above",     # sqft_living = sqft_above + sqft_basement
  "sqft_basement"   # sqft_living = sqft_above + sqft_basement
))]

# Make sure the categorical variables are stored as factors.
# "waterfront" is left unchanged because it is already a 0/1 dummy.
house[c("view", "condition", "grade", "zipcode")] <- lapply(
  house[c("view", "condition", "grade", "zipcode")],
  as.factor
)

# Recode "yr_renovated" as a dummy: 0 where the value is 0, 1 otherwise.
house$yr_renovated <- as.numeric(house$yr_renovated != 0)

# Check the column types again.
str(object = house)

'data.frame':	21613 obs. of  15 variables:
 $ price        : num  221900 538000 180000 604000 510000 ...
 $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
 $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
 $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
 $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
 $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
 $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ view         : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ condition    : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 3 3 3 3 3 3 ...
 $ grade        : Factor w/ 12 levels "1","3","4","5",..: 6 6 5 6 7 10 6 6 6 6 ...
 $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
 $ yr_renovated : num  0 1 0 0 0 0 0 0 0 0 ...
 $ zipcode      : Factor w/ 70 levels "98001","98002",..: 67 56 17 59 38 30 3 69 61 24 ...
 $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
 $

In [105]:
# Linear regression of price on all remaining columns of `house`.
# The model is fitted in this cell so that the notebook runs from a fresh
# kernel instead of relying on a `price_all` object left over from an
# earlier session.
price_all <- lm(price ~ ., data = house)
# Coefficient table: estimate, standard error, test statistic, p-value and
# confidence interval for each term.
get_regression_table(price_all)
# Model-level summaries such as R-squared, adjusted R-squared and RMSE.
get_regression_summaries(price_all)

term,estimate,std_error,statistic,p_value,lower_ci,upper_ci
intercept,624282.859,193784.835,3.222,0.001,244450.194,1004115.525
bedrooms,-11780.654,1461.715,-8.059,0.000,-14645.725,-8915.583
bathrooms,20249.275,2448.160,8.271,0.000,15450.699,25047.851
sqft_living,144.911,2.592,55.905,0.000,139.831,149.992
sqft_lot,0.254,0.036,7.064,0.000,0.183,0.324
floors,-8615.326,2618.517,-3.290,0.001,-13747.815,-3482.838
waterfront,573526.001,15023.867,38.174,0.000,544078.107,602973.896
view1,77651.068,8508.326,9.126,0.000,60974.117,94328.019
view2,62023.441,5188.406,11.954,0.000,51853.779,72193.102
view3,135065.289,7087.810,19.056,0.000,121172.656,148957.923


r_squared,adj_r_squared,mse,rmse,sigma,statistic,p_value,df
0.834,0.833,22391414542,149638,149982,1102,0,99


In [108]:
# Get information on each observation in the regression: the original
# columns plus the fitted value (price_hat) and the residual.
prediction <- get_regression_points(price_all)
# Show only the first rows; the table has one row per observation, so
# displaying it in full would flood the notebook output.
head(prediction, n = 10)

ID,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,zipcode,sqft_living15,sqft_lot15,price_hat,residual
1,221900,3,1.00,1180,5650,1.0,0,0,3,7,1955,0,98178,1340,5650,153333,68567
2,538000,3,2.25,2570,7242,2.0,0,0,3,7,1951,1,98125,1690,7639,604968,-66968
3,180000,2,1.00,770,10000,1.0,0,0,3,6,1933,0,98028,2720,8062,245036,-65036
4,604000,4,3.00,1960,5000,1.0,0,0,5,7,1965,0,98136,1360,5000,574502,29498
5,510000,3,2.00,1680,8080,1.0,0,0,3,8,1987,0,98074,1800,7503,423733,86267
6,1225000,4,4.50,5420,101930,1.0,0,0,3,11,2001,0,98053,4760,101930,1528473,-303473
7,257500,3,2.25,1715,6819,2.0,0,0,3,7,1995,0,98003,2238,6819,222874,34626
8,291850,3,1.50,1060,9711,1.0,0,0,3,7,1963,0,98198,1650,9711,124147,167703
9,229500,3,1.00,1780,7470,1.0,0,0,3,7,1960,0,98146,1780,8113,315698,-86198
10,323000,3,2.50,1890,6560,2.0,0,0,3,7,2003,0,98038,2390,7570,297937,25063


In [120]:
# Mean absolute residual and mean observed price.
avg_residual <- mean(abs(prediction[["residual"]]))
avg_price <- mean(prediction[["price"]])
# Mean absolute residual expressed as a percentage of the mean price,
# rounded to two decimal places.
round((avg_residual / avg_price) * 100, digits = 2)