In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Load the multivariate house-price dataset and preview its first rows.
HOUSING_CSV = "../data/house_prices_multivariate.csv"
NY_Housing = pd.read_csv(HOUSING_CSV)
NY_Housing.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrSold,SalePrice
0,65.0,8450,7,5,2003,2003,196.0,706,0,150,...,548,0,61,0,0,0,0,0,2008,208500
1,80.0,9600,6,8,1976,1976,0.0,978,0,284,...,460,298,0,0,0,0,0,0,2007,181500
2,68.0,11250,7,5,2001,2002,162.0,486,0,434,...,608,0,42,0,0,0,0,0,2008,223500
3,60.0,9550,7,5,1915,1970,0.0,216,0,540,...,642,0,35,272,0,0,0,0,2006,140000
4,84.0,14260,8,5,2000,2000,350.0,655,0,490,...,836,192,84,0,0,0,0,0,2008,250000


In [4]:
# Split the dataset into independent variables (X) and the dependent variable (y).
# The target is selected and dropped by name, so the split does not rely on
# SalePrice being the last column of the CSV (a positional slice would silently
# leak the target into X if the column order ever changed).
TARGET = "SalePrice"
X = NY_Housing.drop(columns=TARGET)  # every column except the target
y = NY_Housing[TARGET]
X, y

(      LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \
 0            65.0     8450            7            5       2003          2003   
 1            80.0     9600            6            8       1976          1976   
 2            68.0    11250            7            5       2001          2002   
 3            60.0     9550            7            5       1915          1970   
 4            84.0    14260            8            5       2000          2000   
 ...           ...      ...          ...          ...        ...           ...   
 1374         62.0     7917            6            5       1999          2000   
 1375         85.0    13175            6            6       1978          1988   
 1376         66.0     9042            7            9       1941          2006   
 1377         68.0     9717            5            6       1950          1996   
 1378         75.0     9937            5            6       1965          1965   
 
       MasVnrA

In [5]:
# Fit a linear regression on all predictors. fit() returns the estimator
# itself, so naming it on the last line displays the same repr as before.
regressor = LinearRegression().fit(X, y)
regressor

LinearRegression()

In [6]:
# Report the fitted parameters: the y-intercept and one weight per predictor
# (the regression coefficients, in the column order of X).
intercept, weights = regressor.intercept_, regressor.coef_
print("intercept:", intercept)
print("coefficients of predictors:", weights)

intercept: 310649.26008892496
coefficients of predictors: [ 4.21581098e+01  4.41367617e-01  1.77089455e+04  5.84597164e+03
  3.59658315e+02  1.19385237e+02  2.59435150e+01  9.76748897e+00
  7.65860420e-01 -6.61329856e-01  9.87201953e+00  1.96567898e+01
  1.31846867e+01 -4.66155080e+00  2.81799257e+01  6.78157998e+03
  3.36169287e+02  1.40344800e+03 -2.93573021e+03 -8.64074712e+03
 -3.35073713e+04  6.10172168e+03  3.20869122e+03 -8.23684306e+01
  1.56189970e+04  9.59392447e+00  2.51559075e+01  5.60981357e-01
  1.07712460e+01  2.51081902e+01  5.36124522e+01 -4.13099007e+01
 -8.16461371e-02 -5.83097021e+02]


In [10]:
# Pick the house whose price will be predicted: the row at position 155 of X.
HOUSE_POSITION = 155
my_house = X.iloc[HOUSE_POSITION]
my_house

LotFrontage          0.0
LotArea          16669.0
OverallQual          8.0
OverallCond          6.0
YearBuilt         1981.0
YearRemodAdd      1981.0
MasVnrArea         653.0
BsmtFinSF1           0.0
BsmtFinSF2           0.0
BsmtUnfSF         1686.0
TotalBsmtSF       1686.0
1stFlrSF          1707.0
2ndFlrSF             0.0
LowQualFinSF         0.0
GrLivArea         1707.0
BsmtFullBath         0.0
BsmtHalfBath         0.0
FullBath             2.0
HalfBath             1.0
BedroomAbvGr         2.0
KitchenAbvGr         1.0
TotRmsAbvGrd         6.0
Fireplaces           1.0
GarageYrBlt       1981.0
GarageCars           2.0
GarageArea         511.0
WoodDeckSF         574.0
OpenPorchSF         64.0
EnclosedPorch        0.0
3SsnPorch            0.0
ScreenPorch          0.0
PoolArea             0.0
MiscVal              0.0
YrSold            2006.0
Name: 155, dtype: float64

In [11]:
# Predict the price of the selected house and compare it with its recorded price.
# The row is wrapped in a one-row DataFrame so the column names used during
# fit() are kept; passing a bare NumPy array drops them and makes scikit-learn
# warn that "X does not have valid feature names".
my_house_frame = pd.DataFrame([my_house])
pred_my_house = regressor.predict(my_house_frame)
print(pred_my_house)
print("predicted value:", pred_my_house[0])
# my_house.name is the row's index label, so the actual price is read for the
# same row even when the index is not the default 0..n-1 range.
print("actual value:", y.loc[my_house.name])

[264519.41857028]
predicted value: 264519.41857028275
actual value: 228000


In [13]:
# Predict the price of every house in the dataset, then print the full
# (NumPy-abbreviated) array followed by its first ten entries.
y_pred = regressor.predict(X)
for preview in (y_pred, y_pred[:10]):
    print(preview)

[223165.2446233  193708.14702761 216394.79759077 ... 231576.65840449
 128124.97903219 150396.517218  ]
[223165.2446233  193708.14702761 216394.79759077 197356.62505514
 295125.75398645 172516.96207705 269477.13355183 245198.81455232
 168787.92247658  87185.78920276]


In [14]:
# Line up each actual sale price with the model's prediction for the same row.
comparison_columns = {"actual": y, "predicted": y_pred}
prices = pd.DataFrame(comparison_columns)
prices.head(10)

Unnamed: 0,actual,predicted
0,208500,223165.244623
1,181500,193708.147028
2,223500,216394.797591
3,140000,197356.625055
4,250000,295125.753986
5,143000,172516.962077
6,307000,269477.133552
7,200000,245198.814552
8,129900,168787.922477
9,118000,87185.789203


In [15]:
# Measure the goodness of fit with the cost function: mean squared error (MSE).
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the actual
# prices go first. MSE is symmetric, so the value is unchanged, but the
# conventional order matters as soon as an asymmetric metric (e.g. r2_score)
# is swapped in.
# mean_squared_error is imported in the notebook's top import cell.
mean_squared_error(y, y_pred)

1219044781.4947448

In [None]:
'''
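The error value is very high. Note that MSE is expressed in squared price units;
its square root (the RMSE) is roughly 34,900. A likely cause of the error is that
the data violates one or more of the assumptions of linear regression.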

Assumptions in linear regression:
1. linear relationship assumption
2. little or no multi-collinearity assumption
3. homoscedasticity assumption
4. little or no autocorrelation in residuals
5. normal distribution of error terms
'''