In [1]:
%env HV_DOC_HTML = true

env: HV_DOC_HTML=true


In [2]:
pip install -q hvplot

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.linear_model import LinearRegression
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load the housing data
housingDF = pd.read_csv('Housing.csv')
housingDF.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [6]:
# Make scatter plot of the area vs. price
hvplot.extension('bokeh')
housingScatterPlot = housingDF.hvplot.scatter(
    x='area',
    y='price' # no "by" since there is no grouping
)

housingScatterPlot

In [7]:
# To make the linear regression, we split the x (i.e., area) off from the y (price)
X = housingDF['area'].values.reshape(-1, 1)
X[:3]

array([[7420],
       [8960],
       [9960]])

In [8]:
# The shape is going to be 545 observations (i.e., samples) with a single feature (column, in this case, area)
X.shape

(545, 1)

In [10]:
# Create the array for the dependent variable Y (i.e., price)
y = housingDF['price']
y[:3]

0    13300000
1    12250000
2    12250000
Name: price, dtype: int64

In [11]:
# Set up the linear regression model
model = LinearRegression()

In [12]:
# Fit the X and y data onto the model
model.fit(X, y)

In [13]:
# Remember the goal for linear regression is to get a line equation that best "fits" the data (i.e, need a slope and intercept)
# : y = mx + b; get the slope: .coef_
model.coef_

array([461.97489427])

In [14]:
# Get the intercept - .intercept_
model.intercept_

2387308.48239643

In [15]:
# Make the line of best fit
print(f"y = {model.coef_[0]}X + {model.intercept_}")

y = 461.97489427278344X + 2387308.48239643


In [16]:
# The line of best fit also allows for us to make predictions - predict price based on the area
# E.g., Predict a price of a house with 2000 sqft area
twoThousandSQFAreaPred = (model.coef_[0] * 2000) + model.intercept_

print(f"Predicted Price for 2000 sqft area: ${twoThousandSQFAreaPred:,.2f}")

Predicted Price for 2000 sqft area: $3,311,258.27


In [17]:
# WE NEED A LIST OF PREDICTED VALUES TO DRAW A LINE OF BEST FIT FOR THE REGRESSION
predictedPrices = model.predict(X)
predictedPrices

array([5815162.19790048, 6526603.53508057, 6988578.42935335,
       5852120.18944231, 5815162.19790048, 5852120.18944231,
       6351053.07525691, 9871301.76961552, 6129305.12600598,
       5043664.12446493, 8485377.08679717, 5159157.84803313,
       5413244.03988316, 4004220.61235117, 5990712.65772414,
       5159157.84803313, 5436342.7845968 , 6314095.08371509,
       4512392.99605123, 5353187.3036277 , 4383040.02565485,
       5692738.8509182 , 6106206.38129234, 4493914.00028032,
       6452687.55199692, 5408624.29094043, 5159157.84803313,
       6487335.66906738, 6060008.89186506, 4928170.40089674,
       5840570.81708549, 5621132.74230591, 4641745.96644761,
       5140678.85226222, 5547216.75922227, 5621132.74230591,
       5843804.6413454 , 6545082.53085148, 5159157.84803313,
       5159157.84803313, 5413244.03988316, 5325468.80997133,
       5380905.79728407, 5159157.84803313, 5159157.84803313,
       5159157.84803313, 5159157.84803313, 5436342.7845968 ,
       4373800.5277694 ,

In [18]:
# Make a copy of the housingDF
priceDFPredicted = housingDF.copy()

# Add the predicted prices (as a new column to the new DataFrame)
priceDFPredicted["predicted_price"] = predictedPrices

priceDFPredicted.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,predicted_price
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,5815162.0
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,6526604.0
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,6988578.0
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,5852120.0
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,5815162.0


In [23]:
# Make the line plot of the predicted values (i.e, line of best fit)
hvplot.extension('bokeh')
lineOfBestFit = priceDFPredicted.hvplot.line(
    x='area',
    y='predicted_price',
    color='red'
)
lineOfBestFit

In [22]:
# Superimpose the line of best fit onto the original dataset.
hvplot.extension('bokeh')
housingScatterPlot * lineOfBestFit

In [None]:
"""
For the linear regression model, the score and r2_score functions retrieve the same value.
We can use either metric as a general measure of the model's accuracy: the closer to 1, the better.

Remember that the RMSE is the square root of the mean squared error,
i.e.,the standard deviation of the errors. A low RMSE score means that the model fits well to the data.

Ideally, the RMSE will not exceed the standard deviation of our housing data.

The RMSE, or the standard deviation of the error, is 1577612.5628.

The standard deviation of the salaries, calculated by np.std(), is 1868722.8281.

The standard deviation exceeds the RMSE, indicating that the model may be useful.
In other words, on average, there are smaller swings in the error than for the recorded prices.
"""

In [21]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = round(model.score(X, y, sample_weight=None),5)
r2 = round(r2_score(y, predictedPrices),5)
mse = round(mean_squared_error(y, predictedPrices),4)
rmse = round(np.sqrt(mse),4)
std = round(np.std(y),4)

# Print relevant metrics
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.28729.
The r2 is 0.28729.
The mean squared error is 2488861398180.6567.
The root mean squared error is 1577612.5628.
The standard deviation is 1868722.8281.
