In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score

In [2]:
housing_int_df = pd.read_csv("../data/processed data/housing_mortgage_final.csv")
housing_int_df['Date'] = pd.to_datetime(housing_int_df['Date'])
print(housing_int_df.dtypes, housing_int_df.shape)
housing_int_df.head(3)

state_name                   object
county_name                  object
Date                 datetime64[ns]
Price                       float64
mortgage_int_rate           float64
dtype: object (13780, 5)


Unnamed: 0,state_name,county_name,Date,Price,mortgage_int_rate
0,California,Fresno,2000-01-31,121248.0,8.21
1,California,Kern,2000-01-31,108969.0,8.21
2,California,Los Angeles,2000-01-31,231141.0,8.21


In [3]:
# Removing data uptil 2012, to eliminate the non linear post recession dip
date = pd.to_datetime('2012-12-31')
mask = housing_int_df['Date'] > date
housing_int_2012 = housing_int_df.loc[mask]
housing_int_2012.head()

Unnamed: 0,state_name,county_name,Date,Price,mortgage_int_rate
8112,California,Fresno,2013-01-31,165879.0,3.414
8113,California,Kern,2013-01-31,146547.0,3.414
8114,California,Los Angeles,2013-01-31,425542.0,3.414
8115,California,Monterey,2013-01-31,342353.0,3.414
8116,California,Napa,2013-01-31,419359.0,3.414


In [4]:
housing_int_2012.reset_index(drop=True, inplace=True)
housing_int_2012.tail()

Unnamed: 0,state_name,county_name,Date,Price,mortgage_int_rate
5663,Washington,Kitsap,2022-01-31,552864.0,3.445
5664,Washington,Skagit,2022-01-31,562064.0,3.445
5665,Washington,Spokane,2022-01-31,411352.0,3.445
5666,Washington,Thurston,2022-01-31,502325.0,3.445
5667,Washington,Walla Walla,2022-01-31,381225.0,3.445


<b>Separating values from "2021-07-31" to "2022-01-31", to save it latter for model prediction. Saving data prior to "2021-07-31" into training dataframe</b>

In [5]:
dt_separate = pd.to_datetime('2021-06-30')
LA_hou_mor_train = housing_int_df[(housing_int_df["state_name"]=="California") & (housing_int_df["county_name"]=="Los Angeles") & (housing_int_df['Date'] <= dt_separate)]
LA_hou_mor_topred = housing_int_df[(housing_int_df["state_name"]=="California") & (housing_int_df["county_name"]=="Los Angeles") & (housing_int_df['Date'] > dt_separate)]

In [6]:
De_hou_mor_train = housing_int_df[(housing_int_df["state_name"]=="Colorado") & (housing_int_df["county_name"]=="Denver")& (housing_int_df['Date'] <= dt_separate)]
De_hou_mor_topred =housing_int_df[(housing_int_df["state_name"]=="Colorado") & (housing_int_df["county_name"]=="Denver")& (housing_int_df['Date'] > dt_separate)]

In [7]:
Da_hou_mor_train = housing_int_df[(housing_int_df["state_name"]=="Texas") & (housing_int_df["county_name"]=="Dallas") & (housing_int_df['Date'] <= dt_separate)]
Da_hou_mor_topred = housing_int_df[(housing_int_df["state_name"]=="Texas") & (housing_int_df["county_name"]=="Dallas") & (housing_int_df['Date'] > dt_separate)]

In [8]:
VB_hou_mor_train = housing_int_df[(housing_int_df["state_name"]=="Virginia") & (housing_int_df["county_name"]=="Virginia Beach") & (housing_int_df['Date'] <= dt_separate)]
VB_hou_mor_topred = housing_int_df[(housing_int_df["state_name"]=="Virginia") & (housing_int_df["county_name"]=="Virginia Beach") & (housing_int_df['Date'] > dt_separate)]

In [9]:
Ki_hou_mor_train = housing_int_df[(housing_int_df["state_name"]=="Washington") & (housing_int_df["county_name"]=="King") & (housing_int_df['Date'] <= dt_separate)]
Ki_hou_mor_topred = housing_int_df[(housing_int_df["state_name"]=="Washington") & (housing_int_df["county_name"]=="King") & (housing_int_df['Date'] > dt_separate)]

# Multiple linear regression model: with date and mortgage interest rate as features

In [10]:
# Using Los Angeles, California data to train multiple linear regression model
LA_hou_mor_train['Date_ord']= LA_hou_mor_train['Date'].map(dt.datetime.toordinal)
X = LA_hou_mor_train[["Date_ord","mortgage_int_rate"]]
y= LA_hou_mor_train.Price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LA_hou_mor_train['Date_ord']= LA_hou_mor_train['Date'].map(dt.datetime.toordinal)


In [11]:
# Fitting data to model and predicting on test data
MLR_model = LinearRegression()
MLR_model.fit(X_train, y_train)
y_pred_test = MLR_model.predict(X_test)
LA_test_pred = pd.DataFrame({'Actual': y_test, 'Predicted':  y_pred_test})

LA_test_pred

Unnamed: 0,Actual,Predicted
3434,556406.0,392531.182158
2342,385571.0,358798.872975
470,250040.0,380366.049161
3798,608196.0,440095.704027
7074,409578.0,493607.314914
...,...,...
11806,666729.0,702952.704319
10870,613964.0,619968.665910
4266,626308.0,460164.231954
11650,662130.0,695436.338825


In [12]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_test))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_test))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_test)))
print('R2 Score:', r2_score(y_test,y_pred_test))

Mean Absolute Error: 67382.65716010125
Mean Squared Error: 7118698383.233811
Root Mean Squared Error: 84372.37926735154
R2 Score: 0.6317816664766178


There is a -0.23 negative correlation between Housing prices and mortgage interest rates. Maybe the low R2 score implies the same. Trying Linear regression model without mortgage interest rate below, to predict housing price.

# Linear Regression model: only considering date