In [1]:
import os

# The data paths used further down are relative ("data/..."), so the working
# directory has to be the folder that contains `data/` (presumably the project
# root, one level above this notebook -- TODO confirm).
# Only step up when `data/` is not already visible: an unconditional
# os.chdir('..') climbs one more directory every time this cell is re-run.
if not os.path.isdir('data'):
    os.chdir('..')

In [2]:
from statsmodels.tsa.ar_model import AutoReg

from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, recall_score, precision_score
import random
import csv
import pandas as pd
import numpy as np
import pickle
from math import sqrt

# Baseline model
This notebook establishes a baseline model by training a simple autoregressive model on our housing data.

### Loading the housing data
The model needs to be provided with a real estate economic index to analyze. Our data folder contains two indices, one for the direct real estate market and one for the securitized market. Additional indices can easily be added and implemented.

In [3]:
# Available real estate indices: market name -> (CSV path, column holding the index values).
# To analyse another index, add an entry here and point `housing_market` at it.
HOUSING_INDICES = {
    "direct": ("data/real_estate_data/direct/england.csv", "House price index"),
    "securitized": ("data/real_estate_data/securitized/FTSE EPRA_NAREIT UK Historical Data.csv", "Price"),
}

# Market to analyse. NOTE: the securitized CSV additionally needs the
# `pd.to_datetime(..., format="%b %y")` line in the loading cell to be enabled.
housing_market = "direct"

housing_index, housing_column = HOUSING_INDICES[housing_market]

In [4]:
# Read only the date column and the selected index column; the dates become the
# row index and "," is treated as a thousands separator in the values.
housing_data = pd.read_csv(
    housing_index,
    usecols=['Period', housing_column],
    index_col='Period',
    parse_dates=['Period'],
    thousands=',',
)

# The line below is only needed for the securitized housing index
#housing_data.index = pd.to_datetime(housing_data.index, format="%b %y")

# Work with monthly periods instead of timestamps
housing_data.index = housing_data.index.to_period("M")

# Order chronologically, make the values float, then difference the series once
# (dropping the undefined first row) so results match those of the other models.
housing_data = (
    housing_data
    .sort_index()
    .astype({housing_column: float})
    .diff()
    .dropna()
)

### Split the data into training and test sets

In [5]:
# Hold out the last 20% of the (chronologically sorted) observations as the test set.
test_fraction = 0.2
n_test = round(len(housing_data) * test_fraction)

# Split on an explicit position. Slicing with `[0:-n_test]` / `[-n_test:]` would
# return an empty training set and put every row in the test set if n_test were 0.
split_at = len(housing_data) - n_test
training_data, test_data = housing_data.iloc[:split_at], housing_data.iloc[split_at:]

### Fit the Autoregression model

In [6]:
# Autoregressive model with two lags, fitted on the differenced training series.
# The values are passed as a plain list, so the fitted model is addressed by
# integer position rather than by period when predicting.
model = AutoReg(training_data[housing_column].tolist(), lags=2)
model_fit = model.fit()

# Show the coefficient table and fit statistics
model_fit.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,192.0
Model:,AutoReg(2),Log Likelihood,-160.173
Method:,Conditional MLE,S.D. of innovations,0.562
Date:,"Mon, 25 May 2020",AIC,-1.11
Time:,10:31:30,BIC,-1.041
Sample:,2,HQIC,-1.082
,192,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.1048,0.046,2.266,0.023,0.014,0.195
y.L1,0.5675,0.072,7.921,0.000,0.427,0.708
y.L2,0.1482,0.072,2.069,0.039,0.008,0.289

0,1,2,3,4
,Real,Imaginary,Modulus,Frequency
AR.1,1.3123,+0.0000j,1.3123,0.0000
AR.2,-5.1406,+0.0000j,5.1406,0.5000


### Evaluate the performance of the baseline model
The following code shows some key performance statistics of the baseline model, such as the RMSE, MAPE, directional accuracy and confusion matrix.

In [7]:
# Forecast one value per test observation, starting at the first position after
# the training sample and ending at the last test position (inclusive).
# NOTE(review): the printed forecasts settle near a constant value, which looks
# like multi-step forecasting from the end of the training data -- confirm this
# is the intended baseline rather than one-step-ahead (walk-forward) forecasts.
first_step = len(training_data)
last_step = first_step + len(test_data) - 1
predictions = model_fit.predict(start=first_step, end=last_step, dynamic=False)

# Observed (differenced) values the forecasts are compared against
actuals = test_data[housing_column].tolist()

In [8]:
# Mean absolute percentage error (reported as a fraction, not multiplied by 100)
# and root mean squared error of the forecasts on the test set.
abs_pct_errors = []
n_zero_actuals = 0
for actual, predicted in zip(actuals, predictions):
    print('predicted=%f, expected=%f' % (predicted, actual))
    if actual == 0:
        # The percentage error is undefined for a zero actual (a month without
        # change in the differenced series); dividing would silently turn the
        # whole MAPE into inf/nan, so such observations are left out instead.
        n_zero_actuals += 1
        continue
    abs_pct_errors.append(abs((actual - predicted) / actual))

mape = sum(abs_pct_errors) / len(abs_pct_errors) if abs_pct_errors else float('nan')

# Use the same list of actuals for the RMSE as for the MAPE
rmse = sqrt(mean_squared_error(actuals, predictions))
print('Test RMSE: %.3f' % rmse)
print('Test MAPE: %.3f' % mape)
if n_zero_actuals:
    print('MAPE excludes %d observation(s) with a zero actual value' % n_zero_actuals)

predicted=0.400511, expected=0.130000
predicted=0.388434, expected=1.000000
predicted=0.384620, expected=0.560000
predicted=0.380666, expected=1.270000
predicted=0.377856, expected=1.020000
predicted=0.375676, expected=1.200000
predicted=0.374022, expected=0.150000
predicted=0.372760, expected=-0.160000
predicted=0.371798, expected=-0.450000
predicted=0.371066, expected=0.550000
predicted=0.370507, expected=0.430000
predicted=0.370082, expected=-0.160000
predicted=0.369758, expected=0.540000
predicted=0.369511, expected=-0.460000
predicted=0.369323, expected=1.610000
predicted=0.369179, expected=0.840000
predicted=0.369070, expected=0.920000
predicted=0.368986, expected=1.380000
predicted=0.368923, expected=0.610000
predicted=0.368875, expected=-0.290000
predicted=0.368838, expected=-0.020000
predicted=0.368810, expected=-0.450000
predicted=0.368788, expected=0.630000
predicted=0.368772, expected=-0.650000
predicted=0.368760, expected=0.460000
predicted=0.368750, expected=-0.770000
pre

In [9]:
# Directional accuracy: fraction of test observations for which the sign of the
# forecast change equals the sign of the actual change.
actual_direction = np.sign(actuals)
predicted_direction = np.sign(predictions)
accuracy_score(actual_direction, predicted_direction)

0.6041666666666666

In [10]:
# Confusion matrix of the direction of change (sign of the value):
# rows are the actual directions, columns the predicted ones.
confusion_matrix(y_true=np.sign(actuals), y_pred=np.sign(predictions))

array([[ 0, 19],
       [ 0, 29]])

In [11]:
# Precision for the "index goes up" class: of all predicted increases, the share
# that were actual increases.
# Directions are binarised with `> 0` instead of np.sign: np.sign yields a third
# label (0) when an actual change is exactly zero, and precision_score with its
# default binary averaging raises on more than two labels. An unchanged index is
# counted as "not up". The result is identical whenever no zeros occur.
precision_score(np.asarray(actuals) > 0, np.asarray(predictions) > 0)

0.6041666666666666