In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pmdarima as pm
from pmdarima.arima.stationarity import ADFTest
from pmdarima.arima import ndiffs
from sklearn.metrics import mean_squared_error
from pmdarima.metrics import smape
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Load the submarket panel; the first CSV column becomes the frame's index.
df = pd.read_csv(
    '/home/zqiao/data_flake/imputed data/pho_t_data.csv',
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Show column dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Run configuration used by the modelling cells below.
# ntest: number of trailing observations held out as the test window.
ntest = 24
# submkt_id: value of `research_submkt_id` selecting the submarket to model.
submkt_id = 'PHO038'

In [None]:
# Check stationarity of the target series for the selected submarket, pick a
# differencing order, split into train/test windows and fit an ARIMA model
# with exogenous regressors via pmdarima's stepwise search.
target_col = 'real_hedonic_rent_submarket'
exo_cols = [
    "real_market_level_rent",
    "gdp_histfc",
    "employment_histfc",
    "real_ecommerce",
    "spread_3m10y",
    "imports_us",
    "expots_us",  # NOTE(review): looks like a typo of "exports_us" - confirm against the CSV header
    "ecomm^2_pop",
    "weighted_pop_estimate_cryr",
    "weighted_hh_estimate_cryr",
]

# Select the submarket directly instead of looping over every group. Fail
# loudly if it is missing: otherwise `auto`, `Y_test`, ... would never be
# defined and later cells would die with an unrelated NameError.
group = df[df['research_submkt_id'] == submkt_id]
if group.empty:
    raise ValueError(f"No rows found for research_submkt_id == {submkt_id!r}")
if not 0 < ntest < len(group):
    raise ValueError(
        f"ntest={ntest} must be between 1 and {len(group) - 1} "
        f"for submarket {submkt_id!r} ({len(group)} rows)"
    )

exo = group[exo_cols]
target = group[target_col]

# ADF gives a p-value / differencing flag; the differencing order is the
# larger of the KPSS and ADF estimates.
# NOTE(review): these tests run on the full series, including the hold-out
# window - consider using only the training window.
adf_test = ADFTest(alpha=0.05)
p_val, should_diff = adf_test.should_diff(target)
kpss_diffs = ndiffs(target, alpha=0.05, test='kpss', max_d=6)
adf_diffs = ndiffs(target, alpha=0.05, test='adf', max_d=6)
n_diffs = max(adf_diffs, kpss_diffs)
print(submkt_id,': ',p_val,should_diff,n_diffs)

# Hold out the last `ntest` observations; explicit positional slicing.
Y_train = target.iloc[:-ntest]
Y_test = target.iloc[-ntest:]
X_train = exo.iloc[:-ntest, :]
X_test = exo.iloc[-ntest:, :]

# `auto_arima` has no `min_p`/`min_q` arguments (they were silently swallowed
# as fit kwargs); the stepwise search start is set with `start_p`/`start_q`.
# `scoring` only matters when `out_of_sample_size > 0`; kept as originally set.
auto = pm.auto_arima(Y_train, X_train, d=n_diffs,
                     suppress_warnings=True, error_action="ignore",
                     start_p=1, start_q=1, max_p=6, max_q=6,
                     stepwise=True, scoring=smape,
                     max_order=None, trace=True)
print(auto.summary())

In [None]:
# Forecast the hold-out window with the fitted model and score it.
model = auto
# Forecast exactly as many steps as were held out, rather than a hard-coded
# 24, so the horizon always matches the rows in X_test / Y_test.
forecasts = model.predict(len(Y_test), X_test)

# Print evaluation metrics
mse = mean_squared_error(Y_test, forecasts)
smape_ = smape(Y_test, forecasts)
print(f"Mean squared error: {mse}")
print(f"SMAPE: {smape_}")

In [None]:
# Plot the hold-out forecasts against the actuals (top) and with their
# confidence band (bottom). Uses the notebook's own variables (Y_train,
# Y_test, X_test, model, forecasts) instead of undefined names left over
# from another example.
fig, axes = plt.subplots(2, 1, figsize=(12, 12))

test_index = Y_test.index
forecast_values = np.asarray(forecasts)

# The intervals are not produced by the forecasting cell, so request them
# here for the same horizon and regressors.
_, confidence_intervals = model.predict(len(Y_test), X_test,
                                        return_conf_int=True)
conf_int = np.asarray(confidence_intervals)

# --------------------- Actual vs. Predicted --------------------------
axes[0].plot(Y_train.index, Y_train.values, color='blue', label='Training Data')
axes[0].plot(test_index, forecast_values, color='green', marker='o',
             label='Predicted Rent')
axes[0].plot(test_index, Y_test.values, color='red', label='Actual Rent')
axes[0].set_title(f'{submkt_id}: Real Hedonic Rent - Actual vs. Predicted')
axes[0].set_xlabel('Period')
axes[0].set_ylabel('Real hedonic rent')
# Tick positions are left to matplotlib: the previous hard-coded
# 0..7982 range and df['Date'] labels belonged to a different dataset.
axes[0].legend()

# ------------------ Predicted with confidence intervals ----------------
axes[1].plot(Y_train.index, Y_train.values, color='blue', label='Training Data')
axes[1].plot(test_index, forecast_values, color='green',
             label='Predicted Rent')
axes[1].set_title(f'{submkt_id}: Rent Predictions & Confidence Intervals')
axes[1].set_xlabel('Period')
axes[1].set_ylabel('Real hedonic rent')
axes[1].fill_between(test_index,
                     conf_int[:, 0], conf_int[:, 1],
                     alpha=0.9, color='orange',
                     label="Confidence Intervals")
axes[1].legend()