In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import yfinance as yf
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf, adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## 1. Load and Explore Data

In [2]:
# Fetch gold price data
gold = yf.download('GLD', start='2015-01-01', end='2026-01-18', progress=False)
gold['Price'] = (gold['Close'] * 10.8).round(0)
gold = gold[['Price']].copy()

print(f"Gold data shape: {gold.shape}")
print(f"Date range: {gold.index.min()} to {gold.index.max()}")
print(f"\nData summary:")
print(gold['Price'].describe())

# Plot original series
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=gold.index, y=gold['Price'],
    mode='lines',
    name='Gold Price (Daily)',
    line=dict(color='goldenrod', width=1)
))
fig.update_layout(
    title="Gold Price Time Series (2015-2026)",
    xaxis_title='Date',
    yaxis_title='Price ($)',
    template='plotly_white',
    height=500
)
fig.show()

print("✓ Original series loaded and visualized")

Gold data shape: (2777, 1)
Date range: 2015-01-02 00:00:00 to 2026-01-16 00:00:00

Data summary:
count    2777.000000
mean     1800.807706
std       669.254683
min      1085.000000
25%      1306.000000
50%      1725.000000
75%      1955.000000
max      4600.000000
Name: Price, dtype: float64


✓ Original series loaded and visualized


## 2. Stationarity Testing with ADF Test

In [3]:
# Augmented Dickey-Fuller test on original series
adf_result_original = adfuller(gold['Price'].dropna(), autolag='AIC')

print("ADF Test on Original Series:")
print(f"  Test Statistic: {adf_result_original[0]:.6f}")
print(f"  P-value: {adf_result_original[1]:.6f}")
print(f"  Critical Values:")
for key, value in adf_result_original[4].items():
    print(f"    {key}: {value:.3f}")

if adf_result_original[1] < 0.05:
    print("  Result: Series is STATIONARY (reject null hypothesis)")
else:
    print("  Result: Series is NON-STATIONARY (fail to reject null hypothesis)")

# First difference to make stationary
gold_diff = gold['Price'].diff().dropna()

# ADF test on differenced series
adf_result_diff = adfuller(gold_diff, autolag='AIC')

print("\nADF Test on First Differenced Series:")
print(f"  Test Statistic: {adf_result_diff[0]:.6f}")
print(f"  P-value: {adf_result_diff[1]:.6f}")

if adf_result_diff[1] < 0.05:
    print("  Result: Series is STATIONARY")
else:
    print("  Result: Series is NON-STATIONARY")

# Visualize original vs differenced
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Original Series', 'First Differenced'),
    shared_xaxes=True,
    vertical_spacing=0.1
)

fig.add_trace(
    go.Scatter(x=gold.index, y=gold['Price'], mode='lines', name='Original', line=dict(color='blue')),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=gold_diff.index, y=gold_diff.values, mode='lines', name='Differenced', line=dict(color='red')),
    row=2, col=1
)

fig.update_layout(height=600, showlegend=True, title_text="Stationarity Check")
fig.show()

print("✓ Stationarity testing complete")

ADF Test on Original Series:
  Test Statistic: 4.640484
  P-value: 1.000000
  Critical Values:
    1%: -3.433
    5%: -2.863
    10%: -2.567
  Result: Series is NON-STATIONARY (fail to reject null hypothesis)

ADF Test on First Differenced Series:
  Test Statistic: -7.554272
  P-value: 0.000000
  Result: Series is STATIONARY

ADF Test on First Differenced Series:
  Test Statistic: -7.554272
  P-value: 0.000000
  Result: Series is STATIONARY


✓ Stationarity testing complete


## 3. Calculate Autocorrelation Function (ACF)

In [4]:
# Calculate ACF
acf_vals = acf(gold_diff, nlags=40)

# Calculate confidence interval
ci_acf = 1.96 / np.sqrt(len(gold_diff))

# Create ACF plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=list(range(len(acf_vals))), 
    y=acf_vals, 
    name='ACF',
    marker=dict(color='blue')
))

# Add confidence intervals
fig.add_hline(y=ci_acf, line_dash='dash', line_color='red', annotation_text="95% CI")
fig.add_hline(y=-ci_acf, line_dash='dash', line_color='red')

fig.update_layout(
    title="Autocorrelation Function (ACF) - Differenced Series",
    xaxis_title='Lag',
    yaxis_title='ACF',
    height=500,
    template='plotly_white'
)
fig.show()

print(f"ACF Values (Lags 1-10): {acf_vals[1:11]}")
print(f"95% Confidence Interval: ±{ci_acf:.4f}")

# Count significant lags
significant_acf_lags = [i for i in range(1, 21) if abs(acf_vals[i]) > ci_acf]
print(f"Significant lags in ACF (first 20): {significant_acf_lags[:10]}")
print("✓ ACF analysis complete")

ACF Values (Lags 1-10): [-0.02978571  0.0254735  -0.02616689 -0.04282418 -0.00287334  0.02005216
 -0.04468212  0.04026662 -0.04611582  0.00880913]
95% Confidence Interval: ±0.0372
Significant lags in ACF (first 20): [4, 7, 8, 9, 18]
✓ ACF analysis complete


## 4. Calculate Partial Autocorrelation Function (PACF)

In [5]:
# Calculate PACF
pacf_vals = pacf(gold_diff, nlags=40, method='ywm')

# Calculate confidence interval for PACF
ci_pacf = 1.96 / np.sqrt(len(gold_diff))

# Create PACF plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=list(range(len(pacf_vals))), 
    y=pacf_vals, 
    name='PACF',
    marker=dict(color='green')
))

# Add confidence intervals
fig.add_hline(y=ci_pacf, line_dash='dash', line_color='red', annotation_text="95% CI")
fig.add_hline(y=-ci_pacf, line_dash='dash', line_color='red')

fig.update_layout(
    title="Partial Autocorrelation Function (PACF) - Differenced Series",
    xaxis_title='Lag',
    yaxis_title='PACF',
    height=500,
    template='plotly_white'
)
fig.show()

print(f"PACF Values (Lags 1-10): {pacf_vals[1:11]}")
print(f"95% Confidence Interval: ±{ci_pacf:.4f}")

# Find significant lags in PACF - this suggests AR order
significant_pacf_lags = [i for i in range(1, 21) if abs(pacf_vals[i]) > ci_pacf]
print(f"Significant lags in PACF (first 20): {significant_pacf_lags[:10]}")

print("\nInterpretation:")
print("  - PACF cuts off after lag p → AR(p) model suggested")
print(f"  - First significant PACF spike at lag {significant_pacf_lags[0] if significant_pacf_lags else 'N/A'}")
print("✓ PACF analysis complete")

PACF Values (Lags 1-10): [-0.02978571  0.02460815 -0.02473075 -0.0450083  -0.00418111  0.02146728
 -0.04577472  0.03468115 -0.04119934  0.00399202]
95% Confidence Interval: ±0.0372
Significant lags in PACF (first 20): [4, 7, 9, 18]

Interpretation:
  - PACF cuts off after lag p → AR(p) model suggested
  - First significant PACF spike at lag 4
✓ PACF analysis complete


## 5. Prepare Data for Modeling

In [6]:
# 80-20 train-test split on differenced series
train_size = int(len(gold_diff) * 0.8)
train = gold_diff.iloc[:train_size]
test = gold_diff.iloc[train_size:]

print(f"Training set: {len(train)} observations ({len(train)/len(gold_diff)*100:.1f}%)")
print(f"Test set: {len(test)} observations ({len(test)/len(gold_diff)*100:.1f}%)")
print(f"Train period: {train.index.min()} to {train.index.max()}")
print(f"Test period: {test.index.min()} to {test.index.max()}")

# Plot train-test split
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=train.index, y=train.values,
    mode='lines',
    name='Training Data',
    line=dict(color='blue')
))

fig.add_trace(go.Scatter(
    x=test.index, y=test.values,
    mode='lines',
    name='Test Data',
    line=dict(color='red')
))

fig.update_layout(
    title="Train-Test Split (Differenced Series)",
    xaxis_title='Date',
    yaxis_title='Price Change ($)',
    template='plotly_white',
    height=500
)
fig.show()

print("✓ Data prepared for AR modeling")

Training set: 2220 observations (80.0%)
Test set: 556 observations (20.0%)
Train period: 2015-01-05 00:00:00 to 2023-10-27 00:00:00
Test period: 2023-10-30 00:00:00 to 2026-01-16 00:00:00


✓ Data prepared for AR modeling


## 6. Fit AR Models with Different Orders

In [7]:
# Fit multiple AR models
ar_orders = [1, 2, 3, 5, 7, 10]
models = {}
aic_values = []
bic_values = []

print(f"\n{'Order':<8} {'AIC':<15} {'BIC':<15}")
print("-" * 38)

for p in ar_orders:
    model = AutoReg(train, lags=p, seasonal=False)
    fit = model.fit()
    models[p] = fit
    aic_values.append(fit.aic)
    bic_values.append(fit.bic)
    
    print(f"{p:<8} {fit.aic:<15.2f} {fit.bic:<15.2f}")

optimal_order_aic = ar_orders[np.argmin(aic_values)]
optimal_order_bic = ar_orders[np.argmin(bic_values)]

print(f"\nOptimal AR order by AIC: {optimal_order_aic}")
print(f"Optimal AR order by BIC: {optimal_order_bic}")

# Use BIC optimal model (more conservative)
optimal_order = optimal_order_bic
fit_optimal = models[optimal_order]

print(f"\nSelected: AR({optimal_order}) model")


Order    AIC             BIC            
--------------------------------------
1        18078.95        18096.07       
2        18068.85        18091.67       
3        18062.36        18090.88       
5        18048.43        18088.36       
7        18034.74        18086.06       
10       18010.50        18078.91       

Optimal AR order by AIC: 10
Optimal AR order by BIC: 10

Selected: AR(10) model


## 7. Model Selection: AIC vs BIC

In [8]:
# Visualize AIC and BIC
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=ar_orders, y=aic_values,
    mode='lines+markers',
    name='AIC',
    marker=dict(size=10, color='blue')
))

fig.add_trace(go.Scatter(
    x=ar_orders, y=bic_values,
    mode='lines+markers',
    name='BIC',
    marker=dict(size=10, color='red')
))

fig.update_layout(
    title="Model Selection: AIC vs BIC",
    xaxis_title='AR Order (p)',
    yaxis_title='Information Criterion',
    template='plotly_white',
    height=500
)
fig.show()

print("✓ Model comparison visualization complete")

✓ Model comparison visualization complete


## 8. Extract and Interpret AR Coefficients

In [9]:
# Extract AR coefficients from optimal model
params = fit_optimal.params

print(f"AR({optimal_order}) Coefficients:")
print(f"  Constant: {params.iloc[0]:.6f}")

for i in range(1, min(5, optimal_order + 1)):
    coef = params.iloc[i]
    print(f"  φ_{i} (lag {i}): {coef:.6f}")

if optimal_order > 4:
    print(f"  ... ({optimal_order - 4} more lags)")

print(f"\nInterpretation:")
print(f"  - φ_i shows the weight of past {i} periods")
print(f"  - Positive: Positive autocorrelation at that lag")
print(f"  - Negative: Negative autocorrelation at that lag")

AR(10) Coefficients:
  Constant: 0.329609
  φ_1 (lag 1): 0.026960
  φ_2 (lag 2): -0.040927
  φ_3 (lag 3): 0.019882
  φ_4 (lag 4): 0.003339
  ... (6 more lags)

Interpretation:
  - φ_i shows the weight of past 4 periods
  - Positive: Positive autocorrelation at that lag
  - Negative: Negative autocorrelation at that lag


## 9. Generate Forecasts

In [10]:
# Generate forecasts from optimal model
forecast = fit_optimal.forecast(steps=len(test))
forecast_series = pd.Series(forecast.values, index=test.index)

print(f"Forecast horizon: {len(test)} steps")
print(f"First 5 forecasts: {forecast_series.iloc[:5].values}")
print(f"Last 5 forecasts: {forecast_series.iloc[-5:].values}")

# Also generate forecasts from AR(1) and AR(2) for comparison
ar1_fit = models[1]
ar2_fit = models[2]

forecast_ar1 = pd.Series(ar1_fit.forecast(steps=len(test)).values, index=test.index)
forecast_ar2 = pd.Series(ar2_fit.forecast(steps=len(test)).values, index=test.index)

print("✓ Forecasts generated")

Forecast horizon: 556 steps
First 5 forecasts: [ 1.15000776  0.26624115  0.68043216 -0.27332016 -1.10798009]
Last 5 forecasts: [0.31242442 0.31242442 0.31242442 0.31242442 0.31242442]
✓ Forecasts generated
✓ Forecasts generated


## 10. Evaluate Model Performance

In [11]:
# Calculate performance metrics
def evaluate_ar_model(test, forecast, model_name):
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test, forecast)
    
    # Naive forecast (last value)
    naive = pd.Series([train.iloc[-1]] * len(test), index=test.index)
    naive_rmse = np.sqrt(mean_squared_error(test, naive))
    improvement = ((naive_rmse - rmse) / naive_rmse * 100)
    
    print(f"\n{model_name} Performance:")
    print(f"  RMSE: {rmse:.6f}")
    print(f"  MAE: {mae:.6f}")
    print(f"  Naive RMSE: {naive_rmse:.6f}")
    print(f"  Improvement: {improvement:.2f}%")
    
    return {'rmse': rmse, 'mae': mae, 'improvement': improvement}

print("="*70)
print("MODEL PERFORMANCE COMPARISON")
print("="*70)

metrics_ar1 = evaluate_ar_model(test, forecast_ar1, "AR(1)")
metrics_ar2 = evaluate_ar_model(test, forecast_ar2, "AR(2)")
metrics_optimal = evaluate_ar_model(test, forecast_series, f"AR({optimal_order})")

print(f"\n{'Model':<12} {'RMSE':<15} {'MAE':<15} {'Improvement':<12}")
print("-" * 55)
print(f"{'AR(1)':<12} {metrics_ar1['rmse']:<15.6f} {metrics_ar1['mae']:<15.6f} {metrics_ar1['improvement']:<12.2f}%")
print(f"{'AR(2)':<12} {metrics_ar2['rmse']:<15.6f} {metrics_ar2['mae']:<15.6f} {metrics_ar2['improvement']:<12.2f}%")
print(f"{'AR({optimal_order})':<12} {metrics_optimal['rmse']:<15.6f} {metrics_optimal['mae']:<15.6f} {metrics_optimal['improvement']:<12.2f}%")

MODEL PERFORMANCE COMPARISON

AR(1) Performance:
  RMSE: 35.600897
  MAE: 24.455585
  Naive RMSE: 39.865556
  Improvement: 10.70%

AR(2) Performance:
  RMSE: 35.600979
  MAE: 24.454275
  Naive RMSE: 39.865556
  Improvement: 10.70%

AR(10) Performance:
  RMSE: 35.604593
  MAE: 24.461609
  Naive RMSE: 39.865556
  Improvement: 10.69%

Model        RMSE            MAE             Improvement 
-------------------------------------------------------
AR(1)        35.600897       24.455585       10.70       %
AR(2)        35.600979       24.454275       10.70       %
AR({optimal_order}) 35.604593       24.461609       10.69       %


## 11. Visualize AR Forecasts

In [12]:
# Create forecast comparison plot
fig = go.Figure()

# Training data
fig.add_trace(go.Scatter(
    x=train.index, y=train.values,
    mode='lines',
    name='Training Data',
    line=dict(color='blue', width=1)
))

# Test data (actual)
fig.add_trace(go.Scatter(
    x=test.index, y=test.values,
    mode='lines',
    name='Test Data (Actual)',
    line=dict(color='green', width=2)
))

# AR(1) forecast
fig.add_trace(go.Scatter(
    x=test.index, y=forecast_ar1.values,
    mode='lines',
    name='AR(1) Forecast',
    line=dict(color='orange', width=2, dash='dash')
))

# AR(2) forecast
fig.add_trace(go.Scatter(
    x=test.index, y=forecast_ar2.values,
    mode='lines',
    name='AR(2) Forecast',
    line=dict(color='purple', width=2, dash='dot')
))

# Optimal AR forecast
fig.add_trace(go.Scatter(
    x=test.index, y=forecast_series.values,
    mode='lines',
    name=f'AR({optimal_order}) Forecast',
    line=dict(color='red', width=2, dash='dash')
))

fig.update_layout(
    title="AR Models: Forecast Comparison",
    xaxis_title='Date',
    yaxis_title='Price Change ($)',
    hovermode='x unified',
    template='plotly_white',
    height=500
)
fig.show()

print("✓ Forecast comparison plotted")

✓ Forecast comparison plotted


## 12. Residual Diagnostics

In [13]:
# Residual diagnostics
residuals_optimal = test - forecast_series

# Create residual plot
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Residuals Over Time', 'Histogram', 'ACF of Residuals', 'Scatter: Actual vs Forecast'),
    specs=[[{'secondary_y': False}, {'secondary_y': False}],
           [{'secondary_y': False}, {'secondary_y': False}]]
)

# Residuals over time
fig.add_trace(
    go.Scatter(x=test.index, y=residuals_optimal, mode='lines+markers', name='Residuals'),
    row=1, col=1
)
fig.add_hline(y=0, line_dash='dash', line_color='red', row=1, col=1)

# Histogram
fig.add_trace(
    go.Histogram(x=residuals_optimal, name='Distribution', nbinsx=20),
    row=1, col=2
)

# ACF of residuals
residuals_acf = acf(residuals_optimal, nlags=20)
fig.add_trace(
    go.Bar(x=list(range(len(residuals_acf))), y=residuals_acf, name='ACF'),
    row=2, col=1
)

# Actual vs Forecast scatter
fig.add_trace(
    go.Scatter(x=test.values, y=forecast_series.values, mode='markers', name='Predictions', marker=dict(size=6)),
    row=2, col=2
)

fig.update_layout(height=700, showlegend=True, title_text=f"Residual Diagnostics - AR({optimal_order})")
fig.show()

print(f"\nResidual Statistics (AR({optimal_order})):")
print(f"  Mean: {residuals_optimal.mean():.6f}")
print(f"  Std Dev: {residuals_optimal.std():.6f}")
print("✓ Residual diagnostics complete")


Residual Statistics (AR(10)):
  Mean: 4.257994
  Std Dev: 35.380898
✓ Residual diagnostics complete


## 13. Key Insights and Summary

In [15]:
print("="*70)
print("KEY INSIGHTS: AUTOREGRESSIVE (AR) MODELS")
print("="*70)

print(f"\nOptimal AR({optimal_order}) by AIC: {optimal_order_aic}, by BIC: {optimal_order_bic}")
print(f"\nPerformance Ranking:")
print(f"  1. AR({optimal_order}): RMSE = {metrics_optimal['rmse']:.6f}")
print(f"  2. AR(2): RMSE = {metrics_ar2['rmse']:.6f}")
print(f"  3. AR(1): RMSE = {metrics_ar1['rmse']:.6f}")

print(f"\nAR Model Fundamentals:")
print(f"  - Current value depends on p past values")
print(f"  - Requires: Stationary series")
print(f"  - Solution: First differencing for non-stationary data")

print(f"\nWhen to Use AR Models:")
print(f"  ✓ Stationary time series")
print(f"  ✓ Recent history strongly affects future")
print(f"  ✗ Trending series (need differencing)")
print(f"  ✗ Strong seasonality (need SARIMA)")

KEY INSIGHTS: AUTOREGRESSIVE (AR) MODELS

Optimal AR(10) by AIC: 10, by BIC: 10

Performance Ranking:
  1. AR(10): RMSE = 35.604593
  2. AR(2): RMSE = 35.600979
  3. AR(1): RMSE = 35.600897

AR Model Fundamentals:
  - Current value depends on p past values
  - Requires: Stationary series
  - Solution: First differencing for non-stationary data

When to Use AR Models:
  ✓ Stationary time series
  ✓ Recent history strongly affects future
  ✗ Trending series (need differencing)
  ✗ Strong seasonality (need SARIMA)
