In [16]:
file_path = './datasets/cw2024AP.xlsx'

In [25]:
import pandas as pd
import numpy as np
from datetime import datetime

In [26]:
data = pd.read_excel(file_path)

data['Dates'] = pd.to_datetime(data['Dates']) 

data['Dates'] = pd.to_datetime(data['Dates']) 
data.set_index('Dates', inplace=True)

data['FTSE Log Returns'] = data['FTSE Log Returns'].replace([np.inf, -np.inf], np.nan)
data.dropna(subset=['FTSE Log Returns'], inplace=True)

data['Scaled Returns'] = data['FTSE Log Returns'] * 100

# Create 1-month lag for NASDAQ Returns
data['NASDAQ Returns Lagged'] = data['NASDAQ Returns'].shift(1)

required_columns = ['NASDAQ Returns Lagged', 'interest rate and m2 interaction']
if not all(col in data.columns for col in required_columns):
    raise ValueError("Missing one or more required columns for exogenous variables.")

data.dropna(subset=required_columns, inplace=True)

# Split Data
data.dropna(subset=required_columns + ['Scaled Returns'], inplace=True)

train_data = data[data.index <= datetime(2014, 10, 31)]['Scaled Returns']
test_data = data[data.index > datetime(2014, 10, 31)]['Scaled Returns']
exog_vars_mean = data[required_columns]
exog_train_mean = exog_vars_mean.loc[train_data.index]
exog_test_mean = exog_vars_mean.loc[test_data.index]

# Print Summary for Verification
print("Data Summary:")
print(data.describe())

print("\nMissing Values:")
print(data.isnull().sum())

print("\nTraining Data:")
print(train_data.head())

print("\nTesting Data:")
print(test_data.head())

KeyError: 'FTSE Log Returns'

In [27]:
data

Unnamed: 0_level_0,ASTRAZENECA,RIO TINTO,BP,BRITISH AMERICAN TOBACCO,DIAGEO,ROLLS-ROYCE HOLDINGS,COMPASS GROUP,TESCO,BAE SYSTEMS,VODAFONE GROUP,...,KINGFISHER,PEARSON,SAGE GROUP,TAYLOR WIMPEY,LAND SECURITIES GROUP,NATIONAL GRID,Unnamed: 24,FTSE ALL SHARE - PRICE INDEX,Unnamed: 26,UK GVT BMK BID YLD 1M - RED. YIELD
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995-12-31,1215.37,773.23,269.50,331.94,478.62,44.12,,98.85,199.25,47.21,...,206.46,557.14,32.51,89.19,555.61,165.17,,1803.09,,6.400
1996-01-31,1257.32,757.54,264.25,341.89,460.44,47.62,,99.35,223.00,48.64,...,203.80,594.64,34.01,107.41,561.91,166.41,,1841.96,,6.110
1996-02-29,1226.10,752.58,269.75,334.28,463.47,48.32,,87.70,217.50,47.41,...,197.70,610.71,33.91,111.20,552.91,161.86,,1840.77,,6.100
1996-03-29,1326.57,783.97,286.75,282.81,480.64,50.31,,88.70,214.75,49.66,...,216.75,597.32,34.61,116.14,563.71,161.24,,1843.44,,5.910
1996-04-30,1358.76,863.28,299.75,293.92,483.16,55.33,,93.36,217.75,54.37,...,226.65,630.36,45.91,132.08,589.83,169.72,,1914.61,,5.910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-28,12356.00,5201.00,475.20,2430.00,2489.50,456.80,2160.0,306.00,1320.00,69.76,...,248.80,990.80,1088.50,142.15,619.50,882.60,,4451.92,,5.296
2024-07-31,12368.00,5026.00,458.85,2745.00,2419.00,449.60,2396.0,331.60,1297.00,72.44,...,276.30,1056.00,1086.00,159.35,635.50,986.00,,4588.31,,5.143
2024-08-30,13274.00,4770.00,429.40,2836.00,2472.50,496.40,2399.0,353.70,1363.00,74.42,...,284.20,1056.00,1011.00,161.05,629.50,998.60,,4576.73,,5.051
2024-09-30,11588.00,5299.00,391.70,2724.00,2603.00,527.20,2394.0,358.70,1237.50,75.04,...,322.00,1013.50,1024.50,164.30,651.00,1030.00,,4511.00,,5.020


In [28]:
# Stationarity Check (ADF Test)
from statsmodels.tsa.stattools import adfuller

adf_test = adfuller(data['Scaled Returns'])
print(f"ADF Statistic: {adf_test[0]}")
print(f"p-value: {adf_test[1]}")

if adf_test[1] < 0.05:
    print("The data is stationary (reject null hypothesis).")
else:
    print("The data is non-stationary (fail to reject null hypothesis).")

# Volatility Clustering    
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 7))
plt.plot(data['Scaled Returns'], label="Scaled Returns")
plt.title("Time Series of Scaled Returns")
plt.xlabel("Date")
plt.ylabel("Returns")
plt.legend()
plt.grid()
plt.show()

# Autocorrelation in Squared Returns
from statsmodels.graphics.tsaplots import plot_acf

squared_returns = data['Scaled Returns']**2
plot_acf(squared_returns, lags=20)
plt.title("ACF of Squared Returns")
plt.show()


# Presence of conditional heteroskedasticity
from statsmodels.stats.diagnostic import het_arch

arch_test = het_arch(data['Scaled Returns'])
print(f"ARCH Test Statistic: {arch_test[0]}")
print(f"p-value: {arch_test[1]}")

if arch_test[1] < 0.05:
    print("Significant ARCH effects detected (reject null hypothesis).")
else:
    print("No significant ARCH effects detected (fail to reject null hypothesis).")
    
    
# K & S and Distribution of returns - Check if returns exhibit heavy tails (leptokurtosis)
from scipy.stats import kurtosis, skew

kurt = kurtosis(data['Scaled Returns'])
skewness = skew(data['Scaled Returns'])
print(f"Kurtosis: {kurt}")
print(f"Skewness: {skewness}")

plt.figure(figsize=(10, 5))
plt.hist(data['Scaled Returns'], bins=50, density=True, alpha=0.7, label="Returns")
plt.title("Distribution of Returns")
plt.xlabel("Returns")
plt.ylabel("Density")
plt.grid()
plt.legend()
plt.show()

# checks for autocorrelation in the residuals of a mean model
from statsmodels.stats.diagnostic import acorr_ljungbox

ljung_box = acorr_ljungbox(data['Scaled Returns'], lags=[10], return_df=True)
print(ljung_box)


KeyError: 'Scaled Returns'

In [29]:
threshold = data['Scaled Returns'].std() * 2  # 2 standard deviations

high_volatility = (data['Scaled Returns'].abs() > threshold)

# Plot the time series
plt.figure(figsize=(12, 6))
plt.plot(data.index, data['Scaled Returns'], label="Scaled Returns", color='blue')

# Highlight clusters
for start, end in zip(data.index[:-1], data.index[1:]):
    if high_volatility[start]:
        plt.axvspan(start, end, color='red', alpha=0.2, label='High Volatility' if 'High Volatility' not in plt.gca().get_legend_handles_labels()[1] else None)

# Add title, labels, and legend
plt.title("Time Series of Scaled Returns Volatility Clusters", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Returns", fontsize=12)
plt.legend(fontsize=12)
plt.grid(alpha=0.5)
plt.tight_layout()

# Save the plot
#plt.savefig("time_series_scaled_returns_vol.png", dpi=300)

# Show the plot
#plt.show()

KeyError: 'Scaled Returns'

In [30]:
from arch.univariate import HARX, EGARCH, StudentsT, Normal

model = HARX(y=train_data, x=exog_train_mean, lags=1)  # lags=0 indicates no autoregressive terms in mean
model.volatility = EGARCH(p=1, o=1, q=1)
model.distribution = Normal()  # Use the Student's t-distribution

# Fit the model
egarch_fit = model.fit(disp="off")

# Print summary
print(egarch_fit.summary())

NameError: name 'train_data' is not defined