In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Function to calculate Newey-West variance
def newey_west_variance(differences, lag=1):
    """
    Estimate the Newey-West (HAC) variance of the mean of a series.

    The series is regressed on a constant only, so the single fitted
    coefficient is the sample mean of ``differences``; the square of its
    HAC standard error is returned.

    Parameters
    ----------
    differences : array-like
        One-dimensional series of values (list, NumPy array or pandas Series).
    lag : int, default 1
        Passed as ``maxlags`` to the HAC covariance estimator.

    Returns
    -------
    float
        Squared HAC standard error of the constant term.
    """
    X = np.ones(len(differences))  # Constant term
    ols_model = sm.OLS(differences, X).fit(cov_type='HAC', cov_kwds={'maxlags': lag})
    # When `differences` is a pandas Series, `bse` comes back as a label-indexed
    # Series, and `bse[0]` then depends on the deprecated positional fallback of
    # Series.__getitem__. Converting to an ndarray gives plain positional access
    # whether `bse` is an array or a Series.
    constant_se = np.asarray(ols_model.bse)[0]
    return constant_se ** 2  # Variance (squared standard error of the constant term)

# Function to compute DM test statistic
def dm_test_statistic(errors1, errors2, lag=1):
    """
    Computes the Diebold-Mariano test statistic between two sets of errors.

    The loss differential is ``errors1**2 - errors2**2``. The statistic is the
    mean of that differential divided by the square root of its Newey-West
    variance, so a negative value means the first set of errors has the
    smaller mean squared error.

    Parameters
    ----------
    errors1, errors2 : array-like
        Forecast errors of the two models, paired observation by observation.
        Both must have the same shape.
    lag : int, default 1
        Lag count forwarded to ``newey_west_variance``.

    Returns
    -------
    float
        The DM statistic.

    Raises
    ------
    ValueError
        If the two error series do not have the same shape.
    """
    # Work on plain float arrays: subtracting two pandas Series aligns them on
    # their index, so series of different length would silently yield NaN
    # entries instead of failing.
    first = np.asarray(errors1, dtype=float)
    second = np.asarray(errors2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            f"Error series must have the same shape, got {first.shape} and {second.shape}"
        )

    squared_diff = np.square(first) - np.square(second)
    mean_diff = np.mean(squared_diff)
    nw_variance = newey_west_variance(squared_diff, lag)
    dm_statistic = mean_diff / np.sqrt(nw_variance)
    return dm_statistic

# Define ticker groups; each group is evaluated separately and identified on
# disk by its lower-cased tickers joined with "_"
ticker_groups = [
    ['AMZN'],
    ['TSLA'],
    ['META', 'MSFT', 'NVDA', 'AAPL']
]

# Base file path for loading model errors
# NOTE(review): machine-specific absolute path; consider deriving it from a
# configurable project directory so the notebook runs elsewhere.
base_path = '/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Notebooks/Notebook_modeling/db_performance_evaluation/{model}_{group}.csv'

# Loop through each group
for group in ticker_groups:
    group_name = "_".join(group).lower()
    print(f"Processing group: {group_name}")

    # Load the daily errors of every model whose file exists for this group
    model_errors = {}
    for model in ["ols", "enet", "glm", "rf", "xgb", "nn"]:  # Modify this list if needed
        file_path = base_path.format(model=model, group=group_name)
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"File not found for model {model.upper()} and group {group_name}. Skipping...")
            continue
        model_errors[model.upper()] = df['Daily Avg Errors']

    # A pairwise comparison needs at least two models; without this guard an
    # empty or all-NaN table would be written to disk.
    if len(model_errors) < 2:
        print(f"Fewer than two models available for group {group_name}. Skipping DM tests...")
        continue

    # Float table (NaN on the diagonal) so values are stored and printed as
    # numbers rather than generic Python objects
    model_names = list(model_errors)
    dm_table = pd.DataFrame(index=model_names, columns=model_names, dtype=float)

    # Perform pairwise DM tests.
    # Cell [row, column] holds the statistic of (column errors)^2 - (row errors)^2,
    # so a negative entry means the column model has the smaller squared error.
    for model1_name, errors1 in model_errors.items():
        for model2_name, errors2 in model_errors.items():
            if model1_name == model2_name:
                continue
            dm_statistic = dm_test_statistic(errors1, errors2)
            dm_table.loc[model2_name, model1_name] = round(dm_statistic, 2)

    # Print the DM test table for the current group
    print(f"DM Test Table for Group {group_name}:\n")
    print(dm_table, "\n")

    # Save the DM test table to a CSV file for the current group
    output_path = f'/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Notebooks/performence_evaluation/dm_test_results_{group_name}.csv'
    dm_table.to_csv(output_path)
    print(f"DM test results saved to {output_path}\n")


Processing group: amzn
DM Test Table for Group amzn:

       OLS  ENET   GLM    RF   XGB    NN
OLS    NaN  1.45 -6.79 -6.49 -6.77 -6.43
ENET -1.45   NaN -6.79 -6.49 -6.77 -6.43
GLM   6.79  6.79   NaN -3.42  -4.2 -4.22
RF    6.49  6.49  3.42   NaN -4.15 -2.87
XGB   6.77  6.77   4.2  4.15   NaN -1.41
NN    6.43  6.43  4.22  2.87  1.41   NaN 

DM test results saved to /Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Notebooks/performence_evaluation/dm_test_results_amzn.csv

Processing group: tsla
DM Test Table for Group tsla:

       OLS  ENET   GLM    RF   XGB    NN
OLS    NaN  2.14 -4.08 -6.54  -5.8 -7.29
ENET -2.14   NaN -4.09 -6.54  -5.8 -7.29
GLM   4.08  4.09   NaN -5.83 -4.12 -6.75
RF    6.54  6.54  5.83   NaN  2.43 -1.89
XGB    5.8   5.8  4.12 -2.43   NaN  -2.9
NN    7.29  7.29  6.75  1.89   2.9   NaN 

DM test results saved to /Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Notebooks/performence_evaluation/dm_test_results_tsla.csv

Processing group: meta_msft_nvda_aapl
DM

  return ols_model.bse[0]**2  # Variance (squared standard error of the constant term)
  return ols_model.bse[0]**2  # Variance (squared standard error of the constant term)
  return ols_model.bse[0]**2  # Variance (squared standard error of the constant term)


In [5]:
dm_table

Unnamed: 0,OLS,ENET,GLM,RF,NN
OLS,,0.54,-4.7,-6.48,-6.12
ENET,-0.54,,-4.71,-6.48,-6.12
GLM,4.7,4.71,,0.82,-6.23
RF,6.48,6.48,-0.82,,-3.76
NN,6.12,6.12,6.23,3.76,


# DM test comparison: full vs. reduced models


In [28]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Function to calculate Newey-West variance
def newey_west_variance(differences, lag=1):
    """
    Return the Newey-West (HAC) variance of the mean of ``differences``.

    A constant-only OLS regression is fitted, so the single coefficient is the
    sample mean of the series; the function returns the square of that
    coefficient's HAC standard error.

    NOTE(review): a function with the same name is also defined in an earlier
    cell of this notebook; consider moving it to a shared module.

    Parameters
    ----------
    differences : array-like
        One-dimensional series of values (list, NumPy array or pandas Series).
    lag : int, default 1
        Passed as ``maxlags`` to the HAC covariance estimator.

    Returns
    -------
    float
        Squared HAC standard error of the constant term.
    """
    X = np.ones(len(differences))  # Constant term
    ols_model = sm.OLS(differences, X).fit(cov_type='HAC', cov_kwds={'maxlags': lag})
    # `bse` is a label-indexed pandas Series when the input is a Series, and
    # `bse[0]` would then rely on the deprecated positional fallback of
    # Series.__getitem__. Go through an ndarray for positional access instead.
    standard_errors = np.asarray(ols_model.bse)
    return standard_errors[0] ** 2  # Variance (squared standard error of the constant term)

# Function to compute DM test statistic
def dm_test_statistic(errors1, errors2, lag=1):
    """
    Computes the Diebold-Mariano test statistic between two sets of errors.

    Uses squared-error loss: the differential is ``errors1**2 - errors2**2``,
    and the statistic is its mean divided by the square root of its
    Newey-West variance. A negative value therefore means ``errors1`` has the
    smaller mean squared error.

    NOTE(review): a function with the same name is also defined in an earlier
    cell of this notebook; consider moving it to a shared module.

    Parameters
    ----------
    errors1, errors2 : array-like
        Forecast errors of the two models, paired observation by observation.
        Both must have the same shape.
    lag : int, default 1
        Lag count forwarded to ``newey_west_variance``.

    Returns
    -------
    float
        The DM statistic.

    Raises
    ------
    ValueError
        If the two error series do not have the same shape.
    """
    # pandas aligns Series on their index when subtracting, which turns a
    # length mismatch into silent NaN entries; compare positionally on float
    # arrays and fail loudly instead.
    errors_a = np.asarray(errors1, dtype=float)
    errors_b = np.asarray(errors2, dtype=float)
    if errors_a.shape != errors_b.shape:
        raise ValueError(
            f"Error series must have the same shape, got {errors_a.shape} and {errors_b.shape}"
        )

    squared_diff = np.square(errors_a) - np.square(errors_b)
    mean_diff = np.mean(squared_diff)
    nw_variance = newey_west_variance(squared_diff, lag)
    return mean_diff / np.sqrt(nw_variance)

# Directory that holds one error file per model specification
base_path = '/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Notebooks/Notebook_modeling/db_performance_evaluation/'

# Reduced-model error file for each display name
reduced_model_files = dict([
    ("OLS", "ols.csv"),
    ("ENet", "enet.csv"),
    ("GLM", "glm.csv"),
    ("XGBoost", "xgb.csv"),
    ("RF", "rf.csv"),
    ("NN", "nn.csv"),
])

# Full-model files share the reduced file name with "_all" before the extension
full_model_files = dict(
    (label, fname.replace('.csv', '_all.csv'))
    for label, fname in reduced_model_files.items()
)


def _read_daily_errors(file_map):
    """Read the 'Daily Avg Errors' column of every file in ``file_map``, keyed by model name."""
    loaded = {}
    for label, fname in file_map.items():
        loaded[label] = pd.read_csv(base_path + fname)['Daily Avg Errors']
    return loaded


# Error series for the reduced and the full specification of every model
reduced_models = _read_daily_errors(reduced_model_files)
full_models = _read_daily_errors(full_model_files)

# One DM statistic per model: reduced errors are the first argument, full errors the second
dm_results = {
    label: dm_test_statistic(reduced_models[label], full_models[label])
    for label in reduced_models
}

# Tabulate the statistics (one row per model) and display them as the cell output
dm_results_df = pd.DataFrame.from_dict(dm_results, orient='index', columns=["DM Statistic"])
dm_results_df


  return ols_model.bse[0]**2  # Variance (squared standard error of the constant term)


Unnamed: 0,DM Statistic
OLS,2.163842
ENet,2.931819
GLM,-15.500484
XGBoost,-1.403957
RF,-6.717921
NN,-3.365817
