# Granger causality analysis for LSTM

For the LSTM, all lags up to a chosen maximum lag $n$ have to be considered together, so instead of determining Granger causality for each lag individually, we only test all $n$ lags jointly.

In [2]:
import os 
import numpy as np
import pandas as pd

from granger_causality_functions import get_relevant_lags

In [3]:
# Threshold passed as `conf=` to get_relevant_lags in the "Determine relevant lags" cells.
# NOTE(review): 0.05 looks like a significance level (p-value cutoff) rather than a
# confidence level, despite the name — confirm against granger_causality_functions.
conf_level = 0.05

## Bitcoin data

In [4]:
# Read the three Bitcoin parquet files (numeric data, text data, targets) in one pass;
# the order of the paths matches the order of the names on the left-hand side.
btc_numeric_data, btc_nlp_data, btc_targets = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../2_data_processing/numeric_data/btc_numeric_stationary_data.parquet.gzip',
        '../3_nlp_models/4_processing/btc_stationary_text_data.parquet.gzip',
        '../2_data_processing/numeric_data/btc_targets.parquet.gzip',
    )
)

### Crop data

Remove the first and last few days, where the data is inconsistent.

In [5]:
# Join the numeric and text data column-wise, then keep only the rows whose index
# label lies between the two bounds (label slicing with .loc includes both ends).
# NOTE(review): the bounds look like Unix epoch seconds (2011-08-30 and 2023-03-14 UTC)
# — confirm the unit of the index.
btc_data = pd.concat([btc_numeric_data, btc_nlp_data], axis=1).loc[1314662400:1678752000]

# Forward-fill gaps, set whatever is still missing (values before a column's first
# observation) to 0, and finally replace +/-inf with 0.
# `.ffill()` is used instead of `fillna(method='ffill')`, which pandas has deprecated.
btc_data_filled = btc_data.ffill().fillna(0).replace([np.inf, -np.inf], 0)

### Determine relevant lags

In [6]:
# Write, for every Bitcoin target, a text file listing the variables whose lags are
# flagged as relevant by get_relevant_lags (one sorted variable name per line).
directory = './btc_lstm_causality/'
# exist_ok=True makes the separate os.path.exists check unnecessary.
os.makedirs(directory, exist_ok=True)

maxlag = 14
# NOTE(review): the original filter used the literal 13, presumably the 0-based
# position of lag `maxlag` (i.e. "all 14 lags") — confirm against get_relevant_lags.
full_lag = maxlag - 1

for target in btc_targets.columns:
    data = pd.concat([btc_data_filled, btc_targets[target]], axis=1)
    lags = get_relevant_lags(data, target, maxlag=maxlag, conf=conf_level)
    # Check each variable's OWN relevant lags. The previous filter,
    # `13 in lags.relevant_lags`, never looked at the current row: `in` on a pandas
    # Series tests the index labels, so the answer was identical for every variable.
    # Assumes each `relevant_lags` entry is a collection of lag indices — TODO confirm.
    causal_vars = [
        variable
        for variable, relevant in zip(lags.variable, lags.relevant_lags)
        if full_lag in relevant
    ]
    with open(os.path.join(directory, target + '_causality.txt'), 'w') as f:
        f.write('\n'.join(sorted(causal_vars)))

100%|██████████| 141/141 [00:20<00:00,  7.00it/s]
100%|██████████| 141/141 [00:17<00:00,  8.19it/s]
100%|██████████| 141/141 [00:17<00:00,  8.26it/s]
100%|██████████| 141/141 [00:15<00:00,  9.15it/s]
100%|██████████| 141/141 [00:15<00:00,  9.17it/s]
100%|██████████| 141/141 [00:15<00:00,  8.99it/s]
100%|██████████| 141/141 [00:15<00:00,  8.87it/s]


## Ethereum data

In [7]:
# Read the three Ethereum parquet files (numeric data, text data, targets) in one pass;
# the order of the paths matches the order of the names on the left-hand side.
eth_numeric_data, eth_nlp_data, eth_targets = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../2_data_processing/numeric_data/eth_numeric_stationary_data.parquet.gzip',
        '../3_nlp_models/4_processing/eth_stationary_text_data.parquet.gzip',
        '../2_data_processing/numeric_data/eth_targets.parquet.gzip',
    )
)

### Crop data

Remove the first and last few days, where the data is inconsistent.

In [8]:
# Join the numeric and text data column-wise, then keep only the rows whose index
# label lies between the two bounds (label slicing with .loc includes both ends).
# NOTE(review): the bounds look like Unix epoch seconds (2015-10-22 and 2023-03-15 UTC)
# — confirm the unit of the index.
eth_data = pd.concat([eth_numeric_data, eth_nlp_data], axis=1).loc[1445472000:1678838400]

# Forward-fill gaps, set whatever is still missing (values before a column's first
# observation) to 0, and finally replace +/-inf with 0.
# `.ffill()` is used instead of `fillna(method='ffill')`, which pandas has deprecated.
eth_data_filled = eth_data.ffill().fillna(0).replace([np.inf, -np.inf], 0)

### Determine relevant lags

In [10]:
# Write, for every Ethereum target, a text file listing the variables whose lags are
# flagged as relevant by get_relevant_lags (one sorted variable name per line).
directory = './eth_lstm_causality/'
# exist_ok=True makes the separate os.path.exists check unnecessary.
os.makedirs(directory, exist_ok=True)

maxlag = 14
# NOTE(review): the original filter used the literal 13, presumably the 0-based
# position of lag `maxlag` (i.e. "all 14 lags") — confirm against get_relevant_lags.
full_lag = maxlag - 1

for target in eth_targets.columns:
    data = pd.concat([eth_data_filled, eth_targets[target]], axis=1)
    lags = get_relevant_lags(data, target, maxlag=maxlag, conf=conf_level)
    # Check each variable's OWN relevant lags. The previous filter,
    # `13 in lags.relevant_lags`, never looked at the current row: `in` on a pandas
    # Series tests the index labels, so the answer was identical for every variable.
    # Assumes each `relevant_lags` entry is a collection of lag indices — TODO confirm.
    causal_vars = [
        variable
        for variable, relevant in zip(lags.variable, lags.relevant_lags)
        if full_lag in relevant
    ]
    with open(os.path.join(directory, target + '_causality.txt'), 'w') as f:
        f.write('\n'.join(sorted(causal_vars)))

100%|██████████| 119/119 [00:13<00:00,  8.60it/s]
100%|██████████| 119/119 [00:10<00:00, 10.97it/s]
100%|██████████| 119/119 [00:11<00:00, 10.18it/s]
100%|██████████| 119/119 [00:10<00:00, 11.16it/s]
100%|██████████| 119/119 [00:10<00:00, 11.04it/s]
100%|██████████| 119/119 [00:10<00:00, 10.87it/s]
100%|██████████| 119/119 [00:10<00:00, 11.19it/s]
