# Granger causality analysis for LSTM

The LSTM takes all lags up to a chosen maximum lag as input, so instead of determining Granger causality for each lag individually, we only test the full set of $n$ lags.

In [None]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from granger_causality_functions import get_relevant_lags

In [None]:
# Threshold handed to get_relevant_lags as its `conf` argument in the
# "Determine relevant lags" cells.
# NOTE(review): 0.05 presumably acts as a significance level (p-value cut-off)
# rather than a confidence level -- confirm in granger_causality_functions.
conf_level = 0.05

## Bitcoin data

In [None]:
# Load the three Bitcoin inputs in one pass: the numeric features, the text
# (NLP) features and the prediction targets, in that order.
btc_numeric_data, btc_nlp_data, btc_targets = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../2_data_processing/numeric_data/btc_numeric_stationary_data.parquet.gzip',
        '../3_nlp_models/4_processing/btc_stationary_text_data.parquet.gzip',
        '../2_data_processing/numeric_data/btc_targets.parquet.gzip',
    )
)

### Crop data

Remove the first and last few days, where the data is inconsistent.

In [None]:
# Join the numeric and NLP features column-wise, then keep only the rows whose
# index label lies between the two bounds (.loc label slices include both
# endpoints).
# NOTE(review): the bounds look like Unix timestamps in seconds
# (2011-08-30 and 2023-03-14 UTC if so) -- confirm against the index.
btc_data = pd.concat([btc_numeric_data, btc_nlp_data], axis=1).loc[1314662400:1678752000]

# Forward-fill gaps, zero whatever is still missing (values with nothing
# before them to carry forward), and finally zero out +/-inf.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
btc_data_filled = btc_data.ffill().fillna(0).replace([np.inf, -np.inf], 0)

### Determine relevant lags

In [6]:
directory = './btc_lstm_causality/'
# exist_ok=True replaces the separate os.path.exists check, which could race
# with another process creating the directory.
os.makedirs(directory, exist_ok=True)

for target in btc_targets.columns:
    # Features plus the single target column currently under test.
    data = pd.concat([btc_data_filled, btc_targets[target]], axis=1)
    lags = get_relevant_lags(data, target, maxlag=14, conf=conf_level)

    # Keep a variable only when lag 13 is among *its own* relevant lags.
    # The former condition `13 in lags.relevant_lags` did not depend on the
    # variable at all: `in` on a pandas Series tests the index labels, so it
    # selected either every variable or none.
    # NOTE(review): assumes each `relevant_lags` entry is a collection of lag
    # numbers, and that 13 (not 14) is the intended lag for maxlag=14 --
    # confirm against get_relevant_lags.
    selected_variables = [
        variable
        for variable, relevant in zip(lags.variable, lags.relevant_lags)
        if 13 in relevant
    ]

    # One variable name per line, in sorted order.
    with open(directory + target + '_causality.txt', 'w') as f:
        f.write('\n'.join(sorted(selected_variables)))

100%|██████████| 138/138 [00:22<00:00,  6.02it/s]
100%|██████████| 138/138 [00:24<00:00,  5.71it/s]
100%|██████████| 138/138 [00:18<00:00,  7.40it/s]
100%|██████████| 138/138 [00:17<00:00,  7.80it/s]
100%|██████████| 138/138 [00:17<00:00,  7.78it/s]
100%|██████████| 138/138 [00:17<00:00,  7.93it/s]
100%|██████████| 138/138 [00:17<00:00,  7.79it/s]


## Ethereum data

In [7]:
# Load the three Ethereum inputs in one pass: the numeric features, the text
# (NLP) features and the prediction targets, in that order.
eth_numeric_data, eth_nlp_data, eth_targets = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../2_data_processing/numeric_data/eth_numeric_stationary_data.parquet.gzip',
        '../3_nlp_models/4_processing/eth_stationary_text_data.parquet.gzip',
        '../2_data_processing/numeric_data/eth_targets.parquet.gzip',
    )
)

### Crop data

Remove the first and last few days, where the data is inconsistent.

In [35]:
# Join the numeric and NLP features column-wise, then keep only the rows whose
# index label lies between the two bounds (.loc label slices include both
# endpoints).
# NOTE(review): the bounds look like Unix timestamps in seconds
# (2015-10-22 and 2023-03-15 UTC if so) -- confirm against the index.
eth_data = pd.concat([eth_numeric_data, eth_nlp_data], axis=1).loc[1445472000:1678838400]

# Forward-fill gaps, zero whatever is still missing (values with nothing
# before them to carry forward), and finally zero out +/-inf.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
eth_data_filled = eth_data.ffill().fillna(0).replace([np.inf, -np.inf], 0)

### Determine relevant lags

In [38]:
directory = './eth_lstm_causality/'
# exist_ok=True replaces the separate os.path.exists check, which could race
# with another process creating the directory.
os.makedirs(directory, exist_ok=True)

for target in eth_targets.columns:
    # Features plus the single target column currently under test.
    data = pd.concat([eth_data_filled, eth_targets[target]], axis=1)
    lags = get_relevant_lags(data, target, maxlag=14, conf=conf_level)

    # Keep a variable only when lag 13 is among *its own* relevant lags.
    # The former condition `13 in lags.relevant_lags` did not depend on the
    # variable at all: `in` on a pandas Series tests the index labels, so it
    # selected either every variable or none.
    # NOTE(review): assumes each `relevant_lags` entry is a collection of lag
    # numbers, and that 13 (not 14) is the intended lag for maxlag=14 --
    # confirm against get_relevant_lags.
    selected_variables = [
        variable
        for variable, relevant in zip(lags.variable, lags.relevant_lags)
        if 13 in relevant
    ]

    # One variable name per line, in sorted order.
    with open(directory + target + '_causality.txt', 'w') as f:
        f.write('\n'.join(sorted(selected_variables)))

100%|██████████| 116/116 [00:10<00:00, 10.76it/s]
100%|██████████| 116/116 [00:10<00:00, 11.43it/s]
100%|██████████| 116/116 [00:09<00:00, 11.92it/s]
100%|██████████| 116/116 [00:09<00:00, 12.18it/s]
100%|██████████| 116/116 [00:12<00:00,  8.93it/s]
100%|██████████| 116/116 [00:16<00:00,  7.10it/s]
100%|██████████| 116/116 [00:13<00:00,  8.45it/s]
