In [36]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import grangercausalitytests

In [37]:
# Identifier / target columns of the binned datasets.
vital_columns = ["date_", "ticker", "close", "diffs", "bin_2", "bin_3", "bin_5"]

# Candidate predictor columns: price/volume fields, then one column per
# (statistic, sentiment score) pair, then the "count" column.
cols = [
    "open",
    "high",
    "low",
    "vol",
    *(
        f"{stat}_{score}"
        for stat in ("max", "min", "std", "mean", "median")
        for score in ("comp", "neg", "neu", "pos")
        # "min_neg" is the only combination absent from this list.
        # NOTE(review): confirm the omission is intentional.
        if (stat, score) != ("min", "neg")
    ),
    "count",
]

In [38]:
def is_stationary(ts, alpha=0.05):
    """Report whether the Augmented Dickey-Fuller test deems ``ts`` stationary.

    Parameters
    ----------
    ts : array-like
        One-dimensional series, passed unchanged to ``adfuller``.
    alpha : float, default 0.05
        Significance level. The series counts as stationary when the ADF
        p-value is at or below this threshold.

    Returns
    -------
    bool
        ``True`` when the ADF p-value is ``<= alpha``, ``False`` otherwise.
    """
    # The second element of adfuller's result is the test's p-value.
    p_val = adfuller(ts)[1]
    # Cast so callers get a plain bool rather than a NumPy scalar.
    return bool(p_val <= alpha)


def is_relevant(granger_result, lag=3, alpha=0.05):
    """Decide whether a Granger-causality result is significant at ``lag``.

    The p-values of the four tests reported for the given lag are averaged
    and the mean is compared against ``alpha``.

    Parameters
    ----------
    granger_result : dict
        Mapping as returned by ``grangercausalitytests``: ``lag -> (tests, ...)``
        where ``tests`` maps a test name to a tuple whose element at index 1
        is that test's p-value.
    lag : int, default 3
        Key of ``granger_result`` to inspect. It must be one of the lags the
        result was computed for.
    alpha : float, default 0.05
        Significance level for the averaged p-value.

    Returns
    -------
    bool
        ``True`` when the mean p-value is ``<= alpha``.

    Raises
    ------
    KeyError
        If ``lag`` or one of the expected test names is missing.
    """
    tests = ("ssr_ftest", "ssr_chi2test", "lrtest", "params_ftest")
    test_results = granger_result[lag][0]
    p_values = [test_results[name][1] for name in tests]
    mean_p = sum(p_values) / len(p_values)
    # Compare the unrounded mean: rounding to two decimals first would let
    # values such as 0.054 collapse to 0.05 and pass as significant.
    return mean_p <= alpha


def granger_test(df, main_col, cols):
    """Return the columns of ``cols`` that pass the Granger test against ``main_col``.

    For every candidate column a two-column frame ``[main_col, col]`` is
    handed to ``grangercausalitytests`` (which tests whether the second
    column Granger-causes the first) at lag 3, and the result is judged by
    ``is_relevant``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``main_col`` and every column named in ``cols``.
    main_col : str
        Name of the target column.
    cols : iterable of str
        Candidate predictor column names.

    Returns
    -------
    list of str
        The candidate columns judged relevant, in input order. Columns whose
        test raised an error are reported on stdout and skipped.
    """
    relevant = []
    for col in cols:
        try:
            # verbose=False keeps statsmodels from printing each test's output.
            result = grangercausalitytests(df[[main_col, col]], maxlag=[3], verbose=False)
            if is_relevant(result):
                relevant.append(col)
        except Exception as exc:
            # Best-effort: keep going on a failing column, but say why it
            # failed. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop the loop as expected.
            print("There was an issue with: ", col, f"({type(exc).__name__}: {exc})")
    return relevant


def check_stationarity_all(
    companies=("amzn", "aapl", "tsla", "msft", "goog", "googl"),
    data_dir="../datasets/binned",
):
    """Print a stationarity report for each company's binned dataset.

    For every company, ``{data_dir}/{company}_binned.csv`` is loaded and its
    ``close`` column is checked with ``is_stationary``. Only when ``close``
    is not stationary are two lines printed: a notice, and the result of the
    same check on the ``diffs`` column.

    Parameters
    ----------
    companies : iterable of str, optional
        Ticker prefixes of the CSV files to check. Defaults to the six
        tickers used in this notebook.
    data_dir : str, optional
        Directory holding the ``*_binned.csv`` files.

    Returns
    -------
    None
    """
    for company in companies:
        df = pd.read_csv(f"{data_dir}/{company}_binned.csv")
        if not is_stationary(df["close"]):
            print(f"{company} 'close' isn't stationary")
            # NOTE(review): the label implies "diffs" holds pct_change() of
            # "close" -- confirm against the code that builds the CSVs.
            print("After pct_change():", is_stationary(df["diffs"]))

In [39]:
# Run the stationarity check for every ticker; output appears only for
# tickers whose 'close' series fails the check.
check_stationarity_all()

amzn 'close' isn't stationary
After differencing: True
aapl 'close' isn't stationary
After differencing: True
tsla 'close' isn't stationary
After differencing: True
msft 'close' isn't stationary
After differencing: True
goog 'close' isn't stationary
After differencing: True
googl 'close' isn't stationary
After differencing: True


In [40]:
# For each ticker, load its binned dataset and print the columns from `cols`
# that granger_test reports as relevant for the "diffs" column.
for company in ["amzn", "aapl", "tsla", "msft", "goog", "googl"]:
    df = pd.read_csv(f"../datasets/binned/{company}_binned.csv")
    relevant = granger_test(df, "diffs", cols)
    print(f"{company}: {relevant}")

amzn: ['low', 'vol', 'max_pos', 'std_neg', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'median_comp', 'count']
There was an issue with:  min_pos
aapl: ['high', 'vol', 'max_comp', 'max_neg', 'max_pos', 'std_comp', 'std_neg', 'std_pos', 'mean_comp', 'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'count']
tsla: ['max_neg', 'max_pos', 'std_neg', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'median_neg']
msft: ['vol', 'max_pos', 'min_neu', 'std_neg', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'count']
goog: ['high', 'vol', 'max_pos', 'min_neu', 'std_neg', 'std_neu', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'count']
googl: ['high', 'vol', 'max_comp', 'max_pos', 'min_neu', 'std_neg', 'std_neu', 'std_pos', 'mean_comp', 'mean_neg', 'mean_pos', 'count']
