In this file I perform:
* Granger causality test
* SVR performance test

In [136]:
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
import pandas as pd
import numpy as np

In [137]:
data = pd.read_csv('stocks_sentiment/full_clean.csv', parse_dates=['date_'])

In [138]:
data.columns

Index(['date_', 'ticker', 'open', 'high', 'low', 'close', 'vol', 'max_comp',
       'max_neg', 'max_neu', 'max_pos', 'min_comp', 'min_neg', 'min_neu',
       'min_pos', 'std_comp', 'std_neg', 'std_neu', 'std_pos', 'mean_comp',
       'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'median_neg',
       'median_neu', 'median_pos', 'count'],
      dtype='object')

In [139]:
def adf_test(timeseries, maxlag=5):
    """Run an Augmented Dickey-Fuller test on ``timeseries`` and print a summary.

    Parameters
    ----------
    timeseries : array-like
        One-dimensional series of observations handed to ``adfuller``.
    maxlag : int, optional
        Maximum lag forwarded to ``adfuller``. Defaults to 5, the value that
        used to be hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    None
        The summary is printed rather than returned.
    """
    print("Results of Dickey-Fuller Test:")
    dftest = adfuller(timeseries, maxlag=maxlag)
    # Label the first four entries of the result tuple.
    dfoutput = pd.Series(
        dftest[0:4],
        index=[
            "Test Statistic",
            "p-value",
            "#Lags Used",
            "Number of Observations Used",
        ],
    )
    # dftest[4] is a mapping whose entries are appended as critical values,
    # one row per key.
    for key, value in dftest[4].items():
        dfoutput["Critical Value (%s)" % key] = value
    print(dfoutput)

def check_stationary(df):
    """Print the ADF test summary for every column of ``df`` except the keys.

    The ``date_`` and ``ticker`` columns are removed first; each remaining
    column has its missing values dropped and is then passed to ``adf_test``.
    The caller's frame is not modified. Nothing is returned.
    """
    series_only = df.drop(columns=['date_', 'ticker'])
    for name in series_only.columns:
        print(name)
        adf_test(series_only[name].dropna())
        print()

# NOTE(review): in the visible cells `data` has not been filtered by ticker
# before this call, so each column is tested as one concatenated series
# across all tickers -- confirm that is intended.
check_stationary(data)

open
Results of Dickey-Fuller Test:
Test Statistic                   -69.022544
p-value                            0.000000
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)               -2.861596
Critical Value (10%)              -2.566800
dtype: float64

high
Results of Dickey-Fuller Test:
Test Statistic                   -69.013044
p-value                            0.000000
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)               -2.861596
Critical Value (10%)              -2.566800
dtype: float64

low
Results of Dickey-Fuller Test:
Test Statistic                   -69.040593
p-value                            0.000000
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)      

In [140]:
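# NOTE: Granger-causality column filter, currently disabled (kept for reference).
# If it is re-enabled: `p_val` below is already the mean of the four test
# p-values, so the later `p_val/4` divides a second time and the effective
# threshold becomes 0.2 instead of 0.05 -- compare and print `p_val` directly.
# def granger_test_columns(data, target, columns):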
#     irrelevant_cols = []
#     for col in columns:
#         p_val = 0
#         x = grangercausalitytests(data[[target, col]], maxlag=[3], verbose=False)
#         names = ['ssr_ftest', 'ssr_chi2test', 'lrtest', 'params_ftest']
#         p_val = sum([x[3][0][name][1] for name in names])/4
#         if p_val/4 < 0.05:
#             print(f'relevant: {col}, {round(p_val/4, 3)}')
#         else:
#             print(f'NOT relevant: {col}, {round(p_val/4, 3)}')
#             irrelevant_cols.append(col)
#     return irrelevant_cols
    
# col_names = ['high', 'low', 'vol', 'max_comp', 'max_neg', 'max_neu', 
#             'max_pos', 'min_comp', 'min_neg', 'min_neu', 'min_pos', 'std_comp',
#             'std_neg', 'std_neu', 'std_pos', 'mean_comp', 'mean_neg', 'mean_neu',
#             'mean_pos', 'median_comp', 'median_neg', 'median_neu', 'median_pos', 'count']

# cols = granger_test_columns(data, 'close', col_names)
# data = data.drop(cols, axis=1)


In [142]:
# From here on, work with the rows of a single ticker (AMZN) only.
data = data.loc[data['ticker'] == 'AMZN']

### SVR
* grid search
* 4-fold cross-validation

In [143]:
# polynomial and sigmoid kernels
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

In [159]:
def create_vectors(df, cols=None, lag=6):
    """Build lagged feature vectors and targets from ``df.close``.

    Every full window of ``lag`` consecutive ``close`` values yields one
    sample: the first ``lag - 1`` values are the features and the last value
    is the target. For each name in ``cols``, the ``lag - 1`` values of that
    column covering the same rows as the close features are appended to the
    sample's feature vector, in the order the columns are given.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``close`` column and every column named in ``cols``.
    cols : iterable of str, optional
        Extra columns to append. Any iterable works (list, tuple,
        ``pd.Index``, numpy array); ``None`` means close values only.
    lag : int, optional
        Window length over ``close``, including the target value.

    Returns
    -------
    (list of list, list)
        ``X`` (one feature vector per sample) and ``y`` (one target each).
    """
    X = []
    y = []
    for window in df.close.rolling(window=lag):
        # Iterating a Rolling object also yields the shorter leading windows;
        # only complete ones become samples.
        if len(window) == lag:
            values = window.to_list()
            X.append(values[:-1])
            y.append(values[-1])
    # Identity check: `cols != None` is evaluated elementwise for a pd.Index or
    # numpy array and makes the `if` raise ValueError for more than one column.
    if cols is not None:
        for col in cols:
            complete = (
                w for w in df[col].rolling(window=lag - 1)
                if len(w) == lag - 1
            )
            # There is one more complete (lag - 1)-window than there are
            # samples; zip stops at the end of X, so the surplus is ignored.
            for features, w in zip(X, complete):
                features.extend(w.to_list())

    return X, y

In [145]:
# Example: lagged close values plus the lagged `std_pos` column.
x, y = create_vectors(data, ['std_pos'])

In [146]:
# Show the first two feature vectors and their targets.
# NOTE(review): the saved output has 4 close + 4 std_pos values per vector,
# i.e. it appears to come from a run with lag=5, while the displayed
# create_vectors default is lag=6 -- re-run to refresh.
x[:2], y[:2]

([[313.52,
   308.96,
   309.75,
   308.4,
   0.1137678057991962,
   0.1037662815006425,
   0.0903371462909915,
   0.1085833013558407],
  [308.96,
   309.75,
   308.4,
   308.35,
   0.1037662815006425,
   0.0903371462909915,
   0.1085833013558407,
   0.0740313516399709]],
 [308.35, 308.02])

In [147]:
def create_and_predict(x, y):
    """Cross-validate a StandardScaler + SVR pipeline and return its mean MAPE.

    ``x`` and ``y`` are passed straight to ``cross_val_score`` with a 6-split
    ``TimeSeriesSplit``. The scorer is the negated MAPE, so the averaged
    score is sign-flipped before it is returned.
    """
    model = make_pipeline(
        StandardScaler(),
        SVR(C=1.0, epsilon=0.2, cache_size=1000),
    )
    splitter = TimeSeriesSplit(n_splits=6)
    fold_scores = cross_val_score(
        model,
        x,
        y,
        scoring='neg_mean_absolute_percentage_error',
        cv=splitter,
    )
    return -(sum(fold_scores) / len(fold_scores))

In [161]:
# Baseline: lagged close values only, no extra columns.
# `nmape` holds the sign-flipped (positive) score returned by
# create_and_predict; the per-column loop compares against it.
x, y = create_vectors(data)
nmape = create_and_predict(x, y)
print(nmape)

0.22695174293663498


In [162]:
# Candidate extra columns; each is tried on its own next to the lagged close values.
cols = ['open', 'high', 'low', 'vol', 'max_comp',
       'max_neg', 'max_neu', 'max_pos', 'min_comp', 'min_neg', 'min_neu',
       'min_pos', 'std_comp', 'std_neg', 'std_neu', 'std_pos', 'mean_comp',
       'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'median_neg',
       'median_neu', 'median_pos', 'count']
# error[col] = (cross-validated error with `col` added,
#               True if that error is higher than the close-only baseline `nmape`)
error = {}
for col in cols:
    x, y = create_vectors(data, [col])
    er = create_and_predict(x, y)
    error[col] = (er, er > nmape)


In [163]:
# Order the columns by their (error, flag) tuples, i.e. lowest error first.
dict(sorted(error.items(), key=lambda item: item[1]))

{'min_pos': (0.2041520343567672, False),
 'max_neu': (0.21059922336119272, False),
 'count': (0.21598895061488801, False),
 'median_neg': (0.21988884603645067, False),
 'low': (0.2268663271369343, False),
 'min_neg': (0.22695174293663498, False),
 'open': (0.22722081167170913, True),
 'high': (0.2272268391734418, True),
 'vol': (0.2349102472435518, True),
 'median_neu': (0.25210017422182457, True),
 'max_comp': (0.2528886200306382, True),
 'median_pos': (0.25524384612527173, True),
 'mean_neg': (0.26053102073050166, True),
 'min_comp': (0.26261040408505393, True),
 'std_comp': (0.2626439150461429, True),
 'mean_neu': (0.2632215762714299, True),
 'mean_pos': (0.26408405042216443, True),
 'mean_comp': (0.26645984825771, True),
 'median_comp': (0.26865475999273447, True),
 'std_neg': (0.27027700257523585, True),
 'min_neu': (0.27170313326405265, True),
 'max_pos': (0.2720043435691623, True),
 'std_pos': (0.2720471793436503, True),
 'std_neu': (0.273154170927267, True),
 'max_neg': (0.2732