In this file I perform:
* Granger causality test
* SVR performance test

In [78]:
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
import pandas as pd
import numpy as np

In [79]:
data = pd.read_csv('stocks_sentiment/full_clean.csv', parse_dates=['date_'])

In [80]:
data.columns

Index(['date_', 'ticker', 'open', 'high', 'low', 'close', 'vol', 'max_comp',
       'max_neg', 'max_neu', 'max_pos', 'min_comp', 'min_neg', 'min_neu',
       'min_pos', 'std_comp', 'std_neg', 'std_neu', 'std_pos', 'mean_comp',
       'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'median_neg',
       'median_neu', 'median_pos', 'count'],
      dtype='object')

In [81]:
def adf_test(timeseries):
    print("Results of Dickey-Fuller Test:")
    dftest = adfuller(timeseries, maxlag=5)
    dfoutput = pd.Series(
        dftest[0:4],
        index=[
            "Test Statistic",
            "p-value",
            "#Lags Used",
            "Number of Observations Used",
        ],
    )
    for key, value in dftest[4].items():
        dfoutput["Critical Value (%s)" % key] = value
    print(dfoutput)

def check_stationary(df):
    df = df.drop(['date_', 'ticker'], axis=1)
    for col in df.columns:
        print(col)
        adf_test(df[col].dropna())
        print()

check_stationary(data)

open
Results of Dickey-Fuller Test:
Test Statistic                    -2.295868
p-value                            0.173304
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)               -2.861596
Critical Value (10%)              -2.566800
dtype: float64

high
Results of Dickey-Fuller Test:
Test Statistic                    -2.275981
p-value                            0.179832
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)               -2.861596
Critical Value (10%)              -2.566800
dtype: float64

low
Results of Dickey-Fuller Test:
Test Statistic                    -2.301550
p-value                            0.171469
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)      

In [82]:
def granger_test_columns(data, target, columns):
    irrelevant_cols = []
    for col in columns:
        p_val = 0
        x = grangercausalitytests(data[[target, col]], maxlag=[3], verbose=False)
        names = ['ssr_ftest', 'ssr_chi2test', 'lrtest', 'params_ftest']
        p_val = sum([x[3][0][name][1] for name in names])/4
        if p_val/4 < 0.05:
            print(f'relevant: {col}, {round(p_val/4, 3)}')
        else:
            print(f'NOT relevant: {col}, {round(p_val/4, 3)}')
            irrelevant_cols.append(col)
    return irrelevant_cols
    
col_names = ['high', 'low', 'vol', 'max_comp', 'max_neg', 'max_neu', 
            'max_pos', 'min_comp', 'min_neg', 'min_neu', 'min_pos', 'std_comp',
            'std_neg', 'std_neu', 'std_pos', 'mean_comp', 'mean_neg', 'mean_neu',
            'mean_pos', 'median_comp', 'median_neg', 'median_neu', 'median_pos', 'count']

cols = granger_test_columns(data, 'close', col_names)
data = data.drop(cols, axis=1)


relevant: high, 0.0
relevant: low, 0.0
relevant: vol, 0.002
relevant: max_comp, 0.0
NOT relevant: max_neg, 0.212
NOT relevant: max_neu, 0.249
relevant: max_pos, 0.0
NOT relevant: min_comp, 0.218
NOT relevant: min_neg, 0.249
relevant: min_neu, 0.005
NOT relevant: min_pos, 0.241
relevant: std_comp, 0.002
NOT relevant: std_neg, 0.106
relevant: std_neu, 0.0
relevant: std_pos, 0.0
relevant: mean_comp, 0.0
relevant: mean_neg, 0.037
NOT relevant: mean_neu, 0.054
relevant: mean_pos, 0.0
relevant: median_comp, 0.022
NOT relevant: median_neg, 0.23
NOT relevant: median_neu, 0.249
NOT relevant: median_pos, 0.103
NOT relevant: count, 0.123


In [83]:
data.columns

Index(['date_', 'ticker', 'open', 'high', 'low', 'close', 'vol', 'max_comp',
       'max_pos', 'min_neu', 'std_comp', 'std_neu', 'std_pos', 'mean_comp',
       'mean_neg', 'mean_pos', 'median_comp'],
      dtype='object')

### SVR
* grid search
* 4-folded cv

In [118]:
#polynomial n sigmoid kernel
import sklearn as sk
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_percentage_error

In [119]:
# training examples with lagged columns
def create_vectors(df, cols, lag=5):
    # lagged close value as the main vector
    main = [window.to_list() for window in df.close.rolling(window=lag)]
    if cols == None:
        return main[lag-1:len(main)-1], data.close[lag:]

    for col in cols:
        if col in df.columns:
            minor = [window.to_list() for window in df[col].rolling(window=lag)]
            for i in range(len(main)):
                main[i].extend(minor[i])
        else:
            print(f"Couldn't find {col} in columns!")

    # remove vectors with less than lag elements, remove last from main to match 
    return main[lag-1:len(main)-1], data.close[lag:]

In [124]:
def create_and_predict(x, y):
    tscv = TimeSeriesSplit(n_splits=6)
    regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2, cache_size=1000))
    vals = cross_val_score(regr, x, y, scoring='neg_mean_absolute_percentage_error', cv=tscv)
    print(sum(vals)/len(vals))
    return vals

In [125]:
x, y = create_vectors(data, None)
create_and_predict(x, y)

-0.5345344923247141


array([-2.79997005, -0.02954781, -0.00953186, -0.28930433, -0.06207915,
       -0.01677376])

In [126]:
sk.metrics.get_scorer("neg_mean_absolute_percentage_error")

make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [None]:
x, y = create_vectors(data, ['std_pos'])
create_and_predict(x, y)