In this file I perform:
* Granger causality test
* SVR performance test

In [19]:
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('stocks_sentiment/full_clean.csv', parse_dates=['date_'])

In [3]:
data.columns

Index(['date_', 'ticker', 'open', 'high', 'low', 'close', 'vol', 'max_comp',
       'max_neg', 'max_neu', 'max_pos', 'min_comp', 'min_neg', 'min_neu',
       'min_pos', 'std_comp', 'std_neg', 'std_neu', 'std_pos', 'mean_comp',
       'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'median_neg',
       'median_neu', 'median_pos', 'count'],
      dtype='object')

In [17]:
def adf_test(timeseries):
    print("Results of Dickey-Fuller Test:")
    dftest = adfuller(timeseries, maxlag=5)
    dfoutput = pd.Series(
        dftest[0:4],
        index=[
            "Test Statistic",
            "p-value",
            "#Lags Used",
            "Number of Observations Used",
        ],
    )
    for key, value in dftest[4].items():
        dfoutput["Critical Value (%s)" % key] = value
    print(dfoutput)

def check_stationary(df):
    df = df.drop(['date_', 'ticker'], axis=1)
    for col in df.columns:
        print(col)
        adf_test(df[col].dropna())
        print()

check_stationary(data)

open
Results of Dickey-Fuller Test:
Test Statistic                    -2.295868
p-value                            0.173304
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)               -2.861596
Critical Value (10%)              -2.566800
dtype: float64

high
Results of Dickey-Fuller Test:
Test Statistic                    -2.275981
p-value                            0.179832
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)               -2.861596
Critical Value (10%)              -2.566800
dtype: float64

low
Results of Dickey-Fuller Test:
Test Statistic                    -2.301550
p-value                            0.171469
#Lags Used                         5.000000
Number of Observations Used    52031.000000
Critical Value (1%)               -3.430476
Critical Value (5%)      

In [None]:
def granger_test_columns(data, target, columns):
    relevant_cols = []
    for col in columns:
        p_val = 0
        x = grangercausalitytests(data[[target, col]], maxlag=[3], verbose=False)
        names = ['ssr_ftest', 'ssr_chi2test', 'lrtest', 'params_ftest']
        p_val = sum([x[3][0][name][1] for name in names])/4
        if p_val/4 < 0.05:
            relevant_cols.append(col)
        else:
            print(f'NOT relevant: {col}, {p_val}')
    return relevant_cols
    
col_names = ['max_comp', 'max_neg', 'max_neu', 'max_pos', 'min_comp', 'min_neg', 'min_neu',
            'min_pos', 'std_comp', 'std_neg', 'std_neu', 'std_pos', 'mean_comp',
            'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'median_neg',
            'median_neu', 'median_pos', 'count']

cols = granger_test_columns(data, 'close', col_names)
data = data.drop(cols, axis=1)

In [8]:
data.head()

Unnamed: 0,date_,ticker,open,high,low,close,vol,max_neg,max_neu,min_comp,min_neg,min_pos,std_neg,mean_neu,median_neg,median_neu,median_pos,count
0,2015-01-02 17:00:00,AMZN,312.58,314.75,312.11,313.52,404201.0,0.247,1.0,-0.4278,0.0,0.0,0.079716,0.85525,0.0,0.843,0.043,12.0
1,2015-01-02 18:00:00,AMZN,313.25,313.59,308.63,308.96,509208.0,0.0,1.0,0.0,0.0,0.0,0.0,0.905167,0.0,0.9525,0.0475,18.0
2,2015-01-02 19:00:00,AMZN,308.84,310.12,306.9601,309.75,427854.0,0.267,1.0,-0.6124,0.0,0.0,0.119406,0.9062,0.0,1.0,0.0,5.0
3,2015-01-02 20:00:00,AMZN,309.77,309.93,308.05,308.4,209933.0,0.149,0.925,-0.5423,0.0,0.0,0.086025,0.854,0.0,0.851,0.075,3.0
4,2015-01-02 21:00:00,AMZN,308.41,308.48,307.25,308.35,158209.0,0.292,1.0,-0.2185,0.0,0.0,0.082843,0.842462,0.0,0.84,0.144,13.0


### SVR
* grid search
* 4-folded cv

In [50]:
#polynomial n sigmoid kernel
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

In [45]:
# training examples with lagged columns
def create_vectors(df, cols, lag=5):
    # lagged close value as the main vector
    main = [window.to_list() for window in df.close.rolling(window=lag)]
    
    for col in cols:
        if col in df.columns:
            minor = [window.to_list() for window in df.col.rolling(window=lag)]
            for i in range(len(main)):
                main[i] = main[i].extend(minor[i])
        else:
            print(f"Couldn't find {col} in columns!")

    # remove vectors with less than lag elements, remove last from main to match 
    return main[lag-1:len(main)-1], data.close[lag:]

In [49]:
tscv = TimeSeriesSplit(n_splits=5)
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2, cache_size=1000))

In [48]:
x, y = create_vectors(data, None)
regr.fit(x, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [52]:
cross_val_score(regr, x, y, cv=tscv)

array([0.46340416, 0.92323482, 0.97857007, 0.99310925, 0.98515037,
       0.84986157, 0.99631981, 0.98501705])