##### SVM

In [30]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from tools import create_vectors

In [2]:
df = pd.read_csv('../datasets/binned/amzn_binned.csv', parse_dates=['date_'])

In [31]:
# def create_and_predict(x, y):
#     parameters = {'svc__kernel':('linear', 'rbf'), 'svc__C':[1, 10]}

#     tscv = TimeSeriesSplit(n_splits=4)
#     svm_pipeline = make_pipeline(StandardScaler(), SVC())
#     clf = GridSearchCV(svm_pipeline, parameters, scoring='neg_mean_absolute_percentage_error', cv=tscv)
#     clf.fit(x, y)

#     return {'best_estimator': clf.best_estimator_, 'best_params': clf.best_params_, 'best_score': clf.best_score_}


def create_and_predict(x, y):
    tscv = TimeSeriesSplit(n_splits=4)
    regr = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='linear', cache_size=1000))
    vals = cross_val_score(regr, x, y, scoring='neg_mean_absolute_percentage_error', cv=tscv)
    return sum(vals)/len(vals) #* -1

In [32]:
results = {}
results[2] = create_and_predict(*create_vectors(df, main_col='bin_2'))
#results[3] = create_and_predict(*create_vectors(df, main_col='bin_3'))
#results[5] = create_and_predict(*create_vectors(df, main_col='bin_5'))

In [33]:
results[2]

-0.9761766414875073

In [8]:
results[3]

{'best_estimator': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('svc', SVC(C=1, kernel='linear'))]),
 'best_params': {'svc__C': 1, 'svc__kernel': 'linear'},
 'best_score': -0.47070789259560614}

In [9]:
results[5]

{'best_estimator': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('svc', SVC(C=1, kernel='linear'))]),
 'best_params': {'svc__C': 1, 'svc__kernel': 'linear'},
 'best_score': -0.478302142663412}

In [42]:
cols = ['open', 'high', 'low', 'vol', 'max_comp',
       'max_neg', 'max_neu', 'max_pos', 'min_comp', 'min_neg', 'min_neu',
       'min_pos', 'std_comp', 'std_neg', 'std_neu', 'std_pos', 'mean_comp',
       'mean_neg', 'mean_neu', 'mean_pos', 'median_comp', 'median_neg',
       'median_neu', 'median_pos', 'count']
lags = [3, 5, 6, 8, 10]#, 12, 16]
current_main_col = 'bin_5'

error = {i:{} for i in lags}
for lag in lags:
    x, y = create_vectors(df, main_col=current_main_col, lag=lag+1)
    mape = create_and_predict(x, y)#['best_score']
    
    for col in cols:
        x, y = create_vectors(df, main_col=current_main_col, cols=[col])
        results = create_and_predict(x, y)  
        er = results#['best_score']
        # since nmape is negative the smaller score the better
        error[lag][col] = (round(er, 3), "+" if er < mape else "-")
    error[lag]['baseline'] = (round(mape, 3), '.')


In [48]:
#[3, 5, 6, 8, 10]
print(error[5]['baseline'])
for i, j in error[3].items():
    
    if j[1] == '+':
        print(i, j)

(-0.976, '.')
max_neg (-0.976, '-')
max_neu (-0.986, '-')
min_comp (-0.983, '-')
min_pos (-0.988, '-')
std_comp (-0.981, '-')
std_neu (-0.979, '-')
median_comp (-0.98, '-')
median_neg (-0.985, '-')
median_neu (-0.984, '-')
median_pos (-0.985, '-')
count (-0.988, '-')
