In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.svm import SVR
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error as mae

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the price history from disk.
df = pd.read_csv('fourGreta.csv')

# Work on the raw `price` column as a plain numpy array.
# (alternative input that was tried: df['returns'].values)
data = df['price'].values

# Sanity check: exactly one value per row of the frame.
assert len(df) == len(data)

In [3]:
# Quick look at the raw price series (numpy abbreviates long arrays with "...").
data

array([ 771.4 ,  802.39,  818.72, ..., 6853.84, 6811.47, 6636.32])

In [4]:
def _sliding_windows(series, length):
    """Split a 1-D series into (window of `length` values, next value) pairs.

    Row k of the returned features is series[k:k + length] and targets[k] is
    series[k + length], so a series of n values yields n - length samples.
    """
    n_samples = len(series) - length
    if n_samples <= 0:
        return np.empty((0, length)), np.empty(0)
    features = np.array([series[k:k + length] for k in range(n_samples)])
    return features, series[length:]


def SVR_seq_length_results(lowest, highest, data, test_size=32, train_frac=0.9,
                           kernels=('linear', 'rbf')):

    """
    Generates baseline validation results of SVR models for a range of
    sequence (window) lengths.

    Inputs:
    lowest : lowest sequence length which you want to test (>= 1)
    highest: highest sequence length which you want to test (inclusive)

    data: 1-D series; each window of previous values is used as the features
          and the value that follows the window is the target
    test_size: number of most recent windows set aside as the test set
               (they are excluded here and never used for fitting or scoring)
    train_frac: fraction of the remaining windows used for training; the rest
                form the validation set
    kernels: SVR kernels to fit for every sequence length

    Returns:
    pandas DataFrame with one row per sequence length and the columns
    'sequence_length' plus one 'rmse_<kernel>' column per kernel. The RMSE is
    computed on the validation set after mapping values back to the original
    units of `data`.

    Raises:
    ValueError: if `lowest` < 1, or if a sequence length leaves the training
                or validation set empty.
    """

    if lowest < 1:
        raise ValueError("lowest sequence length must be at least 1")

    # scale the series into [0, 1]
    # NOTE(review): the scaler is fitted on the whole series, so the validation
    # and test ranges influence the scaling. Kept as in the original so the
    # numbers stay comparable with earlier runs.
    sc = MinMaxScaler()
    scaled = sc.fit_transform(np.asarray(data, dtype=float).reshape(-1, 1)).ravel()

    # create a placeholder for results
    output = {'sequence_length': []}
    for kernel in kernels:
        output['rmse_' + kernel] = []

    # loops through selected sequence range
    for seq_len in tqdm(range(lowest, highest + 1)):
        # Build the windows directly with numpy. The previous approach padded
        # the series with a dummy 0.0 and drained a TimeseriesGenerator until
        # it raised; whether that dummy ends up as a real target depends on the
        # generator's end-index handling, and the bare `except` hid any other
        # error.
        features, targets = _sliding_windows(scaled, seq_len)

        # hold back the most recent windows as the (unused here) test set
        n_hold = max(len(features) - test_size, 0)
        x_hold, y_hold = features[:n_hold], targets[:n_hold]

        # chronological train / validation split of the remaining windows
        split_at = int(n_hold * train_frac)
        x_train, x_valid = x_hold[:split_at], x_hold[split_at:]
        y_train, y_valid = y_hold[:split_at], y_hold[split_at:]

        if len(x_train) == 0 or len(x_valid) == 0:
            raise ValueError(
                "sequence length %d leaves %d training and %d validation samples"
                % (seq_len, len(x_train), len(x_valid)))

        # validation targets in the original units, shared by all kernels
        y_valid_orig = sc.inverse_transform(y_valid.reshape(-1, 1))

        output['sequence_length'].append(seq_len)
        for kernel in kernels:
            # create and fit the SVR, then score it on the validation set
            reg = SVR(kernel=kernel)
            reg.fit(x_train, y_train)
            y_hat_orig = sc.inverse_transform(reg.predict(x_valid).reshape(-1, 1))
            output['rmse_' + kernel].append(np.sqrt(mse(y_valid_orig, y_hat_orig)))

    output = pd.DataFrame(output)

    return output

In [5]:
# Sweep sequence lengths 2..20 (inclusive) and collect the validation RMSE per kernel.
x = SVR_seq_length_results(2,20,data)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:01<00:00, 14.00it/s]


---

In [6]:
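# Modelling a single SVR with a fixed sequence length of 12 (the estimator fitted below uses an RBF kernel, not a linear one)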

In [7]:
# Scale the prices into [0, 1].
# The scaler is always fitted on the raw prices taken from `df`, not on `data`:
# this cell rebinds `data` to the scaled values, so fitting on `data` would make
# a re-run fit on already-scaled input and turn `sc.inverse_transform` into a
# no-op, silently reporting later metrics in scaled units.
sc = MinMaxScaler()
data = sc.fit_transform(df.price.values.reshape(-1, 1))

In [8]:
# Window the scaled series: 12 consecutive values are the features and the
# value right after them is the target.
# The windows are built directly with numpy slicing. The previous version padded
# the series with a dummy 0.0 and drained a TimeseriesGenerator until it raised;
# whether that dummy ends up as the final target (which would land in the test
# set) depends on the generator's end-index handling, and the bare `except`
# hid any other error.
SEQ_LENGTH = 12
series = np.asarray(data, dtype=float).ravel()
n_samples = max(len(series) - SEQ_LENGTH, 0)

# get features and targets: a series of n values yields n - SEQ_LENGTH samples
features = [list(series[k:k + SEQ_LENGTH]) for k in range(n_samples)]
targets = list(series[SEQ_LENGTH:])

assert len(features) == len(targets)

In [9]:
# Materialise the collected windows as numpy arrays.
features = np.array(features)
targets = np.array(targets)

# The final 32 windows form the test set; everything before them is held out
# for fitting and validation.
x_hold, x_test = features[:-32], features[-32:]
y_hold, y_test = targets[:-32], targets[-32:]

# Chronological split of the held-out windows: first 90% train, rest validate.
cut_x = int(len(x_hold) * .9)
cut_y = int(len(y_hold) * .9)
x_train, x_valid = x_hold[:cut_x], x_hold[cut_x:]
y_train, y_valid = y_hold[:cut_y], y_hold[cut_y:]

# Nothing may be lost or duplicated by the split.
assert len(x_hold) == len(x_train) + len(x_valid)
assert len(y_hold) == len(y_train) + len(y_valid)

In [79]:
# Fit an SVR on the training windows (features and targets are in scaled units).
# NOTE(review): the variable is named `reg_linear`, but the estimator uses kernel='rbf'.
reg_linear = SVR(kernel='rbf', C = 0.8, epsilon=0.0001)
reg_linear.fit(x_train, y_train)   

SVR(C=0.8, cache_size=200, coef0=0.0, degree=3, epsilon=0.0001, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [80]:
# Predict the test windows; the predictions are still in the scaler's units.
y_hat = reg_linear.predict(x_test)

In [81]:
pred = sc.inverse_transform(y_test.reshape(-1,1)).reshape(1,-1)[0]

In [82]:
truth = sc.inverse_transform(y_hat.reshape(-1,1)).reshape(1,-1)[0]

In [83]:
np.sqrt(mse(truth,pred))

524.8700466466113

In [84]:
sum(truth - pred)/len(y_test)

-292.87588590699454

In [85]:
# Mean absolute error between the two inverse-scaled test series (price units).
mae(truth, pred)

418.8133842484517