In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.svm import SVR
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
df = pd.read_csv('ready_returns.csv', header = None)
df.columns = ['date', 'returns']
df.returns = df.returns
data, data2 = df['returns'].values, df['returns'].values
assert len(data) == len(df) == len(data2)

In [5]:
data.shape

(1557,)

In [6]:
def SVR_seq_length_results(lowest, highest, data):
    
    """
    Generates baseline results of linear, polynomial, sigmoid and kernel e-SVR
    
    Inputs:
    lowest : lowest sequence length which you want to test
    highest: highest sequence length which you want to test
    
    data: the feature sequence which you want to generate
    data2: the target sequence which you want to generate
    """
    
    # standardize the feaatures   
    sc= StandardScaler()
    sc.fit(data.reshape(-1,1))
    data = sc.transform(data.reshape(-1,1))
    # create a placeholder for results
    output = {'sequence_length' : [], 'rmse_linear' : [], 'rmse_rbf': []}
    
    # loops through selected sequence range
    for j in tqdm(range(lowest,highest+1)):
        # get batches
        data_gen = TimeseriesGenerator(np.append(data,0.00), np.append(data,0.00),
                                       length=j, sampling_rate=1,
                                       batch_size=1)

        # get features and targets from generated batches
        features = []
        targets = []
        i = 0
        while True:
            try:
                features.append(list(data_gen[i][0][0]))
                targets.append(data_gen[i][1][0])
                i += 1
            except:
                break

        # convert to numpy arrays for further use
        features = np.array(features)
        targets = np.array(targets)

        # get the training, validation and testing sets
        x_test, y_test = features[-32:], targets[-32:]
        x_hold, y_hold = features[:-32], targets[:-32]
        
        

        x_train, x_valid = x_hold[:int(len(x_hold)*.9)], x_hold[int(len(x_hold)*.9):]
        y_train, y_valid = y_hold[:int(len(y_hold)*.9)], y_hold[int(len(y_hold)*.9):]

        assert len(x_train) + len(x_valid) == len(x_hold)
        assert len(y_train) + len(y_valid) == len(y_hold)
        
        # create and fit linear SVR
        reg_linear = SVR(kernel='linear')
        reg_linear.fit(x_train, y_train)   
        y_hat_linear = reg_linear.predict(x_valid)

        # create and fit rbf kernel SVR
        reg_rbf = SVR(kernel='rbf')
        reg_rbf.fit(x_train, y_train)   
        y_hat_rbf = reg_rbf.predict(x_valid)

        # create and fit polynomial kernel SVR
        #reg_poly = SVR(kernel='poly')
        #reg_poly.fit(x_train, y_train)   
        #y_hat_poly = reg_poly.predict(x_valid)

        # create and fit sigmoid kernel SVR
        #reg_sig = SVR(kernel='poly')
        #reg_sig.fit(x_train, y_train)   
        #y_hat_sig = reg_sig.predict(x_valid)
    
        # get root mean squared error for validation set for different SVR
        output['sequence_length'].append(j)
        output['rmse_linear'].append(np.sqrt(mse(sc.inverse_transform(y_valid), sc.inverse_transform(y_hat_linear))))
        output['rmse_rbf'].append(np.sqrt(mse(sc.inverse_transform(y_valid), sc.inverse_transform(y_hat_rbf))))
        
        
        #output['rmse_poly'].append(np.sqrt(mse(y_valid, y_hat_poly)))
        #output['rmse_sig'].append(np.sqrt(mse(y_valid, y_hat_sig)))
        
    output = pd.DataFrame(output)
    
    return output

---

In [7]:
x = SVR_seq_length_results(2,11,data)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.92it/s]


In [8]:
x

Unnamed: 0,rmse_linear,rmse_rbf,sequence_length
0,6.361036,6.488763,2
1,6.350175,6.413241,3
2,6.363933,6.460996,4
3,6.387836,6.50329,5
4,6.374216,6.411332,6
5,6.385812,6.346584,7
6,6.376358,6.294669,8
7,6.380317,6.322109,9
8,6.373929,6.396566,10
9,6.364029,6.338166,11


---

# Optimize linear e-SVR

In [4]:
sc= StandardScaler()
sc.fit(data.reshape(-1,1))
data = sc.transform(data.reshape(-1,1))

In [5]:
data_gen = TimeseriesGenerator(np.append(data,0.00), np.append(data,0.00),
                                       length=8, sampling_rate=1,
                                       batch_size=1)

# get features and targets from generated batches
features = []
targets = []
i = 0
while True:
    try:
        features.append(list(data_gen[i][0][0]))
        targets.append(data_gen[i][1][0])
        i += 1
    except:
        break

In [6]:
# convert to numpy arrays for further use
features = np.array(features)
targets = np.array(targets)

# get the training, validation and testing sets
x_test, y_test = features[-32:], targets[-32:]
x_hold, y_hold = features[:-32], targets[:-32]



x_train, x_valid = x_hold[:int(len(x_hold)*.9)], x_hold[int(len(x_hold)*.9):]
y_train, y_valid = y_hold[:int(len(y_hold)*.9)], y_hold[int(len(y_hold)*.9):]

assert len(x_train) + len(x_valid) == len(x_hold)
assert len(y_train) + len(y_valid) == len(y_hold)

In [7]:
reg_linear = SVR(kernel='rbf', C=0.01, epsilon = 0.1, gamma = 0.9)
reg_linear.fit(x_train, y_train)   
y_hat_linear = reg_linear.predict(x_valid)

In [8]:
np.sqrt(mse(sc.inverse_transform(y_valid), sc.inverse_transform(y_hat_linear)))

6.316795169283296

In [9]:
y_hat_linear = reg_linear.predict(x_test)

In [10]:
np.sqrt(mse(sc.inverse_transform(y_test), sc.inverse_transform(y_hat_linear)))

4.749848637951993

In [11]:
sum(sc.inverse_transform(y_test) - sc.inverse_transform(y_hat_linear))/len(y_test)

-1.8000608658225687

In [12]:
from sklearn.metrics import mean_absolute_error as mae

In [16]:
mae(sc.inverse_transform(y_test), sc.inverse_transform(y_hat_linear))

3.8196019210693235

In [9]:
y_test.var()

NameError: name 'y_test' is not defined

In [None]:
y_test

In [44]:
np.sqrt(mse(sc.inverse_transform(y_test), sc.inverse_transform(y_hat_linear)))/y_test.var()

3.986073150561512