In [65]:
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU

from itertools import cycle

# ! pip install plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [66]:
df = pd.read_csv('mldata.csv')
print(len(df))
df = df[~df['spy_oc_ret'].isna()]
print(len(df))

728
703


In [67]:
df['date'] = pd.to_datetime(df.date,format='%d/%m/%Y')
df.sort_values(by='date', inplace=True)
df.head()

Unnamed: 0,date,spy_pco_ret,spy_oc_ret,vix_pco_ret,vix_oc_ret,tnx_pco_ret,tnx_oc_ret,dxy_pco_ret,dxy_oc_ret,cl_pco_ret,...,xlc_pco_ret,xlc_oc_ret,xlk_pco_ret,xlk_oc_ret,xlu_pco_ret,xlu_oc_ret,xlv_pco_ret,xlv_oc_ret,xly_pco_ret,xly_oc_ret
1,2020-01-02,0.00522,0.004111,-0.023222,-0.073551,-0.008338,-0.011035,0.000934,0.003835,0.008844,...,0.00634,0.005373,0.009054,0.009622,0.001702,-0.014213,0.003043,-0.000392,0.006299,0.005546
2,2020-01-03,-0.01142,0.003892,0.203689,-0.065956,-0.028693,-0.021882,-0.000619,0.000517,0.0,...,-0.010874,0.004472,-0.014563,0.003368,0.000784,0.001253,-0.011554,0.002873,-0.012686,0.00423
3,2020-01-06,-0.005955,0.009829,0.101997,-0.10356,-0.001678,0.014566,0.00062,-0.002374,0.000471,...,-0.005565,0.019026,-0.009963,0.01247,-0.001251,0.002192,-0.004544,0.010816,-0.007311,0.010167
4,2020-01-07,-0.001916,-0.000898,-0.000722,-0.003613,-0.00773,0.016694,-0.000207,0.003414,-0.00569,...,0.001098,0.000183,0.001729,-0.002157,-0.0025,0.001096,-0.002749,0.000787,-0.000634,-0.000952
5,2020-01-08,0.000651,0.004676,0.099347,-0.112797,-0.002189,0.027976,-0.001547,0.004854,0.002233,...,0.000548,0.006578,0.001081,0.009609,0.000469,-0.000938,0.000492,0.005997,-0.000873,0.003893


In [68]:
# Monthwise comparision
monthvise= df.groupby(df['date'].dt.strftime('%B'))[['spy_pco_ret','spy_oc_ret']].mean().sort_values(by='spy_oc_ret')
monthvise.head()

Unnamed: 0_level_0,spy_pco_ret,spy_oc_ret
date,Unnamed: 1_level_1,Unnamed: 2_level_1
September,-0.000516,-0.002477
June,-5.1e-05,-0.000737
January,-0.000422,-0.000596
November,0.002864,-0.000483
April,0.001854,-0.000441


In [69]:
fig = go.Figure()

fig.add_trace(go.Bar(
    x=monthvise.index,
    y=monthvise['spy_oc_ret'],
    name='spy_oc_ret',
    marker_color='crimson'
))
fig.add_trace(go.Bar(
    x=monthvise.index,
    y=monthvise['spy_pco_ret'],
    name='spy_pco_ret',
    marker_color='lightsalmon'
))

fig.update_layout(barmode='group', xaxis_tickangle=-45, 
                  title='Monthwise comparision between SPY OC and PCO RET')
fig.show()

In [70]:
# Plotting stock close price chart

fig = px.line(df, x=df.date, y=df.spy_oc_ret,labels={'date':'date','spy_oc_ret':'spy_oc_ret'})
fig.update_traces(marker_line_width=2, opacity=0.6)
fig.update_layout(title_text='spy_oc_ret', plot_bgcolor='white', font_size=15, font_color='black')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()


In [71]:
# Normalise data

df_1 = df.copy()
#def df_1['date']
df_1.drop('date', axis=1, inplace=True)
#scaler=MinMaxScaler(feature_range=(-2,2))
#scaler=MinMaxScaler()
#scaler=StandardScaler()
df_2=np.array(df_1)
#df_2 = scaler.fit_transform(np.array(df_1))

#df_2 = df_1.copy()
print(df_2.shape)


(703, 28)


In [72]:
training_size=int(len(df_2)*0.7)
test_size=len(df_2)-training_size
train_data,test_data=df_2[0:training_size,:],df_2[training_size:len(df_2),:]
print("train_data: ", train_data.shape)
print("test_data: ", test_data.shape)

train_data:  (492, 28)
test_data:  (211, 28)


In [73]:
test_data

array([[-7.4587110e-03,  5.8301600e-04, -3.1511542e-02, ...,
         5.9164300e-04, -1.1066079e-02,  5.3164710e-03],
       [ 1.2954900e-04,  1.5493489e-02, -1.3248013e-02, ...,
         1.8290516e-02, -3.5255450e-03,  1.6274961e-02],
       [ 4.1861480e-03, -1.2950452e-02, -3.7843513e-02, ...,
         6.0118920e-03,  4.8739030e-03, -2.6825034e-02],
       ...,
       [ 1.2019900e-03, -4.4950720e-03, -2.6761820e-03, ...,
        -8.0559850e-03,  7.2100000e-05,  2.5243420e-03],
       [-2.0627664e-02,  4.8009049e-02,  8.9365500e-04, ...,
         3.8202247e-02, -2.9928058e-02,  4.2123999e-02],
       [ 7.0497580e-03, -2.9629630e-02, -1.5654350e-03, ...,
        -1.2512951e-02,  9.9629950e-03, -4.6857384e-02]])

In [74]:
test_data = np.nan_to_num(test_data, nan=0)

print(test_data.sum())
print(train_data.sum())


0.8076596769999993
2.3474808438800006


In [75]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, time_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-1):
        a = dataset[i:(i+time_step), :]   ###i=0, 0,1,2,3-----99   100 
        #print(a.shape)
        dataX.append(a)
        dataY.append(dataset[i + time_step, :])
    return np.array(dataX), np.array(dataY)

In [76]:
# reshape into X=t,t+1,t+2,t+3 and Y=t+4
time_step = 15
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)

X_train:  (476, 15, 28)
y_train:  (476, 28)
X_test:  (195, 15, 28)
y_test (195, 28)


In [77]:
X_train.shape[1]

15

In [78]:
X_train = X_train.reshape(X_train.shape[0],X_train.shape[1]*X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1]*X_test.shape[2])
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)


X_train:  (476, 420)
X_test:  (195, 420)


In [79]:
df_2.shape

(703, 28)

In [41]:
# need to make the dataset
# X t-15 X t - 14 X t - 13 X t - 12 etc etc 
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

#svr_rbf = SVR(kernel= 'rbf', C= 1e2, gamma= 0.1) epsilon=0.2
svr_rbf = SVR(epsilon=0.002)
# Create the Multioutput Regressor
mor = MultiOutputRegressor(svr_rbf)



In [42]:
# Train the regressor
mor = mor.fit(X_train, y_train)

# Generate predictions for testing data
y_pred = mor.predict(X_test)

# Evaluate the regressor
mse_one = mean_squared_error(y_test[:,0], y_pred[:,0])
mse_two = mean_squared_error(y_test[:,1], y_pred[:,1])
mse_three = mean_squared_error(y_test[:,2], y_pred[:,2])
print(f'MSE for first regressor: {mse_one} - second regressor: {mse_two} - third regressor: {mse_three}')
mae_one = mean_absolute_error(y_test[:,0], y_pred[:,0])
mae_two = mean_absolute_error(y_test[:,1], y_pred[:,1])
mae_three = mean_absolute_error(y_test[:,2], y_pred[:,2])
print(f'MAE for first regressor: {mae_one} - second regressor: {mae_two} - third regressor: {mae_three}')

ValueError: y must have at least two dimensions for multi-output regression but has only one.

In [43]:
fig = px.line(y_test[:,1],labels={'date':'date','spy_oc_ret':'spy_oc_ret'})
fig.update_traces(marker_line_width=2, opacity=0.6)
fig.update_layout(title_text='spy_oc_ret', plot_bgcolor='white', font_size=15, font_color='black')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.add_scatter(y=y_pred[:,1], line_width=2)

fig.show()


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [44]:
np.nansum(y_train)

0.09354989600000002

In [45]:
X_test.shape

(195, 420)

In [46]:
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_val_score, cross_validate

In [47]:
import xgboost as xgb
from sklearn.svm import SVR, LinearSVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso

In [48]:
#Make an inner validation scheme for Gridsearch Cross-Validation
time_split_inner = TimeSeriesSplit(n_splits = 3) # These splits are used to find the best GS hyper-para

# Linear Support Vector Regression
#svr = LinearSVR(epsilon=10, C=35, fit_intercept=True, loss='squared_epsilon_insensitive', 
#                max_iter=30000, random_state=8)
svr = LinearSVR()

mor_svr = MultiOutputRegressor(svr)

#svr_params = {'epsilon': [0.001, 0.01, 0.02, 0.2, 0.5],
#              'fit_intercept': [0, 1],
#              'C': [1, 5, 10, 20]}

svr_params = {'epsilon': [0.2],
              'C': [20]}

In [49]:
#gs_svr = GridSearchCV(mor_svr, svr_params, cv=time_split_inner, scoring = 'neg_mean_squared_error', 
#                      n_jobs=-1, verbose=3)

#gs_svr.fit(X_train, y_train)

In [56]:
# Lasso Regression
#lasso = Lasso(fit_intercept=1, alpha=0.05, max_iter=10000, random_state=8)
lasso = Lasso()
#mor_lasso = MultiOutputRegressor(lasso)

las_params = {'fit_intercept': [1, 0],
              'alpha': [0.005, 0.01, 0.03, 0.05, 0.07, 0.1]}
gs_las = GridSearchCV(lasso, las_params, cv=time_split_inner, scoring='neg_mean_squared_error', 
                      n_jobs=-1, verbose=3)

gs_las.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
             estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': [0.005, 0.01, 0.03, 0.05, 0.07, 0.1],
                         'fit_intercept': [1, 0]},
             scoring='neg_mean_squared_error', verbose=3)

In [57]:
gs_las.best_params_

{'alpha': 0.005, 'fit_intercept': 0}

In [58]:
# Generate predictions for testing data
y_pred = gs_las.predict(X_test)

# Evaluate the regressor
#mse_one = mean_squared_error(y_test[0], y_pred[0])
#mse_two = mean_squared_error(y_test[1], y_pred[1])
#mse_three = mean_squared_error(y_test[2], y_pred[:,2])
#print(f'MSE for first regressor: {mse_one} - second regressor: {mse_two} - third regressor: {mse_three}')
#mae_one = mean_absolute_error(y_test[:,0], y_pred[:,0])
#mae_two = mean_absolute_error(y_test[:,1], y_pred[:,1])
#mae_three = mean_absolute_error(y_test[:,2], y_pred[:,2])
#print(f'MAE for first regressor: {mae_one} - second regressor: {mae_two} - third regressor: {mae_three}')

In [59]:
fig = px.line(y_test)
fig.update_traces(marker_line_width=2, opacity=0.6)
fig.update_layout(title_text='spy_oc_ret', plot_bgcolor='white', font_size=15, font_color='black')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.add_scatter(y=y_pred, line_width=2)

fig.show()


In [81]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe_svr = Pipeline([('scl', StandardScaler()),
        ('reg', MultiOutputRegressor(SVR()))])

grid_param_svr = {
    'reg__estimator__C': [0.1,1,10]
}

gs_svr = (GridSearchCV(estimator=pipe_svr, 
                      param_grid=grid_param_svr, 
                      cv=2,
                      scoring = 'neg_mean_squared_error',
                      n_jobs = -1))

gs_svr = gs_svr.fit(X_train,y_train)
gs_svr.best_estimator_    

Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), 
('reg', MultiOutputRegressor(estimator=SVR(C=10, cache_size=200,
 coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1,    
 shrinking=True, tol=0.001, verbose=True), n_jobs=1))])

Pipeline(steps=[('scl', StandardScaler()),
                ('reg',
                 MultiOutputRegressor(estimator=SVR(C=10, gamma='auto',
                                                    verbose=True),
                                      n_jobs=1))])

In [82]:
# Generate predictions for testing data
y_pred = gs_svr.predict(X_test)

# Evaluate the regressor
mse_one = mean_squared_error(y_test[:,0], y_pred[:,0])
mse_two = mean_squared_error(y_test[:,1], y_pred[:,1])
mse_three = mean_squared_error(y_test[:,2], y_pred[:,2])
print(f'MSE for first regressor: {mse_one} - second regressor: {mse_two} - third regressor: {mse_three}')
mae_one = mean_absolute_error(y_test[:,0], y_pred[:,0])
mae_two = mean_absolute_error(y_test[:,1], y_pred[:,1])
mae_three = mean_absolute_error(y_test[:,2], y_pred[:,2])
print(f'MAE for first regressor: {mae_one} - second regressor: {mae_two} - third regressor: {mae_three}')

MSE for first regressor: 0.0005068122466433905 - second regressor: 0.00019148554797326561 - third regressor: 0.0020197793940289915
MAE for first regressor: 0.021012959830769225 - second regressor: 0.01120100949230769 - third regressor: 0.03746083953041102


In [87]:
fig = px.line(y_test[:,5])
fig.update_traces(marker_line_width=2, opacity=0.6)
fig.update_layout(title_text='spy_oc_ret', plot_bgcolor='white', font_size=15, font_color='black')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.add_scatter(y=y_pred[:,5], line_width=2)

fig.show()