In [1]:
# imports

import numpy as np
from numpy import mean
from numpy import std
from numpy import absolute
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats.mstats import winsorize
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import math
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler


In [4]:
# Source: https://www.kaggle.com/datasets/abhishekgautam12/jpmc-quant?select=Train.csv
# Read the raw table and index the rows by their 'Date' value.
# drop=False keeps 'Date' available as a regular column as well.
data = pd.read_csv('Train.csv').set_index('Date', drop=False)

In [5]:
# Restrict the data to the rows of a single instrument ('Stock 1').
# Both names start from the same filtered rows; the date-based train/test
# boundaries are applied to them afterwards.
is_stock_1 = data['Stock'].eq('Stock 1')

train_1 = data[is_stock_1]
test_1 = data[is_stock_1]

In [6]:
# Split boundaries as ISO 'YYYY-MM-DD' strings; they are compared against the
# string-valued Date index. The test split begins at row position 115 of the
# filtered series (the training split holds the 115 rows before it).
train_start_dt, test_start_dt = '2006-07-31', '2016-02-29'

In [7]:
# Boolean masks over the string 'Date' index (ISO dates order correctly as text).
in_train_window = (train_1.index >= train_start_dt) & (train_1.index < test_start_dt)
in_test_window = test_1.index >= test_start_dt

# Select first, then copy: each split becomes an independent single-column frame,
# so overwriting its 'price' column later neither touches `data` nor triggers
# pandas' SettingWithCopyWarning (copying *before* filtering did not prevent that).
train = train_1.loc[in_train_window, ['price']].copy()
test = test_1.loc[in_test_window, ['price']].copy()

print('Training data shape: ', train.shape)
print('Test data shape: ', test.shape)

Training data shape:  (115, 1)
Test data shape:  (35, 1)


In [8]:
# Summarize the training frame (index range, non-null counts, dtypes) before scaling.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115 entries, 2006-07-31 to 2016-01-29
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   price   115 non-null    float64
dtypes: float64(1)
memory usage: 1.8+ KB


In [9]:
# Min-max normalization (not z-score standardization): the scaler is fitted on the
# training prices only, then the 'price' column is overwritten with the scaled values.
# NOTE: `train` is mutated in place, so re-running this cell would fit the scaler
# on already-scaled data.
scaler = MinMaxScaler().fit(train)
train['price'] = scaler.transform(train)
train.head(5)


Unnamed: 0_level_0,price
Date,Unnamed: 1_level_1
2006-07-31,0.0
2006-08-31,0.069586
2006-09-29,0.114503
2006-10-31,0.161978
2006-11-30,0.150476


In [10]:
# Scale the test prices with the scaler that was fitted on the training split
# (transform only, no re-fit). This overwrites test['price'] in place.
test['price'] = scaler.transform(test)
test.head(5)

Unnamed: 0_level_0,price
Date,Unnamed: 1_level_1
2016-02-29,0.961011
2016-03-31,0.925429
2016-04-29,0.955056
2016-05-31,0.945989
2016-06-30,0.936828


In [11]:
# Pull out the underlying 2-D NumPy arrays (a single 'price' column each).
train_data, test_data = train.to_numpy(), test.to_numpy()

In [12]:
# Window length: every sample spans `timesteps` consecutive prices.
# The first timesteps-1 values of a window are later used as model inputs
# and the last value as the target.

timesteps=3

In [13]:
# Build overlapping windows of `timesteps` consecutive prices: row i holds
# train_data[i], ..., train_data[i + timesteps - 1], all taken from column 0.
# The index grid has one row per window start and one column per offset.
train_window_idx = np.arange(len(train_data) - timesteps + 1)[:, None] + np.arange(timesteps)
train_data_timesteps = train_data[train_window_idx, 0]
train_data_timesteps.shape

(113, 3)

In [14]:
# Build overlapping windows of `timesteps` consecutive prices: row i holds
# test_data[i], ..., test_data[i + timesteps - 1], all taken from column 0.
# The index grid has one row per window start and one column per offset.
test_window_idx = np.arange(len(test_data) - timesteps + 1)[:, None] + np.arange(timesteps)
test_data_timesteps = test_data[test_window_idx, 0]
test_data_timesteps.shape

(33, 3)

In [15]:
# Within each window the leading timesteps-1 prices are the features and the
# final price is the target. The target is selected with a list index so it
# stays 2-D (n, 1), the shape the scaler needs when it is inverted later.
n_lags = timesteps - 1

x_train = train_data_timesteps[:, :n_lags]
y_train = train_data_timesteps[:, [n_lags]]

x_test = test_data_timesteps[:, :n_lags]
y_test = test_data_timesteps[:, [n_lags]]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(113, 2) (113, 1)
(33, 2) (33, 1)


In [16]:
# Support vector regressor with an RBF kernel; the hyperparameters are fixed by hand.
model = SVR(
    kernel='rbf',
    C=10,
    gamma=0.5,
    epsilon=0.05,
)


In [17]:
# y_train is a column of shape (n, 1); flatten it to the 1-D target vector
# before fitting.
model.fit(x_train, y_train.ravel())


SVR(C=10, epsilon=0.05, gamma=0.5)

In [18]:
# NOTE(review): this expression builds a new, unfitted SVR and discards it (nothing
# is assigned), so it has no effect on `model`. It looks like a pasted estimator
# repr rather than an intentional step -- confirm and consider removing the cell.
SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.05, gamma=0.5,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

SVR(C=10, epsilon=0.05, gamma=0.5)

In [19]:
# Predict on both splits and turn each 1-D result into a column vector (n, 1),
# the layout required by the scaler's inverse_transform further down.
y_train_pred = model.predict(x_train)[:, np.newaxis]
y_test_pred = model.predict(x_test)[:, np.newaxis]

print(y_train_pred.shape, y_test_pred.shape)

(113, 1) (33, 1)


In [20]:
# Map the predictions from the scaled range back to original price units.
# NOTE: the names are rebound to the un-scaled arrays, so running this cell a
# second time would apply the inverse transform twice.
y_train_pred, y_test_pred = (
    scaler.inverse_transform(scaled) for scaled in (y_train_pred, y_test_pred)
)

print(len(y_train_pred), len(y_test_pred))

113 33


In [21]:
# Undo the min-max scaling on the true targets so they are expressed in original
# price units, like the inverse-transformed predictions they are compared with.
# NOTE: re-running this cell would apply the inverse transform a second time.
y_train, y_test = (
    scaler.inverse_transform(scaled) for scaled in (y_train, y_test)
)

print(len(y_train), len(y_test))

113 33


In [22]:
# Timestamps aligned with the windowed targets: the first timesteps-1 rows of each
# split are only ever used as inputs, so the targets start at position timesteps-1.
# The index is taken from the `train`/`test` frames the windows were built from,
# rather than re-filtering `train_1` (previously with a label slice for the test
# part), so the labels cannot drift from the data if the split logic changes.
train_timestamps = train.index[timesteps-1:]
test_timestamps = test.index[timesteps-1:]

print(len(train_timestamps), len(test_timestamps))

113 33


In [23]:
# defining MAPE

def mape(predictions, actuals):
    """Mean absolute percentage error, returned as a fraction.

    Multiply the result by 100 to express it in percent.

    Parameters
    ----------
    predictions : array-like
        Predicted values.
    actuals : array-like
        Ground-truth values. Entries equal to zero make the corresponding
        ratio infinite or NaN (NumPy division semantics).

    Returns
    -------
    float
        mean(|predictions - actuals| / |actuals|) over all elements.
    """
    predictions = np.asarray(predictions)
    actuals = np.asarray(actuals)
    # Inputs that differ only by singleton axes, e.g. (n,) vs (n, 1), would
    # otherwise broadcast to an (n, n) matrix and silently yield a wrong mean;
    # drop the singleton axes so the comparison is element-wise.
    if predictions.shape != actuals.shape:
        squeezed_pred, squeezed_act = np.squeeze(predictions), np.squeeze(actuals)
        if squeezed_pred.shape == squeezed_act.shape:
            predictions, actuals = squeezed_pred, squeezed_act
    # Divide by the magnitude of the actuals: with a signed denominator, negative
    # actuals produce negative "errors" that cancel positive ones.
    return (np.absolute(predictions - actuals) / np.absolute(actuals)).mean()

In [24]:
# mape() returns a fraction, so it is multiplied by 100 to report a percentage.
print('MAPE for training data: ', mape(y_train_pred, y_train)*100, '%')


MAPE for training data:  0.6501702034172042 %


In [25]:
# mape() returns a fraction, so it is multiplied by 100 to report a percentage.
print('MAPE for testing data: ', mape(y_test_pred, y_test)*100, '%')


MAPE for testing data:  0.6509046338767247 %


In [26]:
# accuracy score 

def accuracy_fn(y_true, y_pred, tolerance=0.1):
    """Percentage of predictions lying within a relative tolerance of the truth.

    A prediction counts as correct when
    ``|y_pred - y_true| < |tolerance * y_true|`` (strict inequality, so a true
    value of exactly 0 can never be matched).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; any shape, compared element by element after
        flattening.
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``.
    tolerance : float, default 0.1
        Allowed relative deviation (0.1 means within 10% of the true value).

    Returns
    -------
    float
        Share of correct predictions, in percent (0.0 to 100.0).

    Raises
    ------
    ValueError
        If the inputs are empty or contain different numbers of elements.
    """
    true_vals = np.asarray(y_true, dtype=float).ravel()
    pred_vals = np.asarray(y_pred, dtype=float).ravel()
    if true_vals.size != pred_vals.size:
        # Scoring only the overlapping part would silently hide a misalignment.
        raise ValueError(
            'y_true and y_pred must have the same number of elements, got '
            f'{true_vals.size} and {pred_vals.size}'
        )
    if pred_vals.size == 0:
        raise ValueError('accuracy is undefined for empty inputs')

    within_tolerance = np.abs(pred_vals - true_vals) < np.abs(tolerance * true_vals)
    n_correct = int(np.count_nonzero(within_tolerance))
    return (n_correct * 1.0) / within_tolerance.size * 100

In [27]:
# Share (in percent) of test predictions that deviate from the true price by less
# than 10% of it, as computed by accuracy_fn (true values first, predictions second).
print('Accuracy Score: ', accuracy_fn(y_test, y_test_pred))

Accuracy Score:  100.0
