In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pathlib import Path
import math

In [2]:
#Variables
first_year = 2019
last_year = 2021
split_date ="2021-01-01"

epochs = 100

features = ["Underlying_last", "Strike", "Ttl", "Volatility_GJR_GARCH", "R"]
num_features = len(features)
num_outputs = 1
seq_length = 5



In [3]:
def read_file(file):
    """Read a single file and return a dataframe"""
    return pd.read_csv(file, skipinitialspace=True)

def lag_features(df, features, seq_length):
    """Transforms a raw 2D dataframe of option data into 2D dataframe ofsequence data.
    Last 2 indexes per sequence are bid and ask price. The len(features)*seq_length
    features before are sequences of features"""
    df = df.sort_values(["Expire_date", "Strike", "Ttl"], ascending = [True, True, False])

    for step in range(seq_length)[::-1]:
        for feature in features:
            df[feature + "-" + str(step)] = df[feature].shift(step)
    
    df["Check_strike"] = df["Strike"] == df["Strike"].shift(seq_length-1)
    df["Check_expire"] = df["Expire_date"] == df["Expire_date"].shift(seq_length-1)
    df = df[(df["Check_strike"] == True) & (df["Check_expire"] == True)]
    df = df.drop(["Check_strike", "Check_expire"], axis=1)
    #df[["Bid_strike_last", "Ask_strike_last"]] = df[["Bid_strike", "Ask_strike"]]
    #df[["Bid_last", "Ask_last"]] = df[["Bid", "Ask"]]
    df["Price_last"] = df["Price"]
    df = df.sort_values(["Quote_date"], ascending = [True])
    return df

def create_train_test(df, split_date):
    """Splits data in training and test set, and transforms data to right 2D format"""
    return df[df["Quote_date"] < split_date], df[df["Quote_date"] >= split_date]

def df_to_xy(df):
    """Transforms a dataframe into two arrays of explanatory variables x and explained variables y"""
    dx = df[["Underlying_last", "Strike", "Ttl", "Volatility_GJR_GARCH", "R"]]
    dy = df["Price"]
    array_x, array_y = dx.to_numpy().astype(np.float32), dy.to_numpy().astype(np.float32)
    return array_x, array_y

def min_max_scale(train, test):
    """Scales a training and test set using MinMaxScaler. The scaler is calibrated on the training set"""
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test

In [4]:
# Load dataset
df_read = read_file("../data/processed_data/2019-2021_underlying-strike_only-price.csv")
print(df_read)
df_read.info()
print(df_read)
print(df_read["Ttl"].max())


         Unnamed: 0  Quote_date Expire_date     Price  Underlying_last  \
0           1354913  2019-01-02  2019-01-04  1707.050          2509.98   
1           1354914  2019-01-02  2019-01-04  1607.495          2509.98   
2           1354915  2019-01-02  2019-01-04  1507.500          2509.98   
3           1354916  2019-01-02  2019-01-04  1458.295          2509.98   
4           1354917  2019-01-02  2019-01-04  1408.300          2509.98   
...             ...         ...         ...       ...              ...   
5123793     6521988  2021-12-31  2024-12-20   150.000          4766.39   
5123794     6521989  2021-12-31  2024-12-20   150.000          4766.39   
5123795     6521990  2021-12-31  2024-12-20   150.900          4766.39   
5123796     6521991  2021-12-31  2024-12-20   150.000          4766.39   
5123797     6521992  2021-12-31  2024-12-20   150.000          4766.39   

         Strike   Ttl  Volatility  Volatility_GJR_GARCH     R  
0         800.0     2    0.202726              

In [5]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

training_period = 10
val_period = 1
test_period = 1
num_models = 12

features = ["Underlying_last", "Strike", "Ttl", "Volatility_GJR_GARCH", "R"]
seq_length = 5
num_features = len(features)
num_outputs = 1

df_read_lags = lag_features(df_read, features, seq_length)

train_val_test = []

month = 4
year = 0
for i in range(num_models):
    if month == 13:
        year += 1
        month = 1
    train_start = datetime(2020 + year, month, 1)
    val_start = train_start + relativedelta(months=8)
    test_start = val_start + relativedelta(months=1)
    test_end = test_start + relativedelta(months=1)

    month += 1

    df_train_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(train_start)) & (df_read_lags.loc[:, "Quote_date"] < str(val_start)), :]
    df_val_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(val_start)) & (df_read_lags.loc[:, "Quote_date"] < str(test_start)), :]
    df_test_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(test_start)) & (df_read_lags.loc[:, "Quote_date"] < str(test_end)), :]

    train_x_org, train_y_org = df_to_xy(df_train_orginal)
    val_x_org, val_y_org = df_to_xy(df_val_orginal)
    test_x_org, test_y_org = df_to_xy(df_test_orginal)

    scaler = MinMaxScaler()
    train_x_scaled = scaler.fit_transform(train_x_org)
    val_x_scaled = scaler.transform(val_x_org)
    test_x_scaled = scaler.transform(test_x_org)

    """shuffle = np.random.permutation(len(train_x_scaled))
    train_x_scaled, train_y_scaled = train_x_scaled[shuffle], train_y_scaled[shuffle]"""

    train_x_scaled = np.reshape(train_x_scaled, (len(train_x_scaled), num_features))
    val_x_scaled = np.reshape(val_x_scaled, (len(val_x_scaled), num_features))
    test_x_scaled = np.reshape(test_x_scaled, (len(test_x_scaled), num_features))

    # print(f"Train_x shape: {train_x_scaled.shape}, train_y shape: {train_y_org.shape}")
    # print(f"Test_x shape: {test_x_scaled.shape}, test_y shape: {test_y_org.shape}")
    # print("------------------------------------------------")
    train_val_test.append(((train_x_scaled, train_y_org), (val_x_scaled, val_y_org), (test_x_scaled, test_y_org)))




In [6]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, BatchNormalization
from keras import backend as K
from tensorflow.keras.optimizers import AdamW
import keras as KER
from sklearn.model_selection import train_test_split
from keras.activations import linear, relu
from datetime import datetime
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError

In [7]:
def create_model(config):
  """Builds a model of minimum 2 layers sequentially from a given config dictionary"""
  model = Sequential()

  model.add(Dense(
    units = config["units"],
    activation = relu,
    input_shape = (config["num_features"],)
  ))

  model.add(BatchNormalization(
    momentum = config["bn_momentum"]
  ))


  for i in range(config["layers"]-2):
    model.add(Dense(
      units = config["units"],
      activation = relu
    ))
    model.add(BatchNormalization(
      momentum = config["bn_momentum"]
    ))

  model.add(Dense(
    units = config["units"],
    activation = relu
  ))

  model.add(BatchNormalization(
    momentum = config["bn_momentum"]
  ))

  model.add(Dense(
    units = num_outputs,
    activation = relu
  ))  

  model.compile(
    optimizer = AdamW(
      learning_rate = config["learning_rate"],
      weight_decay = config["weight_decay"]
    ),
    loss = "mse",
  )

  return model

In [27]:
from keras.callbacks import EarlyStopping
import tensorflow as tf

config = {
    "units": 96,
    "learning_rate": 0.004102449498283615,
    "layers": 6,
    "seq_length": seq_length,
    "num_features": num_features,
    "bn_momentum" : 0.32753376728017486,
    "weight_decay" : 0.0002017422068564576
}

def trainer(train_x, train_y, model, val_x, val_y):
    epochs = 100
    minibatch_size = 2048

    tf.random.set_seed(5)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta = 1,
        patience = 5,
    )

    model.fit(
        train_x,
        train_y,
        batch_size = minibatch_size,
        validation_data = (val_x, val_y),
        epochs = epochs,
        callbacks = [early_stopping]
    )

predictions = []
for i, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) in enumerate(train_val_test):
    if i == 9:
        model = create_model(config)
        model.summary()
        trainer(x_train, y_train, model, x_val, y_val)

        pred = np.array(model(x_test)).flatten()
        print("Test loss:", np.mean((pred-y_test)**2))
        
        predictions.append(np.array(model(x_test)))

# predictions = np.array(predictions)
predictions = np.concatenate(predictions)

Epoch 1/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 781317.6875 - val_loss: 2645.0242
Epoch 2/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 1869.5724 - val_loss: 2582.5642
Epoch 3/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 981.2454 - val_loss: 2540.4434
Epoch 4/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 924.7442 - val_loss: 2356.3928
Epoch 5/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 895.4520 - val_loss: 2271.3467
Epoch 6/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 890.8079 - val_loss: 2277.8105
Epoch 7/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 881.9214 - val_loss: 1761.8035
Epoch 8/100
[1m585/585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 868.7173 - val_loss

In [13]:
def prediction(df_test, predictions):
    # df_test["Prediction"] = predictions.flatten()
    df_test["Prediction"] = predictions

    # m = MeanSquaredError()
    # m.update_state(test_y_org, predictions)
    # print("MSE from model:", m.result().numpy())
    # m = RootMeanSquaredError()
    # m.update_state(test_y_org, predictions)
    # print("RMSE from model:", m.result().numpy())

    return df_test

df_test_whole = df_read_lags.loc[df_read_lags.loc[:, "Quote_date"] >= "2021-01-01", :]
df_test_whole = prediction(df_test_whole, predictions)

#print(train_y_org[:, :1].min(), train_y_org[:, :1].max())
#print(train_y_org[:, 1:].min(), train_y_org[:, 1:].max())

from pathlib import Path
from datetime import datetime

time = datetime.now()
time = time.strftime("%m-%d_%H-%M")

filename = f"../data/Predictions/{last_year}_predictions_{time}_GARCH.csv"
filepath = Path(filename)
filepath.parent.mkdir(parents=True, exist_ok = True)
df_test_whole.to_csv(filename)
#df_test.info()
#print(df_test.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction"] = predictions


In [29]:
# Run this if one month is very bad

if True:
    df_test_whole = pd.read_csv("../data/Predictions/2021_predictions_09-30_14-42_GARCH.csv")
    df_test_whole.loc[(df_test_whole.loc[:, "Quote_date"] >= "2021-10-01 00:00:00") & (df_test_whole.loc[:, "Quote_date"] < "2021-11-01 00:00:00"), "Prediction"] = predictions

    from datetime import datetime
    time = datetime.now()
    time = time.strftime("%m-%d_%H-%M")

    filename = f"../data/Predictions/{last_year}_predictions_{time}_GARCH.csv"
    filepath = Path(filename)
    filepath.parent.mkdir(parents=True, exist_ok = True)
    df_test_whole.to_csv(filename)

    df_test_whole.info()
    print(df_test_whole.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845482 entries, 0 to 1845481
Data columns (total 39 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Unnamed: 0.2            int64  
 1   Unnamed: 0.1            int64  
 2   Unnamed: 0              int64  
 3   Quote_date              object 
 4   Expire_date             object 
 5   Price                   float64
 6   Underlying_last         float64
 7   Strike                  float64
 8   Ttl                     int64  
 9   Volatility              float64
 10  Volatility_GJR_GARCH    float64
 11  R                       float64
 12  Underlying_last-4       float64
 13  Strike-4                float64
 14  Ttl-4                   float64
 15  Volatility_GJR_GARCH-4  float64
 16  R-4                     float64
 17  Underlying_last-3       float64
 18  Strike-3                float64
 19  Ttl-3                   float64
 20  Volatility_GJR_GARCH-3  float64
 21  R-3                     float64