In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Variables
first_year = 2019
last_year = 2021
split_date ="2021-01-01"

epochs = 100

features = ["Underlying_last", "Strike", "Ttl", "Volatility", "R"]
num_features = len(features)
num_outputs = 1
seq_length = 5



In [3]:
def read_file(file):
    """Read a single file and return a dataframe"""
    return pd.read_csv(file, skipinitialspace=True)

def lag_features(df, features, seq_length):
    """Transforms a raw 2D dataframe of option data into 2D dataframe ofsequence data.
    Last 2 indexes per sequence are bid and ask price. The len(features)*seq_length
    features before are sequences of features"""
    df = df.sort_values(["Expire_date", "Strike", "Ttl"], ascending = [True, True, False])

    for step in range(seq_length)[::-1]:
        for feature in features:
            df[feature + "-" + str(step)] = df[feature].shift(step)
    
    df["Check_strike"] = df["Strike"] == df["Strike"].shift(seq_length-1)
    df["Check_expire"] = df["Expire_date"] == df["Expire_date"].shift(seq_length-1)
    df = df[(df["Check_strike"] == True) & (df["Check_expire"] == True)]
    df = df.drop(["Check_strike", "Check_expire"], axis=1)
    #df[["Bid_strike_last", "Ask_strike_last"]] = df[["Bid_strike", "Ask_strike"]]
    #df[["Bid_last", "Ask_last"]] = df[["Bid", "Ask"]]
    df["Price_last"] = df["Price"]
    df = df.sort_values(["Quote_date"], ascending = [True])
    return df

def create_train_test(df, split_date):
    """Splits data in training and test set, and transforms data to right 2D format"""
    return df[df["Quote_date"] < split_date], df[df["Quote_date"] >= split_date]

def df_to_xy(df):
    """Transforms a dataframe into two arrays of explanatory variables x and explained variables y"""
    dx = df[["Underlying_last", "Strike", "Ttl", "Volatility", "R"]]
    dy = df["Price"]
    array_x, array_y = dx.to_numpy().astype(np.float32), dy.to_numpy().astype(np.float32)
    return array_x, array_y

def min_max_scale(train, test):
    """Scales a training and test set using MinMaxScaler. The scaler is calibrated on the training set"""
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test

In [4]:
# Load dataset
df_read = read_file("../data/processed_data/2019-2021_underlying-strike_only-price.csv")


In [5]:
from datetime import datetime
from dateutil.relativedelta import relativedelta


num_models = 12

features = ["Underlying_last", "Strike", "Ttl", "Volatility", "R"]
seq_length = 5
num_features = len(features)
num_outputs = 1

df_read_lags = lag_features(df_read, features, seq_length)

train_val_test = []

month = 4
year = 0
for i in range(num_models):
    if month == 13:
        year += 1
        month = 1
    train_start = datetime(2020 + year, month, 1)
    val_start = train_start + relativedelta(months=8)
    test_start = val_start + relativedelta(months=1)
    test_end = test_start + relativedelta(months=1)

    month += 1

    df_train_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(train_start)) & (df_read_lags.loc[:, "Quote_date"] < str(val_start)), :]
    df_val_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(val_start)) & (df_read_lags.loc[:, "Quote_date"] < str(test_start)), :]
    df_test_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(test_start)) & (df_read_lags.loc[:, "Quote_date"] < str(test_end)), :]

    train_x_org, train_y_org = df_to_xy(df_train_orginal)
    val_x_org, val_y_org = df_to_xy(df_val_orginal)
    test_x_org, test_y_org = df_to_xy(df_test_orginal)

    scaler = MinMaxScaler()
    train_x_scaled = scaler.fit_transform(train_x_org)
    val_x_scaled = scaler.transform(val_x_org)
    test_x_scaled = scaler.transform(test_x_org)

    """shuffle = np.random.permutation(len(train_x_scaled))
    train_x_scaled, train_y_scaled = train_x_scaled[shuffle], train_y_scaled[shuffle]"""

    train_x_scaled = np.reshape(train_x_scaled, (len(train_x_scaled), num_features))
    val_x_scaled = np.reshape(val_x_scaled, (len(val_x_scaled), num_features))
    test_x_scaled = np.reshape(test_x_scaled, (len(test_x_scaled), num_features))

    # print(f"Train_x shape: {train_x_scaled.shape}, train_y shape: {train_y_org.shape}")
    # print(f"Test_x shape: {test_x_scaled.shape}, test_y shape: {test_y_org.shape}")
    # print("------------------------------------------------")
    train_val_test.append(((train_x_scaled, train_y_org), (val_x_scaled, val_y_org), (test_x_scaled, test_y_org)))




In [6]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, BatchNormalization
from keras import backend as K
from tensorflow.keras.optimizers import AdamW
import keras as KER
from sklearn.model_selection import train_test_split
from keras.activations import linear, relu
from datetime import datetime
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError

In [7]:
def create_model(config):
  """Builds a model of minimum 2 layers sequentially from a given config dictionary"""
  model = Sequential()

  model.add(Dense(
    units = config["units"],
    activation = relu,
    input_shape = (config["num_features"],)
  ))

  model.add(BatchNormalization(
    momentum = config["bn_momentum"]
  ))


  for i in range(config["layers"]-2):
    model.add(Dense(
      units = config["units"],
      activation = relu
    ))
    model.add(BatchNormalization(
      momentum = config["bn_momentum"]
    ))

  model.add(Dense(
    units = config["units"],
    activation = relu
  ))

  model.add(BatchNormalization(
    momentum = config["bn_momentum"]
  ))

  model.add(Dense(
    units = num_outputs,
    activation = relu
  ))  

  model.compile(
    optimizer = AdamW(
      learning_rate = config["learning_rate"],
      weight_decay = config["weight_decay"]
    ),
    loss = "mse",
  )

  return model

In [11]:
from keras.callbacks import EarlyStopping
import tensorflow as tf

config = {
    "units": 32,
    "learning_rate": 0.004469423596275494,
    "layers": 4,
    "seq_length": seq_length,
    "num_features": num_features,
    "bn_momentum" : 0.30057069329591907,
    "weight_decay" : 0.00042470893538329376,
}

def trainer(train_x, train_y, model, val_x, val_y):
    epochs = 100
    minibatch_size = 4096

    tf.random.set_seed(2)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta = 1,
        patience = 15,
    )

    model.fit(
        train_x,
        train_y,
        batch_size = minibatch_size,
        validation_data = (val_x, val_y),
        epochs = epochs,
        callbacks = [early_stopping]
    )

predictions = []
for i, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) in enumerate(train_val_test):
    if i == 10:
        model = create_model(config)
        model.summary()
        trainer(x_train, y_train, model, x_val, y_val)

        pred = np.array(model(x_test)).flatten()
        print("Test loss:", np.mean((pred-y_test)**2))

        predictions.append(np.array(model(x_test)))

predictions = np.concatenate(predictions)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 1068984.0000 - val_loss: 731658.2500
Epoch 2/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 574335.0000 - val_loss: 155501.0938
Epoch 3/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 107305.3438 - val_loss: 9192.6895
Epoch 4/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4849.9995 - val_loss: 289.0832
Epoch 5/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 576.2460 - val_loss: 996.8809
Epoch 6/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 542.3729 - val_loss: 143.3677
Epoch 7/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 515.4399 - val_loss: 283.8382
Epoch 8/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 518.2140 - v

In [9]:
def prediction(df_test, predictions):
    # df_test["Prediction"] = predictions.flatten()
    df_test["Prediction"] = predictions
    return df_test

df_test_whole = df_read_lags.loc[df_read_lags.loc[:, "Quote_date"] >= "2021-01-01", :]
df_test_whole = prediction(df_test_whole, predictions)

from pathlib import Path
from datetime import datetime

time = datetime.now()
time = time.strftime("%m-%d_%H-%M")

filename = f"../data/Predictions/{last_year}_predictions_{time}.csv"
filepath = Path(filename)
filepath.parent.mkdir(parents=True, exist_ok = True)
df_test_whole.to_csv(filename)
#df_test.info()
#print(df_test.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction"] = predictions


In [15]:
# Run this if one month is very bad

if False:
    df_test_whole = pd.read_csv("../data/Predictions/2021_predictions_09-30_09-45.csv")
    df_test_whole.loc[(df_test_whole.loc[:, "Quote_date"] >= "2021-11-01 00:00:00") & (df_test_whole.loc[:, "Quote_date"] < "2021-12-01 00:00:00"), "Prediction"] = predictions

    from datetime import datetime
    time = datetime.now()
    time = time.strftime("%m-%d_%H-%M")

    filename = f"../data/Predictions/{last_year}_predictions_{time}.csv"
    filepath = Path(filename)
    filepath.parent.mkdir(parents=True, exist_ok = True)
    df_test_whole.to_csv(filename)

    df_test_whole.info()
    print(df_test_whole.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845482 entries, 0 to 1845481
Data columns (total 38 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Unnamed: 0.1          int64  
 1   Unnamed: 0            int64  
 2   Quote_date            object 
 3   Expire_date           object 
 4   Price                 float64
 5   Underlying_last       float64
 6   Strike                float64
 7   Ttl                   int64  
 8   Volatility            float64
 9   Volatility_GJR_GARCH  float64
 10  R                     float64
 11  Underlying_last-4     float64
 12  Strike-4              float64
 13  Ttl-4                 float64
 14  Volatility-4          float64
 15  R-4                   float64
 16  Underlying_last-3     float64
 17  Strike-3              float64
 18  Ttl-3                 float64
 19  Volatility-3          float64
 20  R-3                   float64
 21  Underlying_last-2     float64
 22  Strike-2              float64
 23  Ttl-2  