In [14]:
from lstm import create_model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import date
from pathlib import Path

In [15]:
def read_file(file):
    """Read a single file and return a dataframe"""
    return pd.read_csv(file, skipinitialspace=True)

def lag_features(df, features, seq_length):
    """Transforms a raw 2D dataframe of option data into 2D dataframe ofsequence data.
    Last 2 indexes per sequence are bid and ask price. The len(features)*seq_length
    features before are sequences of features"""
    df = df.sort_values(["Expire_date", "Strike", "Ttl"], ascending = [True, True, False])
    
    # Adding lag for naive benchmarking
    #df["Naive"] = df["Price"].shift(1)

    for step in range(seq_length)[::-1]:
        for feature in features:
            df[feature + "-" + str(step)] = df[feature].shift(step)
    
    df["Check_strike"] = df["Strike"] == df["Strike"].shift(seq_length-1)
    df["Check_expire"] = df["Expire_date"] == df["Expire_date"].shift(seq_length-1)
    df = df[(df["Check_strike"] == True) & (df["Check_expire"] == True)]
    df = df.drop(["Check_strike", "Check_expire"], axis=1)
    #df[["Bid_strike_last", "Ask_strike_last"]] = df[["Bid_strike", "Ask_strike"]]
    #df[["Bid_last", "Ask_last"]] = df[["Bid", "Ask"]]
    df["Price_last"] = df["Price"]
    df = df.sort_values(["Quote_date"], ascending = [True])
    return df

def df_to_xy(df, num_features, seq_length, num_outputs):
    """Transforms a dataframe into two arrays of explanatory variables x and explained variables y"""
    array = df.to_numpy()
    array_x, array_y = array[:, -num_features*seq_length - num_outputs:-num_outputs].astype(np.float32), array[:,-num_outputs:].astype(np.float32)
    return array_x, array_y

In [16]:
first_year = 2019
last_year = 2021
file = f"../data/processed_data/{first_year}-{last_year}_underlying-strike_only-price.csv"

df_read = read_file(file)
print(df_read)
df_read.info()
print(df_read)
print(df_read["Ttl"].max())

         Unnamed: 0  Quote_date Expire_date     Price  Underlying_last  \
0           1354913  2019-01-02  2019-01-04  1707.050          2509.98   
1           1354914  2019-01-02  2019-01-04  1607.495          2509.98   
2           1354915  2019-01-02  2019-01-04  1507.500          2509.98   
3           1354916  2019-01-02  2019-01-04  1458.295          2509.98   
4           1354917  2019-01-02  2019-01-04  1408.300          2509.98   
...             ...         ...         ...       ...              ...   
5123793     6521988  2021-12-31  2024-12-20   150.000          4766.39   
5123794     6521989  2021-12-31  2024-12-20   150.000          4766.39   
5123795     6521990  2021-12-31  2024-12-20   150.900          4766.39   
5123796     6521991  2021-12-31  2024-12-20   150.000          4766.39   
5123797     6521992  2021-12-31  2024-12-20   150.000          4766.39   

         Strike   Ttl  Volatility  Volatility_GJR_GARCH     R  
0         800.0     2    0.202726              

In [17]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

training_period = 10
val_period = 1
test_period = 1
num_models = 12

features = ["Underlying_last", "Strike", "Ttl", "Volatility", "R"]
seq_length = 5
num_features = len(features)
num_outputs = 1

df_read_lags = lag_features(df_read, features, seq_length)

train_val_test = []

month = 4
year = 0
for i in range(num_models):
    if month == 13:
        year += 1
        month = 1
    train_start = datetime(2020 + year, month, 1)
    val_start = train_start + relativedelta(months=8)
    test_start = val_start + relativedelta(months=1)
    test_end = test_start + relativedelta(months=1)

    month += 1

    df_train_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(train_start)) & (df_read_lags.loc[:, "Quote_date"] < str(val_start)), :]
    df_val_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(val_start)) & (df_read_lags.loc[:, "Quote_date"] < str(test_start)), :]
    df_test_orginal = df_read_lags.loc[(df_read_lags.loc[:, "Quote_date"] >= str(test_start)) & (df_read_lags.loc[:, "Quote_date"] < str(test_end)), :]

    train_x_org, train_y_org = df_to_xy(df_train_orginal, num_features, seq_length, num_outputs)
    val_x_org, val_y_org = df_to_xy(df_val_orginal, num_features, seq_length, num_outputs)
    test_x_org, test_y_org = df_to_xy(df_test_orginal, num_features, seq_length, num_outputs)

    scaler = MinMaxScaler()
    train_x_scaled = scaler.fit_transform(train_x_org)
    val_x_scaled = scaler.transform(val_x_org)
    test_x_scaled = scaler.transform(test_x_org)

    print(month, test_x_scaled.shape)
    print(test_start, test_end)

    """shuffle = np.random.permutation(len(train_x_scaled))
    train_x_scaled, train_y_scaled = train_x_scaled[shuffle], train_y_scaled[shuffle]"""

    train_x_scaled = np.reshape(train_x_scaled, (len(train_x_scaled), seq_length, num_features))
    val_x_scaled = np.reshape(val_x_scaled, (len(val_x_scaled), seq_length, num_features))
    test_x_scaled = np.reshape(test_x_scaled, (len(test_x_scaled), seq_length, num_features))

    # print(f"Train_x shape: {train_x_scaled.shape}, train_y shape: {train_y_org.shape}")
    # print(f"Test_x shape: {test_x_scaled.shape}, test_y shape: {test_y_org.shape}")
    # print("------------------------------------------------")
    train_val_test.append(((train_x_scaled, train_y_org), (val_x_scaled, val_y_org), (test_x_scaled, test_y_org)))




5 (134329, 25)
2021-01-01 00:00:00 2021-02-01 00:00:00
6 (123140, 25)
2021-02-01 00:00:00 2021-03-01 00:00:00
7 (159777, 25)
2021-03-01 00:00:00 2021-04-01 00:00:00
8 (149256, 25)
2021-04-01 00:00:00 2021-05-01 00:00:00
9 (155258, 25)
2021-05-01 00:00:00 2021-06-01 00:00:00
10 (152434, 25)
2021-06-01 00:00:00 2021-07-01 00:00:00
11 (153033, 25)
2021-07-01 00:00:00 2021-08-01 00:00:00
12 (169230, 25)
2021-08-01 00:00:00 2021-09-01 00:00:00
13 (142146, 25)
2021-09-01 00:00:00 2021-10-01 00:00:00
2 (162490, 25)
2021-10-01 00:00:00 2021-11-01 00:00:00
3 (168877, 25)
2021-11-01 00:00:00 2021-12-01 00:00:00
4 (175512, 25)
2021-12-01 00:00:00 2022-01-01 00:00:00


In [18]:
# df_to = df_test_orginal

# df_a = df_to[(df_to["Expire_date"] == "2021-05-19") & (df_to["Strike"] == 3500)]

# print(df_a)

In [19]:
# df_ax, df_ay = df_to_xy(df_a, num_features, seq_length, num_outputs)

# print(df_ax[-1])
# print(df_ay[-1])

In [23]:
from keras.callbacks import EarlyStopping
import tensorflow as tf

config = {
        "units": 64,
        "learning_rate": 0.002594627161103502,
        "layers": 5,
        "bn_momentum" : 0.26212094315874734,
        "weight_decay": 0.0003327609151101109,
        "seq_length": seq_length,
        "num_features": num_features,
    }

def trainer(train_x, train_y, model, val_x, val_y):
    epochs = 100
    minibatch_size = 4096

    tf.random.set_seed(2)

    early_stopping = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta = 0,
        patience = 10,
    )

    model.fit(
        train_x,
        train_y,
        batch_size = minibatch_size,
        # validation_split = 0.3,
        validation_data = (val_x, val_y),
        epochs = epochs,
        callbacks = [early_stopping]
    )

predictions = []
for i, ((x_train, y_train), (x_val, y_val), (x_test, y_test)) in enumerate(train_val_test):
    if i == 10:
        model = create_model(config)
        model.summary()
        trainer(x_train, y_train, model, x_val, y_val)
        predictions.append(np.array(model(x_test)))

# predictions = np.array(predictions)
predictions = np.concatenate(predictions)


"""path = f"./runs/model_w_validation/{first_year}-{last_year}-{date.today()}"
model.save(path)"""

Epoch 1/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 152ms/step - loss: 1075989.8750 - mae: 683.3654 - val_loss: 867477.8125 - val_mae: 605.0748
Epoch 2/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 154ms/step - loss: 704393.1875 - mae: 560.7054 - val_loss: 341707.1875 - val_mae: 388.0759
Epoch 3/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - loss: 244127.2969 - mae: 332.8283 - val_loss: 72072.1016 - val_mae: 188.2404
Epoch 4/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 152ms/step - loss: 34498.8242 - mae: 124.7368 - val_loss: 5504.6323 - val_mae: 51.5197
Epoch 5/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 152ms/step - loss: 2054.4939 - mae: 29.3727 - val_loss: 765.4004 - val_mae: 19.7287
Epoch 6/100
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 151ms/step - loss: 522.1740 - mae: 14.4236 - val_loss: 573.5667 - val_mae: 20.2686

'path = f"./runs/model_w_validation/{first_year}-{last_year}-{date.today()}"\nmodel.save(path)'

In [12]:
def prediction(df_test, predictions):
    # df_test["Prediction"] = predictions.flatten()
    df_test["Prediction"] = predictions
    return df_test

df_test_whole = df_read_lags.loc[df_read_lags.loc[:, "Quote_date"] >= "2021-01-01", :]
df_test_whole = prediction(df_test_whole, predictions)

from datetime import datetime
time = datetime.now()
time = time.strftime("%m-%d_%H-%M")

filename = f"../data/Predictions/{last_year}_predictions_{time}_LSTM.csv"
filepath = Path(filename)
filepath.parent.mkdir(parents=True, exist_ok = True)
df_test_whole.to_csv(filename)

df_test_whole.info()
print(df_test_whole.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prediction"] = predictions


<class 'pandas.core.frame.DataFrame'>
Index: 1845482 entries, 3102998 to 5123797
Data columns (total 37 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Unnamed: 0            int64  
 1   Quote_date            object 
 2   Expire_date           object 
 3   Price                 float64
 4   Underlying_last       float64
 5   Strike                float64
 6   Ttl                   int64  
 7   Volatility            float64
 8   Volatility_GJR_GARCH  float64
 9   R                     float64
 10  Underlying_last-4     float64
 11  Strike-4              float64
 12  Ttl-4                 float64
 13  Volatility-4          float64
 14  R-4                   float64
 15  Underlying_last-3     float64
 16  Strike-3              float64
 17  Ttl-3                 float64
 18  Volatility-3          float64
 19  R-3                   float64
 20  Underlying_last-2     float64
 21  Strike-2              float64
 22  Ttl-2                 float64
 23  Volati

In [24]:
# Run this if one month is very bad

if False:
    df_test_whole = pd.read_csv("../data/Predictions/2021_predictions_09-24_17-53_LSTM.csv")
    df_test_whole.loc[(df_test_whole.loc[:, "Quote_date"] >= "2021-11-01 00:00:00") & (df_test_whole.loc[:, "Quote_date"] < "2021-12-01 00:00:00"), "Prediction"] = predictions

    from datetime import datetime
    time = datetime.now()
    time = time.strftime("%m-%d_%H-%M")

    filename = f"../data/Predictions/{last_year}_predictions_{time}_LSTM.csv"
    filepath = Path(filename)
    filepath.parent.mkdir(parents=True, exist_ok = True)
    df_test_whole.to_csv(filename)

    df_test_whole.info()
    print(df_test_whole.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845482 entries, 0 to 1845481
Data columns (total 38 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Unnamed: 0.1          int64  
 1   Unnamed: 0            int64  
 2   Quote_date            object 
 3   Expire_date           object 
 4   Price                 float64
 5   Underlying_last       float64
 6   Strike                float64
 7   Ttl                   int64  
 8   Volatility            float64
 9   Volatility_GJR_GARCH  float64
 10  R                     float64
 11  Underlying_last-4     float64
 12  Strike-4              float64
 13  Ttl-4                 float64
 14  Volatility-4          float64
 15  R-4                   float64
 16  Underlying_last-3     float64
 17  Strike-3              float64
 18  Ttl-3                 float64
 19  Volatility-3          float64
 20  R-3                   float64
 21  Underlying_last-2     float64
 22  Strike-2              float64
 23  Ttl-2  