In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import datetime
import os
import matplotlib.pyplot as plt
import math

In [2]:
# Raw 2015-16 option quotes; this frame is the training input for the cells below.
options = pd.read_csv("../csv/options/raw/2015-16options.csv")

In [3]:
# Trends table: one row per timestamp in a "date" column plus numbered feature columns.
trends = pd.read_csv("../csv/trends/input.csv")

In [4]:
# Build a new frame instead of aliasing `trends`: a plain `pruned_trends = trends`
# makes both names point at the same object, so writing the parsed "date" column
# would silently modify the raw frame as well.
pruned_trends = trends.assign(date=pd.to_datetime(trends["date"]))
pruned_trends.head()

Unnamed: 0,date,0,1,2,3,4,5,6,7,8,...,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974
0,2015-01-02 09:00:00,37.0,30.0,55.0,49.0,18.0,47.0,22.0,30.0,12.0,...,47.0,35.0,60.0,57.0,29.0,40.0,29.0,0.0,34.0,60.0
1,2015-01-03 09:00:00,47.0,53.0,95.0,39.0,29.0,40.0,23.0,42.0,12.0,...,43.0,32.0,56.0,42.0,29.0,39.0,15.0,0.0,39.0,57.0
2,2015-01-04 09:00:00,41.0,33.0,65.0,41.0,28.0,49.0,38.0,39.0,11.0,...,38.0,25.0,51.0,29.0,31.0,38.0,13.0,0.0,39.0,60.0
3,2015-01-05 09:00:00,38.0,49.0,43.0,41.0,19.0,71.0,30.0,36.0,5.0,...,42.0,38.0,93.0,70.0,34.0,51.0,22.0,0.0,30.0,51.0
4,2015-01-06 09:00:00,59.0,38.0,100.0,45.0,21.0,53.0,23.0,57.0,6.0,...,47.0,34.0,86.0,77.0,34.0,56.0,25.0,0.0,30.0,53.0


In [5]:
# Truncate every timestamp to midnight so that trends rows can be looked up by
# the (time-less) option quote dates, then index the frame by that date.
# `.dt.normalize()` is the vectorised equivalent of rebuilding a Timestamp from
# each value's `.date()`, and `assign` avoids writing into the existing frame.
pruned_trends = pruned_trends.assign(
    date=pruned_trends["date"].dt.normalize()
).set_index("date")

In [6]:
# Sanity check: the frame should now be indexed by midnight-normalised dates.
pruned_trends.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,37.0,30.0,55.0,49.0,18.0,47.0,22.0,30.0,12.0,4.0,...,47.0,35.0,60.0,57.0,29.0,40.0,29.0,0.0,34.0,60.0
2015-01-03,47.0,53.0,95.0,39.0,29.0,40.0,23.0,42.0,12.0,18.0,...,43.0,32.0,56.0,42.0,29.0,39.0,15.0,0.0,39.0,57.0
2015-01-04,41.0,33.0,65.0,41.0,28.0,49.0,38.0,39.0,11.0,21.0,...,38.0,25.0,51.0,29.0,31.0,38.0,13.0,0.0,39.0,60.0
2015-01-05,38.0,49.0,43.0,41.0,19.0,71.0,30.0,36.0,5.0,8.0,...,42.0,38.0,93.0,70.0,34.0,51.0,22.0,0.0,30.0,51.0
2015-01-06,59.0,38.0,100.0,45.0,21.0,53.0,23.0,57.0,6.0,10.0,...,47.0,34.0,86.0,77.0,34.0,56.0,25.0,0.0,30.0,53.0


In [7]:
# Preview the raw option columns before feature selection.
options.head()

Unnamed: 0,secid,date,exdate,cp_flag,strike_price,volume,open_interest,impl_volatility,opprc,moneyness,tte,close,spread,noi
0,5284,01/02/2015,03/20/2015,C,2500,0,5,1.21251,0.1,1.677852,77,1.49,0.1,5
1,5284,01/02/2015,03/20/2015,P,2500,0,10,1.389238,1.15,1.677852,77,1.49,0.5,10
2,5284,01/05/2015,03/20/2015,C,2500,0,5,1.322124,0.1,1.760563,74,1.42,0.1,0
3,5284,01/05/2015,03/20/2015,P,2500,0,10,1.533191,1.225,1.760563,74,1.42,0.45,0
4,5284,01/06/2015,03/20/2015,C,2500,0,5,1.369239,0.1,1.798561,73,1.39,0.1,0


In [8]:
def setup_options_input(options_df, numrows, trends_df=None, random_state=None):
    """Sample traded option rows and join each with the trends row for its date.

    Args:
        options_df: Raw options frame. Must contain the columns dropped below
            plus "date", "cp_flag", "volume" and "impl_volatility".
        numrows: Number of rows to sample (without replacement) from the rows
            whose volume is positive. Raises ValueError (from pandas) if fewer
            rows are available.
        trends_df: Trends frame indexed by normalised date. Defaults to the
            notebook-level ``pruned_trends`` so existing calls keep working.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the draw reproducible. ``None`` keeps the previous random behaviour.

    Returns:
        A ``(features, targets)`` tuple of float64 NumPy arrays. ``features``
        holds the remaining option columns (with "cp_flag" one-hot encoded)
        followed by the trends columns; ``targets`` holds "impl_volatility".

    Raises:
        KeyError: If a sampled quote date has no row in the trends frame.
    """
    if trends_df is None:
        # Fall back to the notebook global, as the original implementation did.
        trends_df = pruned_trends

    unused_columns = ["secid", "opprc", "moneyness", "close", "spread", "noi", "open_interest", "exdate", "volume"]

    sampled = options_df[options_df["volume"] > 0].sample(numrows, random_state=random_state)
    sampled = sampled.drop(unused_columns, axis=1)
    sampled = pd.get_dummies(sampled, columns=["cp_flag"])
    sampled["date"] = pd.to_datetime(sampled["date"])
    sampled = sampled.set_index("date")

    # One label-based lookup for the whole sample instead of one pd.concat per
    # row. Dates repeat across options, so trends rows are repeated as needed;
    # a date missing from the trends index still raises KeyError.
    trend_rows = trends_df.loc[sampled.index]

    option_features = sampled.drop(columns=["impl_volatility"]).reset_index(drop=True)
    features = pd.concat([option_features, trend_rows.reset_index(drop=True)], axis=1)

    # Explicit float64: mixing one-hot columns (bool in recent pandas) with
    # numeric columns would otherwise produce an object-dtype array.
    return (
        features.to_numpy(dtype="float64"),
        sampled["impl_volatility"].to_numpy(dtype="float64"),
    )

In [9]:
class OptionsDataSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` serving batches of option rows joined with trends rows.

    Each batch is a ``(features, targets)`` pair of float64 NumPy arrays:
    ``features`` holds the retained option columns (with "cp_flag" one-hot
    encoded) followed by the trends columns for the row's quote date, and
    ``targets`` holds "impl_volatility".
    """

    # Raw option columns that are not fed to the model.
    _UNUSED_COLUMNS = ["secid", "opprc", "moneyness", "close", "spread", "noi", "open_interest", "exdate", "volume"]

    def __init__(self, options_data, trends_data, batch_size):
        """Prepare the option rows once so batches can be sliced cheaply.

        Args:
            options_data: Raw options frame; rows with zero volume are dropped.
                The caller's frame is not modified.
            trends_data: Trends frame indexed by normalised date.
            batch_size: Number of option rows per batch.
        """
        # The previous `self.options["past_vol"] = self.options.map()` line was
        # removed: `map()` was called without a function, so construction always
        # raised, and the extra column would have widened the feature matrix.
        traded = options_data[options_data["volume"] > 0]
        traded = traded.drop(self._UNUSED_COLUMNS, axis=1)
        traded = pd.get_dummies(traded, columns=["cp_flag"])
        traded["date"] = pd.to_datetime(traded["date"])
        self.options = traded.set_index("date")

        self.trends = trends_data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return math.ceil(len(self.options) / self.batch_size)

    def __getitem__(self, item):
        """Return batch number ``item`` as ``(features, targets)``.

        Raises:
            KeyError: If a quote date in the batch has no row in the trends frame.
        """
        start = item * self.batch_size
        batch = self.options.iloc[start:start + self.batch_size]

        # One label-based lookup per batch instead of one pd.concat per row.
        # Dates repeat across options, so trends rows are repeated as needed.
        trend_rows = self.trends.loc[batch.index]

        option_features = batch.drop(columns=["impl_volatility"]).reset_index(drop=True)
        features = pd.concat([option_features, trend_rows.reset_index(drop=True)], axis=1)

        # Explicit float64 so one-hot columns (bool in recent pandas) do not
        # turn the batch into an object-dtype array.
        return (
            features.to_numpy(dtype="float64"),
            batch["impl_volatility"].to_numpy(dtype="float64"),
        )

    def getitem(self, item):
        """Public alias for ``__getitem__``."""
        return self.__getitem__(item)



In [16]:
BATCH_SIZE = 512

# Build the sequence once: its constructor filters and one-hot encodes the
# whole options frame, which should not be repeated on every pass over the data.
options_generator = OptionsDataSequence(options, pruned_trends, BATCH_SIZE)
print(len(options_generator))


def data_generator():
    """Yield every ``(features, targets)`` batch of ``options_generator`` in order."""
    for i in range(len(options_generator)):
        yield options_generator[i]

44826


In [17]:
# Each generated element is one batch: a (rows, 1979) feature matrix and a
# (rows,) target vector, both float64. The row count is left unspecified.
feature_spec = tf.TensorSpec(shape=(None, 1979), dtype=tf.float64)
target_spec = tf.TensorSpec(shape=(None, ), dtype=tf.float64)

dataset = tf.data.Dataset.from_generator(
    data_generator,
    output_signature=(feature_spec, target_spec),
)

In [18]:
# Fully connected regressor: four ReLU hidden layers (1979, 3958, 3958, 1979
# units) followed by a single linear output unit.
hidden_widths = (1979, 1979*2, 1979*2, 1979)

model = tf.keras.Sequential(
    [tf.keras.layers.Dense(width, activation='relu') for width in hidden_widths]
    + [tf.keras.layers.Dense(1, activation='linear')]
)

In [19]:
# Train on mean absolute percentage error and also report MAPE, MAE and MSE.
reported_metrics = [
    keras.metrics.MeanAbsolutePercentageError(),
    keras.metrics.MeanAbsoluteError(),
    keras.metrics.MeanSquaredError(),
]

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    loss=keras.losses.MeanAbsolutePercentageError(),
    metrics=reported_metrics,
)

In [20]:
# `dataset` already yields complete batches, so `batch_size` must not be passed.
# `shuffle` is ignored for tf.data.Dataset input, and `workers` /
# `use_multiprocessing` only apply to generator or Sequence input, so those
# arguments were dropped rather than left in as misleading no-ops.
model.fit(
    x=dataset,
    epochs=1,  # seems like enough to reach the minimum loss
    verbose=1,
)

Epoch 1/3
Epoch 2/3

KeyboardInterrupt: 

In [31]:
# Persist the trained model to disk.
# NOTE(review): the recorded output of this cell reports assets written to
# ./mape_loss_2, and the later load cell reads ./mape_loss_2, not ./generator_1 —
# confirm which directory is intended.
model.save("./generator_1")

INFO:tensorflow:Assets written to: ./mape_loss_2/assets


In [32]:
# Load a saved model for evaluation.
# NOTE(review): this reads ./mape_loss_2, while the save cell above writes
# ./generator_1 — on a fresh run the model evaluated here is not the one just
# trained. Confirm this is intentional.
# modell = tf.keras.models.load_model("./initial_model_2")
modell = tf.keras.models.load_model("./mape_loss_2")

In [33]:
# Held-out option quotes from 2017-18, used only for evaluation.
opt_test = pd.read_csv("../csv/options/raw/2017-18options.csv")

In [34]:
# Sample 200000 traded rows from the test file and join them with `pruned_trends`.
# NOTE(review): the lookup raises KeyError for any quote date missing from
# `pruned_trends` — confirm the trends file covers the 2017-18 dates.
x_test, y_test = setup_options_input(opt_test, 200000)

In [22]:
# Returns the loss followed by the loaded model's compiled metrics
# (presumably MAPE, MAE, MSE as in the compile cell — verify for the loaded model).
modell.evaluate(x_test, y_test)



[39.219417572021484,
 39.219417572021484,
 0.26837822794914246,
 0.17069818079471588]

In [15]:
# Print layer shapes and parameter counts of the loaded model.
modell.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 1979)              3918420   
                                                                 
 dense_17 (Dense)            (None, 3958)              7836840   
                                                                 
 dense_18 (Dense)            (None, 3958)              15669722  
                                                                 
 dense_19 (Dense)            (None, 1979)              7834861   
                                                                 
 dense_20 (Dense)            (None, 1)                 1980      
                                                                 
Total params: 35,261,823
Trainable params: 35,261,823
Non-trainable params: 0
_________________________________________________________________


In [39]:
# The model's final layer has one unit, so predictions come back with shape (n, 1).
y_predict = modell.predict(x_test)

In [None]:
# `y_test` has shape (n,) while `y_predict` has shape (n, 1); subtracting them
# directly broadcasts to an (n, n) matrix. Flatten the predictions first so the
# result is one residual per test row.
residuals = y_test - y_predict.ravel()

fig, ax = plt.subplots()
ax.plot(residuals)
ax.set(
    title="Prediction residuals on the test sample",
    xlabel="test row",
    ylabel="actual - predicted implied volatility",
);