In [2]:
import pandas as pd
import numpy as np
import tqdm
from scipy import sparse
import torch
from torch import nn
from collections import Counter
from typing import *
import time
import logging
import altair as alt
import re
import numpy.linalg as la
import json
import Word2Vec
import Models
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from torch import nn
from sklearn.metrics import f1_score
import numpy.random
# Lift Altair's default row limit so the full price series can be charted.
# `disable_max_rows` is a method: merely referencing it (as before) is a no-op,
# it has to be called to take effect.
alt.data_transformers.disable_max_rows()
# Seed torch and numpy so training runs are repeatable.
torch.manual_seed(0)
np.random.seed(0)

# 1. Data Preprocessing

In [7]:
### 1.1 Load the Data and Merge with Text
def tokenize(text):
    """Normalise raw text into a single space-separated string of lemmas.

    Steps: strip the punctuation characters ``. , : ! ?``, turn hyphens into
    spaces, split on single spaces, keep only purely alphabetic pieces
    (lower-cased), run the result through ``nlp`` and join each token's
    ``lemma_``.

    NOTE(review): ``nlp`` is neither imported nor defined in the visible
    notebook cells; it is presumably a spaCy pipeline that must exist at
    module level before this function is called -- confirm.

    Args:
        text: Raw input string.

    Returns:
        The lemmatised words joined by single spaces.
    """
    # Raw-string character class: matches exactly the same characters as the
    # former "\.|,|:|!|\?" alternation, without the invalid-escape warning that
    # a non-raw literal triggers on current Python versions.
    cleaned = re.sub(r"[.,:!?]", "", text).replace("-", " ")
    words = [word.lower() for word in cleaned.split(" ") if word.isalpha()]
    return " ".join(tok.lemma_ for tok in nlp(" ".join(words)))

def load_data(token: str, freq: int):
    """Load text and price CSVs for one token and build train/val frames.

    Reads ``Data/{token}Text.csv`` and ``Data/{token}{freq}h.csv`` (both
    tab-separated, indexed by ``TimeStamp``), attaches to every text row the
    first price row at or after its timestamp, then aggregates per
    ``PriceTime``: mean ``Return`` / ``ClosePrice`` and the texts joined by a
    single space. ``Return`` is shifted by -1 so each row carries the return
    of the *following* row (the last row therefore has NaN).

    Args:
        token: Token symbol used in the file names (e.g. "ETH").
        freq: Bar length in hours used in the price file name.

    Returns:
        ``{"train": DataFrame, "val": DataFrame}`` covering 2021-01-01 to
        2021-12-31 and 2022-01-01 to 2022-04-01. Both frames are independent
        copies, so callers can add columns without touching a shared parent
        frame or triggering SettingWithCopyWarning.
    """
    df_text = pd.read_csv("Data/{}Text.csv".format(token), sep="\t", index_col="TimeStamp")
    df_token = pd.read_csv("Data/{}{}h.csv".format(token, freq), sep="\t", index_col="TimeStamp")
    df_text.index = pd.to_datetime(df_text.index)
    df_token.index = pd.to_datetime(df_token.index)

    # merge_asof raises unless both keys are sorted; a stable sort keeps the
    # original order of rows that share a timestamp (and so the text order).
    df_text = df_text.sort_index(kind="stable")
    df_token = df_token.sort_index(kind="stable")
    df_data = pd.merge_asof(df_text, df_token, left_index=True, right_index=True, direction="forward")
    df_data["text"] = df_data["text"].astype(str)

    # Group once and reuse the grouper for both aggregations.
    grouped = df_data.groupby("PriceTime")
    df = grouped[["Return", "ClosePrice"]].mean()
    df["Return"] = df["Return"].shift(-1)
    df["Text"] = grouped["text"].agg(" ".join)
    df.index = pd.DatetimeIndex(df.index)

    return {
        "train": df.loc["20210101":"20211231"].copy(),
        "val": df.loc["20220101":"20220401"].copy(),
    }

In [8]:
%%time
# Nested lookup: data[token][freq] -> {"train": DataFrame, "val": DataFrame}.
# Here `token` is the coin symbol; `tokens` below is the flat list of words.
data = {}
for token in ("ETH", "AAVE"):
    per_freq = {}
    for freq in (4, 1):
        per_freq[freq] = load_data(token, freq)
    data[token] = per_freq

# Word2Vec corpus: every word of the 4h training text of both coins.
train_texts = data["ETH"][4]["train"].Text.tolist() + data["AAVE"][4]["train"].Text.tolist()
tokens = " ".join(train_texts).split(" ")

CPU times: user 2.32 s, sys: 366 ms, total: 2.69 s
Wall time: 3.14 s


# 2. Word2Vec Training

In [9]:
%%time
# 2.1 Build the Word2Vec corpus from the flat word list `tokens`.
# Timing: roughly 5 min to load the corpus and 30 min to generate the training data.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so on the
# recorded run `corpus` was not fully built.
# NOTE(review): the meaning of the second argument (5) is defined in the local
# Word2Vec module and is not visible here -- confirm before changing it.
corpus = Word2Vec.Corpus()
corpus.load_data(tokens, 5) 
corpus.generate_training_data()

  0%|          | 0/8107896 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Create a Word2Vec model sized to the corpus vocabulary and train it.
# The second constructor argument (50) is presumably the embedding size -- the
# same value is used when the model is re-created for loading; confirm in the
# Word2Vec module.
model = Word2Vec.Word2Vec(len(corpus.word_to_index), 50)
evaluator = Word2Vec.Evaluator(model, corpus)
evaluator.train(lr=1e-2)

In [None]:
# Persist the vocabulary (word -> index) as JSON and the trained model next to it,
# so later sessions can skip corpus building and training.
with open("WordIndex.json", "w") as index_file:
    json.dump(corpus.word_to_index, index_file)
model.save("Word2Vec")

In [4]:
%%time
# Restore the saved vocabulary and model weights, then wrap both in the
# lookup helper used by `embed`.
with open("WordIndex.json", "r") as index_file:
    word_to_index = json.load(index_file)
word2vec = Word2Vec.Word2Vec(len(word_to_index), 50)
saved_state = torch.load("Word2Vec")
word2vec.load_state_dict(saved_state)
w2v = Word2Vec.W2V(word2vec, word_to_index)

def embed(text):
    """Turn a space-separated text into a tensor of word vectors.

    Each piece of ``text.split(" ")`` is looked up with the module-level
    ``w2v.get_vec``; the vectors are stacked with numpy in input order and
    converted to a torch tensor.
    """
    vectors = []
    for word in text.split(" "):
        vectors.append(w2v.get_vec(word))
    stacked = np.array(vectors)
    return torch.from_numpy(stacked)

# Embed the text of every (token, freq, split) frame.
# This is slow: the recorded run took about 8.5 minutes of wall time.
for token in ["ETH", "AAVE"]:
    for freq in [1, 4]:
        for data_type in ["train", "val"]:
            df = data[token][freq][data_type]
            # The stored frames are date slices of a larger frame. Assigning a
            # column on such a slice triggers SettingWithCopyWarning, so build
            # a new frame and store it back instead. Re-running the cell simply
            # recomputes the column.
            data[token][freq][data_type] = df.assign(Embedding=df.Text.apply(embed))

CPU times: user 8min 28s, sys: 2.91 s, total: 8min 30s
Wall time: 8min 32s


# 3. Baseline

### 3.1 Define the pipeline

In [10]:
def plot(rev_df):
    """Line chart of close price versus revenue price over time.

    The frame is reshaped to long form (one row per PriceTime and PriceType),
    sorted by time and series, and drawn as one coloured line per series with
    small red point markers. The chart is interactive with the y axis fixed.

    Args:
        rev_df: Frame with ``PriceTime``, ``ClosePrice`` and ``RevenuePrice``
            columns.

    Returns:
        The Altair chart object.
    """
    long_df = rev_df.melt(
        id_vars=["PriceTime"],
        value_vars=["ClosePrice", "RevenuePrice"],
        var_name="PriceType",
        value_name="Price",
    )
    long_df = long_df.sort_values(["PriceTime", "PriceType"], ignore_index=True)

    base = alt.Chart(long_df, width=750, height=300)
    lines = base.mark_line(point=alt.OverlayMarkDef(color="red", size=0.1))
    encoded = lines.encode(
        x=alt.X("PriceTime:T"),
        y=alt.Y("Price:Q"),
        color=alt.Color("PriceType:N"),
        tooltip=["Price:Q", "PriceType:N", "PriceTime:T"],
    )
    return encoded.interactive(bind_y=False)

def analyze(rev_df: pd.DataFrame):
    """Summarise a validation run.

    Args:
        rev_df: Frame with ``RetPred``, ``Return``, ``RevenuePrice`` and
            ``ClosePrice`` columns (plus whatever ``plot`` needs).

    Returns:
        Dict with
        ``plot``   - chart from ``plot(rev_df)``,
        ``mse``    - mean squared error between predicted and actual return,
        ``rev_df`` - the input frame, unchanged,
        ``f1``     - F1 score of the direction call (return > 0),
        ``ret``    - last ``RevenuePrice`` divided by last ``ClosePrice``, minus 1.
    """
    squared_error = (rev_df["RetPred"] - rev_df["Return"]) ** 2
    mse = np.sum(squared_error / len(rev_df))

    actual_up = rev_df["Return"] > 0
    predicted_up = rev_df["RetPred"] > 0
    f1 = f1_score(actual_up, predicted_up)

    final_revenue = rev_df["RevenuePrice"].iloc[-1]
    final_close = rev_df["ClosePrice"].iloc[-1]
    ret = final_revenue / final_close - 1

    summary = {"plot": plot(rev_df)}
    summary["mse"] = mse
    summary["rev_df"] = rev_df
    summary["f1"] = f1
    summary["ret"] = ret
    return summary

def pipeline_ml(token: str, freq: int, model: nn.Module, epochs: int=3, lr: float=5e-5, buy_ret: float=0, sell_ret: float=0):
    """Train ``model`` on one token/frequency and evaluate it on the validation split.

    Args:
        token: Key into the module-level ``data`` dict (e.g. "ETH").
        freq: Bar length in hours, second key into ``data``.
        model: Network handed to ``Models.Predictor``.
        epochs: Number of training epochs passed to ``predictor.train``.
        lr: Learning rate passed to ``predictor.train``.
        buy_ret: Passed through to ``predictor.validate``.
        sell_ret: Passed through to ``predictor.validate``.

    Returns:
        The dict produced by ``analyze`` with an extra ``"model"`` entry
        holding the model that was passed in.
    """
    # Prefer the first CUDA device when one is available.
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_gpu else "cpu")

    splits = data[token][freq]
    train_df, val_df = splits["train"], splits["val"]

    # Fit: returns are the targets, per-row embedding tensors the inputs.
    predictor = Models.Predictor(model, device)
    targets = torch.Tensor(train_df.Return)
    inputs = train_df.Embedding.tolist()
    predictor.train(inputs, targets, epochs=epochs, lr=lr)

    # Evaluate on the held-out period.
    predicted = predictor.predict(val_df.Embedding)
    rev_df = predictor.validate(val_df, predicted, buy_ret=buy_ret, sell_ret=sell_ret)
    outcome = analyze(rev_df)
    outcome["model"] = model
    return outcome

### 3.2 Simple NN with two linear layers

In [6]:
# Baseline: linear network on ETH 4h bars, trained for 2 epochs.
result_nn = pipeline_ml("ETH", 4, Models.LinearNN(), epochs=2)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2190 [00:00<?, ?it/s]

  0%|          | 0/2190 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]

### 3.3 RNN

In [11]:
# Baseline: RNN on ETH 4h bars, trained for 2 epochs.
result_rnn = pipeline_ml("ETH", 4, Models.RNNModel(), epochs=2)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2190 [00:00<?, ?it/s]

  0%|          | 0/2190 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]

### 3.4 ARIMA Model

In [11]:
def pipeline_arima(token: str, freq: int, order=(2, 0, 2), buy_ret: float=0, sell_ret: float=0):
    """Fit an ARIMA model on training returns and evaluate it on the validation split.

    Args:
        token: Key into the module-level ``data`` dict (e.g. "ETH").
        freq: Bar length in hours, second key into ``data``.
        order: ARIMA ``(p, d, q)`` order.
        buy_ret: Passed through to ``Models.Predictor.validate``.
        sell_ret: Passed through to ``Models.Predictor.validate``.

    Returns:
        The dict produced by ``analyze`` with an extra ``"model"`` entry
        holding the fitted ARIMA results object.
    """
    train_df = data[token][freq]["train"]
    val_df = data[token][freq]["val"]
    arima = ARIMA(train_df.Return, order=order)
    model = arima.fit()
    # One forecast per validation row, made from the end of the training series.
    # NOTE(review): only the point forecasts (`.values`) are used, so `alpha`
    # presumably has no effect on the result -- confirm against statsmodels.
    pred_ret = model.forecast(len(val_df), alpha=0.05).values
    # Forward the caller's buy threshold; it was previously hard-coded to 0,
    # which silently ignored the `buy_ret` argument.
    rev_df = Models.Predictor.validate(val_df, pred_ret, buy_ret=buy_ret, sell_ret=sell_ret)
    result = analyze(rev_df)
    result["model"] = model
    return result

In [13]:
# Fit the ARIMA(2, 0, 2) baseline on ETH 4h returns and show the
# close-price vs revenue-price chart for the validation period.
result_arima = pipeline_arima("ETH", 4, order=(2, 0, 2))
result_arima["plot"]

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


# 4. LSTM Pipeline

In [25]:
# Validation MSE of the ETH 4h LSTM run.
# NOTE(review): `eth_4h` is assigned by a cell that appears further down in this
# section (execution count In [23]); on Restart & Run All this cell raises
# NameError unless that training cell is moved above it.
eth_4h["mse"]

0.001963672948583579

In [26]:
# "ret" metric of the ETH 4h LSTM run (final RevenuePrice / final ClosePrice - 1).
# Requires `eth_4h`, which is assigned by a cell placed later in this section.
eth_4h["ret"]

0.07714938942291405

In [24]:
# Close-price vs revenue-price chart of the ETH 4h LSTM run.
# Requires `eth_4h`, which is assigned by a cell placed later in this section.
eth_4h["plot"]

In [23]:
%%time
# LSTM on ETH 4h bars, trained for 2 epochs.
eth_4h = pipeline_ml("ETH", 4, Models.LSTM(), epochs=2)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2190 [00:00<?, ?it/s]

  0%|          | 0/2190 [00:00<?, ?it/s]

  0%|          | 0/546 [00:00<?, ?it/s]

CPU times: user 4min 52s, sys: 1.34 s, total: 4min 54s
Wall time: 4min 59s


In [None]:
%%time
# LSTM on ETH 1h bars, trained for 3 epochs.
eth_1h = pipeline_ml("ETH", 1, Models.LSTM(), epochs=3)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8747 [00:00<?, ?it/s]

  0%|          | 0/8747 [00:00<?, ?it/s]

In [None]:
# Close-price vs revenue-price chart of the ETH 1h LSTM run.
eth_1h["plot"]

In [None]:
%%time
# LSTM on AAVE 4h bars, trained for 3 epochs.
aave_4h = pipeline_ml("AAVE", 4, Models.LSTM(), epochs=3)

In [None]:
%%time
# LSTM on AAVE 1h bars, trained for 3 epochs.
aave_1h = pipeline_ml("AAVE", 1, Models.LSTM(), epochs=3)

In [26]:
# ARIMA baseline metrics as a tuple: (ret, mse, f1).
# Requires `result_arima` from the ARIMA cell in section 3.
result_arima["ret"], result_arima["mse"], result_arima["f1"]

(0.0, 0.0003166121033062214, 0.6937799043062202)

# 5. Summary

| Model        | Token | Freq (H) | MSE    | Return (%) |
| ------------ | ----- | -------- | ------ | ---------- |
| ARIMA(2,0,2) | ETH   | 4        | 3E-4   | 0          |
| Linear NN    | ETH   | 4        | 1E-3   | -9         |
| RNN          | ETH   | 4        | 3E-3   | 12         |
| LSTM         | ETH   | 4        | 6.7E-4 | 16         |
| LSTM         | ETH   | 1        | 2.9E-4 | 19         |
| LSTM         | AAVE  | 4        | 1.4E-3 | 23         |
| LSTM         | AAVE  | 1        | 1.2E-3 | 25         |

In [3]:
# Bar charts of the ETH 4h results per model, stacked vertically (return on top).
# The numbers are typed in by hand, not read from the result dicts.
# NOTE(review): the RNN MSE here is 6e-3 while the summary table lists 3E-3 --
# confirm which value is correct.
width, height = 350, 150
model_names = ["ARIMA", "Linear", "RNN", "LSTM"]


def _summary_bar(metric, values):
    """Bar chart with one coloured bar per model for the given metric column."""
    frame = pd.DataFrame({"Model": model_names, metric: values})
    bars = alt.Chart(frame, width=width, height=height).mark_bar()
    return bars.encode(x="Model", y=metric, color="Model")


mse = _summary_bar("MSE", [3e-4, 1e-3, 6e-3, 6.7e-4])
ret = _summary_bar("Return", [0, -9, 12, 16])
ret & mse