In [1]:
# !wget https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files/libta-lib0_0.4.0-oneiric1_amd64.deb -qO libta.deb
# !wget https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files/ta-lib0-dev_0.4.0-oneiric1_amd64.deb -qO ta.deb
# !dpkg -i libta.deb ta.deb
# !pip install ta-lib

Selecting previously unselected package libta-lib0.
(Reading database ... 123598 files and directories currently installed.)
Preparing to unpack libta.deb ...
Unpacking libta-lib0 (0.4.0-oneiric1) ...
Selecting previously unselected package ta-lib0-dev.
Preparing to unpack ta.deb ...
Unpacking ta-lib0-dev (0.4.0-oneiric1) ...
Setting up libta-lib0 (0.4.0-oneiric1) ...
Setting up ta-lib0-dev (0.4.0-oneiric1) ...
Processing triggers for man-db (2.10.2-1) ...
Processing triggers for libc-bin (2.35-0ubuntu3.4) ...
/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a s

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertModel, BertTokenizer, BertConfig
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import talib as ta
from scipy.stats import norm

In [50]:
def calculate_rsi(series, period=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

class TechnicalIndicators:
    def __init__(self, data):
        self.data = data

    def add_momentum_indicators(self):
        self.data['RSI'] = ta.RSI(self.data['Close'], timeperiod=14)
        self.data['MACD'], self.data['MACD_signal'], self.data['MACD_hist'] = ta.MACD(self.data['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
        self.data['Stoch_k'], self.data['Stoch_d'] = ta.STOCH(self.data['High'], self.data['Low'], self.data['Close'],
                                                              fastk_period=14, slowk_period=3, slowd_period=3)

    def add_volume_indicators(self):
        self.data['OBV'] = ta.OBV(self.data['Close'], self.data['Volume'])

    def add_volatility_indicators(self):
        self.data['Upper_BB'], self.data['Middle_BB'], self.data['Lower_BB'] = ta.BBANDS(self.data['Close'], timeperiod=20)
        self.data['ATR_1'] = ta.ATR(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=1)

    def add_trend_indicators(self):
        self.data['ADX'] = ta.ADX(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['+DI'] = ta.PLUS_DI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)
        self.data['-DI'] = ta.MINUS_DI(self.data['High'], self.data['Low'], self.data['Close'], timeperiod=14)

    def add_other_indicators(self):
        self.data['DLR'] = np.log(self.data['Close'] / self.data['Close'].shift(1))
        self.data['VWAP'] = (self.data['Volume'] * (self.data['High'] + self.data['Low']) / 2).cumsum() / self.data['Volume'].cumsum()

    def add_all_indicators(self):
        self.add_momentum_indicators()
        self.add_volume_indicators()
        self.add_volatility_indicators()
        self.add_trend_indicators()
        self.add_other_indicators()
        return self.data

data = pd.read_csv('/content/xnas-itch-20230703.tbbo.csv')

data['price'] = data['price'] / 1e9
data['bid_px_00'] = data['bid_px_00'] / 1e9
data['ask_px_00'] = data['ask_px_00'] / 1e9

data['Close'] = data['price']
data['Volume'] = data['size']
data['High'] = data[['bid_px_00', 'ask_px_00']].max(axis=1)
data['Low'] = data[['bid_px_00', 'ask_px_00']].min(axis=1)
data['Open'] = data['Close'].shift(1).fillna(data['Close'])

data['ts_event'] = pd.to_datetime(data['ts_event'])
data['hour'] = data['ts_event'].dt.hour
data['day_of_week'] = data['ts_event'].dt.dayofweek
data['month'] = data['ts_event'].dt.month

ti = TechnicalIndicators(data)
df_with_indicators = ti.add_all_indicators()

df_with_indicators['RSI'] = calculate_rsi(df_with_indicators['Close'], period=14)

df_with_indicators['signal'] = 1  # Default to Hold

df_with_indicators.loc[df_with_indicators['RSI'] < 30, 'signal'] = 2  # Buy
df_with_indicators.loc[df_with_indicators['RSI'] > 70, 'signal'] = 0  # Sell

market_features_df = df_with_indicators.dropna()

feature_columns = [
    'Close',        # Closing price of the asset
    'Volume',       # Trading volume
    'High',         # High price within the period
    'Low',          # Low price within the period
    'RSI',          # Relative Strength Index
    'MACD',         # Moving Average Convergence Divergence
    'MACD_hist',    # MACD Histogram
    'OBV',          # On-Balance Volume
    'Upper_BB',     # Upper Bollinger Band
    'Lower_BB',     # Lower Bollinger Band
    'ATR_1',        # Average True Range
    'ADX',          # Average Directional Index
    'DLR',          # Discrete Log Return
    'VWAP',         # Volume Weighted Average Price
    'hour',         # Hour of the day
    'day_of_week',  # Day of the week
    'month'         # Month of the year
]
features = market_features_df[feature_columns].values

scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

y = market_features_df['signal'].values

X_train, X_test, y_train, y_test = train_test_split(normalized_features, y, test_size=0.2, random_state=42)


In [15]:
market_features_df.head()

Unnamed: 0,ts_recv,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,...,Upper_BB,Middle_BB,Lower_BB,ATR_1,ADX,+DI,-DI,DLR,VWAP,signal
33,1688371212400094716,2023-07-03 08:00:12.399929624,1,2,32,T,B,0,194.05,112,...,194.058166,194.01,193.961834,0.05,97.468037,8.22949,0.300857,0.0,194.018217,0
34,1688371212400103305,2023-07-03 08:00:12.399937688,1,2,32,T,B,0,194.05,56,...,194.061497,194.014,193.966503,0.05,97.145048,7.681639,0.280828,0.0,194.018423,0
35,1688371214386057385,2023-07-03 08:00:14.385893078,1,2,32,T,N,0,194.05,50,...,194.065621,194.017,193.968379,0.3,97.257397,30.435801,0.196362,0.0,194.021894,0
36,1688371214386063777,2023-07-03 08:00:14.385899379,1,2,32,T,N,0,194.05,50,...,194.06899,194.02,193.97101,0.3,97.361721,22.989295,0.14832,0.0,194.025188,0
37,1688371215804852019,2023-07-03 08:00:15.804687301,1,2,32,T,B,0,194.21,10,...,194.125889,194.0305,193.935111,0.21,97.458593,19.409454,0.125224,0.000824,194.025596,0


In [37]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [38]:
class TradeSignalBERT(nn.Module):
    def __init__(self, input_size):
        super(TradeSignalBERT, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.embedding_layer = nn.Linear(input_size, self.bert.config.hidden_size)

        self.classifier = nn.Linear(self.bert.config.hidden_size, 3)  # 3 classes: Buy, Hold, Sell

    def forward(self, inputs):
        inputs_embeds = self.embedding_layer(inputs).unsqueeze(1)

        attention_mask = torch.ones(inputs_embeds.size()[:2]).to(inputs.device)

        outputs = self.bert(inputs_embeds=inputs_embeds, attention_mask=attention_mask)

        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_output)

# Training

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TradeSignalBERT(input_size=X_train_tensor.shape[1]).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}], Step [{step + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")


Epoch [1/3], Step [1/1349], Loss: 1.1136
Epoch [1/3], Step [2/1349], Loss: 1.1242
Epoch [1/3], Step [3/1349], Loss: 1.0646
Epoch [1/3], Step [4/1349], Loss: 1.0499
Epoch [1/3], Step [5/1349], Loss: 1.0466
Epoch [1/3], Step [6/1349], Loss: 1.0445
Epoch [1/3], Step [7/1349], Loss: 1.0830
Epoch [1/3], Step [8/1349], Loss: 0.9905
Epoch [1/3], Step [9/1349], Loss: 0.9589
Epoch [1/3], Step [10/1349], Loss: 0.9856
Epoch [1/3], Step [11/1349], Loss: 0.9929
Epoch [1/3], Step [12/1349], Loss: 1.0384
Epoch [1/3], Step [13/1349], Loss: 0.9490
Epoch [1/3], Step [14/1349], Loss: 0.9340
Epoch [1/3], Step [15/1349], Loss: 0.8670
Epoch [1/3], Step [16/1349], Loss: 0.9906
Epoch [1/3], Step [17/1349], Loss: 0.9678
Epoch [1/3], Step [18/1349], Loss: 0.9172
Epoch [1/3], Step [19/1349], Loss: 0.9936
Epoch [1/3], Step [20/1349], Loss: 0.9129
Epoch [1/3], Step [21/1349], Loss: 0.8550
Epoch [1/3], Step [22/1349], Loss: 0.7828
Epoch [1/3], Step [23/1349], Loss: 0.8008
Epoch [1/3], Step [24/1349], Loss: 0.7363
E

# Evaluation on test


In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.to(device)
    y_test_tensor = y_test_tensor.to(device)
    outputs = model(X_test_tensor)
    _, predicted_labels = torch.max(outputs, 1)

# Move predictions and labels to CPU for sklearn metrics
predicted_labels = predicted_labels.cpu().numpy()
y_test = y_test_tensor.cpu().numpy()

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted_labels)
precision = precision_score(y_test, predicted_labels, average='weighted')
recall = recall_score(y_test, predicted_labels, average='weighted')
f1 = f1_score(y_test, predicted_labels, average='weighted')
classification_rep = classification_report(y_test, predicted_labels, target_names=['SELL', 'HOLD', 'BUY'])

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Model Precision: {precision:.2f}")
print(f"Model Recall: {recall:.2f}")
print(f"Model F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_rep)


Model Accuracy: 0.99
Model Precision: 0.99
Model Recall: 0.99
Model F1 Score: 0.99

Classification Report:
               precision    recall  f1-score   support

        SELL       1.00      0.99      0.99      3093
        HOLD       0.99      0.99      0.99      4216
         BUY       0.99      1.00      0.99      3479

    accuracy                           0.99     10788
   macro avg       0.99      0.99      0.99     10788
weighted avg       0.99      0.99      0.99     10788



# Generate predictions for X_test


In [20]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    normalized_features, y, np.arange(len(y)), test_size=0.2, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predictions = torch.max(outputs, 1)

action_mapping = {0: 'SELL', 1: 'HOLD', 2: 'BUY'}
transformer_actions = [action_mapping[pred.item()] for pred in predictions]

timestamp = pd.to_datetime(data.loc[test_indices, 'ts_event'])
symbol = data.loc[test_indices, 'symbol']

transformer_results_df = pd.DataFrame({
    'timestamp': timestamp,
    'symbol': symbol,
    'predicted_action': transformer_actions,
})

transformer_results_df.head()


Unnamed: 0,timestamp,symbol,predicted_action
6976,2023-07-03 13:33:00.369389502,AAPL,HOLD
44896,2023-07-03 15:45:02.864795506,AAPL,BUY
44691,2023-07-03 15:43:09.480320052,AAPL,HOLD
7789,2023-07-03 13:34:10.406059043,AAPL,HOLD
13297,2023-07-03 13:46:18.374664839,AAPL,SELL
