In [1]:
seed_value = 29
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import torch
import tensorflow as tf
tf.random.set_seed(seed_value)
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef, roc_auc_score
import numpy as np
np.random.seed(seed_value)
np.set_printoptions(precision=2)
import pandas as pd
import re
import matplotlib.pyplot as plt
import pickle

2024-07-04 11:16:09.334199: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-04 11:16:09.374475: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-04 11:16:09.374509: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-04 11:16:09.375626: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-04 11:16:09.382273: I tensorflow/core/platform/cpu_feature_guar

In [2]:
GPU_INDEX = 3
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[GPU_INDEX], 'GPU')
logical_devices = tf.config.list_logical_devices('GPU')

2024-07-04 11:16:13.194584: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22293 MB memory:  -> device: 3, name: NVIDIA A30, pci bus id: 0000:4a:00.0, compute capability: 8.0


## Load Financial Sentiment Analysis data samples

In [3]:
training_set = pd.read_csv("data/training_samples.csv") # 1000 items sample
test_set = pd.read_csv("data/test_samples.csv") # 100 items sample

sentences_train = training_set["text"].values
labels_train = training_set["label"].values
sentences_test = test_set["text"].values
labels_test = test_set["label"].values
labels_train_oh = tf.keras.utils.to_categorical(labels_train, num_classes = 3)
labels_test_oh = tf.keras.utils.to_categorical(labels_test, num_classes = 3)

## Compute Teacher logits and explanations

In [4]:
BERT = "nickmuchi/finbert-tone-finetuned-fintwitter-classification"
N_CLASSES = 3
SEQUENCE_LEN = 150
model = AutoModelForSequenceClassification.from_pretrained(BERT)
tokenizer = AutoTokenizer.from_pretrained(BERT)
explainer = SequenceClassificationExplainer(model, tokenizer)

# compute logits
def compute_logits(sentences, model, tokenizer):
    model.to("cuda")
    encoded_sentences = tokenizer(sentences.tolist(), truncation=True, padding='max_length', max_length=SEQUENCE_LEN, return_tensors='pt').to("cuda")
    with torch.no_grad():
        logits = model(**encoded_sentences)
    return logits

# compute IG-based explanations
def compute_explanations(sentences, explainer, tokenizer):
    model.to("cpu")
    expl = []
    MAX_LEN = 512
    for s in sentences:
        tok_s = tokenizer.tokenize(s, add_special_tokens=True)
        detok_s = tokenizer.convert_tokens_to_string(tok_s[:MAX_LEN])
        s = " ".join(detok_s.split(" ")[1:-1]).strip() # remove CLS and SEP tokens
        word_attributions = explainer(s)
        expl.append(word_attributions)
    model.to("cuda")
    return expl

teacher_logits_train = compute_logits(sentences_train, model, tokenizer).logits.cpu()
teacher_logits_test = compute_logits(sentences_test, model, tokenizer).logits.cpu()

explanation_train = compute_explanations(sentences_train, explainer, tokenizer)
explanation_test = compute_explanations(sentences_test, explainer, tokenizer)

## DiXtill - Data preparation

In [5]:
def merge_tokens(token_list):
    merged_list = []
    i = 0
    while i < len(token_list):
        current_token, current_score = token_list[i]

        if current_token.startswith('##'):
            (start_token, start_score) = merged_list.pop()
            merged_tokens = [start_token, current_token[2:]]
            list_score = [start_score, current_score]
            i += 1

            while i < len(token_list) and token_list[i][0].startswith('##'):
                merged_tokens.append(token_list[i][0][2:])
                list_score.append(token_list[i][1])
                i += 1

            merged_score = np.mean(list_score)

            merged_list.append((''.join(merged_tokens), merged_score))
        else:
            merged_list.append((current_token, current_score))
            i += 1

    return merged_list[1:len(merged_list)-1] # remove CLS and SEP tokens

explanation_train = [merge_tokens(tokens) for tokens in explanation_train]
explanation_test = [merge_tokens(tokens) for tokens in explanation_test]

In [6]:
def build_sentence(tokens):
    return ' '.join([token[0] for token in tokens])

lstm_input_train = [build_sentence(tokens) for tokens in explanation_train]
lstm_input_test = [build_sentence(tokens) for tokens in explanation_test]

In [7]:
def add_padding_explanation(explanation):
  padding_length = SEQUENCE_LEN - len(explanation)
  padding_tuples = [('<pad>', 0.0) for _ in range(padding_length)]
  return explanation + padding_tuples

explanation_train_padded = [add_padding_explanation(explanation) for explanation in explanation_train]
explanation_test_padded = [add_padding_explanation(explanation) for explanation in explanation_test]

In [8]:
MAX_NB_WORDS = 20000
t = Tokenizer(num_words=MAX_NB_WORDS, filters='', lower=True, oov_token = "<UNK>")
t.fit_on_texts(lstm_input_train)
vocab_size = len(t.word_index) + 1
# integer encode the documents
index_train_lstm = t.texts_to_sequences(lstm_input_train)
index_train_lstm = pad_sequences(index_train_lstm, maxlen=SEQUENCE_LEN, padding='post', truncating='post')
index_test_lstm = t.texts_to_sequences(lstm_input_test)
index_test_lstm = pad_sequences(index_test_lstm, maxlen=SEQUENCE_LEN, padding='post', truncating='post')

In [9]:
#attention mask
attention_mask_train = []
attention_mask_test = []

for tokens in index_train_lstm:
    att = []
    for token in tokens:
        att.append(1 if token != 0 else 0)
    attention_mask_train.append(att)

for tokens in index_test_lstm:
    att = []
    for token in tokens:
        att.append(1 if token != 0 else 0)
    attention_mask_test.append(att)

In [10]:
lstm_input_train = [np.array([np.array(m) for m in index_train_lstm]), np.array([np.array(pair)[:,1] for pair in explanation_train_padded], dtype=float), np.array([np.array(m) for m in teacher_logits_train]), np.array([np.array(m) for m in attention_mask_train])]
lstm_input_test = [np.array([np.array(m) for m in index_test_lstm]), np.array([np.array(pair)[:,1] for pair in explanation_test_padded], dtype=float), np.array([np.array(m) for m in teacher_logits_test]), np.array([np.array(m) for m in attention_mask_test])]

## DiXtill - Student model architecture

In [11]:
# Prepare Glove embedding matrix

EMBEDDING_DIM = 50
embeddings_index = dict()
f = open(f'data/glove.6B.{EMBEDDING_DIM}d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.array(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

c = 0
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in t.word_index.items():
  embedding_vector = embeddings_index.get(word, None)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
  else:
    c+=1
print(f'not found: {c} out of {len(t.word_index.items())}')

Loaded 400000 word vectors.
not found: 571 out of 4167


In [12]:
# Define Student model architecture

units=EMBEDDING_DIM
def get_student_model(use_attention = False, return_student_inputs = False, add_metrics = False, temperature = 5):
    student_input_ids = layers.Input(shape=(SEQUENCE_LEN,), dtype=tf.int32, name='input_ids')
    student_input_explanation = layers.Input(shape=(SEQUENCE_LEN,), dtype=tf.float32, name='explanation_weights')
    student_input_teacher_logits = layers.Input(shape=(N_CLASSES), dtype=tf.float32, name='teacher_logits')
    student_input_att_mask = layers.Input(shape=(SEQUENCE_LEN,), dtype=tf.int32, name='attention_mask')
    student_inputs = [student_input_ids, student_input_explanation, student_input_teacher_logits, student_input_att_mask]
    emb = layers.Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=SEQUENCE_LEN, trainable=True)(student_input_ids)
    states, forward_h, _, backward_h, _ = layers.Bidirectional(layers.LSTM(units, return_sequences=True, return_state=True))(emb)
    if not use_attention:
        ctx = layers.Concatenate()([forward_h, backward_h])
    else:
        hidden = layers.Dense(units, activation="tanh", use_bias=False)(states)
        out = layers.Dense(1, activation='linear', use_bias=False)(hidden)
        flat = layers.Flatten()(out)
        
        student_input_att_mask_cast = tf.cast(student_input_att_mask, "float32")
        masked_logits = layers.Multiply(name="masked_logits")([flat,student_input_att_mask_cast]) ## for loss fn

        ones = tf.ones_like(student_input_att_mask)
        att_mask = layers.Subtract()([student_input_att_mask, ones])
        att_mask = att_mask*10000
        att_mask = tf.cast(att_mask, "float32")
        energy = layers.Add()([masked_logits, att_mask])
        
        normalize = layers.Softmax()
        normalize._init_set_name("alpha")
        alpha = normalize(energy)
        ctx = layers.Dot(axes=1)([states, alpha])
    pred = layers.Dense(N_CLASSES)(ctx)
    student_model = keras.Model(inputs=student_inputs, outputs=pred, name = 'student')

    if add_metrics:
        # add internal metrics
        distillation_loss = keras.losses.kl_divergence(
                tf.nn.softmax(student_input_teacher_logits / temperature),
                tf.nn.softmax(pred / temperature)
        ) * (temperature**2)
        norm_kl = 1-tf.exp(distillation_loss*(-1)) ## output in [0,1]
        
        student_model.add_metric(norm_kl, name='dist_loss_norm', aggregation='mean') ## for visualization
        student_model.add_metric(distillation_loss, name='dist_loss', aggregation='mean') ## for visualization
        
        xai_loss = 0.5 * (1 + keras.losses.cosine_similarity(student_input_explanation, masked_logits)) ## output in [0,1]

        student_model.add_metric(xai_loss, name='xai_loss', aggregation='mean') ## for visualization
    
    if return_student_inputs:
        return student_model, student_inputs
    return student_model

## DiXtill - XAI-driven distillation

In [13]:
class DistillerXAI(keras.Model):
    def __init__(self, student, student_inputs, temperature):
        super().__init__()
        self.student = student
        self.student_inputs = student_inputs
        self.loss_tracker = keras.metrics.Mean(name='XAI-DIST-LOSS')
        self.temperature = temperature

    def compile(
        self,
        optimizer,
        metrics,
        student_loss_fn,
        distillation_loss_fn,
        xai_loss_fn,
        student_beta = 0.1,
        distillation_beta = 0.9,
        xai_beta = 0.9
    ):
        super().compile(optimizer=optimizer, metrics=metrics)
        self.student_loss_fn = student_loss_fn
        self.distillation_loss_fn = distillation_loss_fn
        self.xai_loss_fn = xai_loss_fn
        self.student_beta = student_beta
        self.distillation_beta = distillation_beta
        self.xai_beta = xai_beta

    def compute_loss(
        self, x=None, y=None, y_pred=None, sample_weight=None, allow_empty=False
    ):
        input_ids, input_explanation, teacher_logits, att_mask = x
        student_logits = y_pred

        student_loss = self.student_loss_fn(y, student_logits)

        distillation_loss = self.distillation_loss_fn(
            tf.nn.softmax(teacher_logits / self.temperature, axis=1),
           tf.nn.softmax(student_logits / self.temperature, axis=1),
        ) * (self.temperature**2)

        norm_kl = 1-tf.exp(distillation_loss*(-1))
        
        alpha_model = keras.Model(inputs=self.student_inputs, outputs=self.student.get_layer("masked_logits").output, name = 'alpha_model')
        attention_weights = alpha_model(x, training=False)
        xai_loss = 0.5 * (1 + self.xai_loss_fn(input_explanation, attention_weights))

        loss = self.student_beta * student_loss + self.distillation_beta * norm_kl + self.xai_beta * xai_loss

        self.loss_tracker.update_state(loss)
        
        return loss

    def call(self, x):
        return self.student(x)

In [14]:
TEMPERATURE = 5
BEST_WEIGHTS_FILE=f"DIXTILL_MODEL_WEIGHTS.h5"
student_model, student_inputs = get_student_model(use_attention=True, add_metrics=True, return_student_inputs=True, temperature=TEMPERATURE)

distiller_xai = DistillerXAI(student=student_model, student_inputs=student_inputs, temperature=TEMPERATURE)
m_ckpt = ModelCheckpoint(BEST_WEIGHTS_FILE, monitor=f'val_XAI-DIST-LOSS', mode='min', verbose=2,
                          save_weights_only=True, save_best_only=True)

distiller_xai.compile(
    optimizer=keras.optimizers.SGD(momentum=0.99),
    metrics=[keras.metrics.CategoricalAccuracy(), keras.losses.CategoricalCrossentropy(from_logits=True), keras.metrics.F1Score(average="macro")],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    xai_loss_fn = tf.keras.losses.CosineSimilarity()
)

distiller_xai.fit(lstm_input_train, labels_train_oh,
validation_split=0.1,
epochs=15,
callbacks=[m_ckpt],
verbose=1
)

2024-07-04 11:21:13.980133: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1101] failed to allocate 21.77GiB (23376166912 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-07-04 11:21:13.980606: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1101] failed to allocate 19.59GiB (21038548992 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-07-04 11:21:13.981005: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1101] failed to allocate 17.63GiB (18934693888 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-07-04 11:21:13.981403: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1101] failed to allocate 15.87GiB (17041223680 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory


Epoch 1/15


2024-07-04 11:21:22.120716: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-07-04 11:21:23.235697: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f4d74005780 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-04 11:21:23.235738: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A30, Compute Capability 8.0
I0000 00:00:1720092083.304911  151671 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 1: val_XAI-DIST-LOSS improved from inf to 1.31335, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 2/15
Epoch 2: val_XAI-DIST-LOSS improved from 1.31335 to 1.29809, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 3/15
Epoch 3: val_XAI-DIST-LOSS did not improve from 1.29809
Epoch 4/15
Epoch 4: val_XAI-DIST-LOSS improved from 1.29809 to 1.27905, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 5/15
Epoch 5: val_XAI-DIST-LOSS improved from 1.27905 to 1.24083, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 6/15
Epoch 6: val_XAI-DIST-LOSS improved from 1.24083 to 1.21577, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 7/15
Epoch 7: val_XAI-DIST-LOSS improved from 1.21577 to 1.14431, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 8/15
Epoch 8: val_XAI-DIST-LOSS improved from 1.14431 to 1.12743, saving model to DIXTILL_MODEL_WEIGHTS.h5
Epoch 9/15
Epoch 9: val_XAI-DIST-LOSS did not improve from 1.12743
Epoch 10/15
Epoch 10: val_XAI-DIST-LOSS improved from 1.12743 to 1.10607, saving model to 

<keras.src.callbacks.History at 0x7f4d24c81990>

In [15]:
def build_distiller_and_compile(distiller_model):
    distiller_model.compile(
    optimizer=keras.optimizers.SGD(momentum=0.99),
    metrics=[keras.metrics.CategoricalAccuracy(), keras.losses.CategoricalCrossentropy(from_logits=True), keras.metrics.F1Score(average="macro")],
    student_loss_fn=keras.losses.CategoricalCrossentropy(from_logits=True),
    distillation_loss_fn=keras.losses.KLDivergence(),
    xai_loss_fn = tf.keras.losses.CosineSimilarity()
    )
    # dummy input for init
    dummy_train = [[np.array(lstm_input_train[i][:1]) for i in range(len(lstm_input_train))], labels_train_oh[:1]]
    dummy_test = ([np.array(lstm_input_test[i][:1]) for i in range(len(lstm_input_test))], labels_test_oh[:1])
    #dummy train for var creation
    distiller_model.fit(
        *dummy_train,
        validation_split=0,
        epochs = 1,
        verbose=0
    )

In [16]:
# compute predictions for test samples

BEARISH = 0
BULLISH = 1
NEUTRAL = 2
id_to_label = {BEARISH:"BEARISH", BULLISH:"BULLISH", NEUTRAL:"NEUTRAL"}

from sklearn.metrics import classification_report
student_model, student_inputs = get_student_model(use_attention=True, add_metrics=True, return_student_inputs=True)
distiller_xai = DistillerXAI(student=student_model, student_inputs=student_inputs, temperature=TEMPERATURE)
build_distiller_and_compile(distiller_xai) # workaround for on-the-fly model building and weights loading
distiller_xai.load_weights("DIXTILL_MODEL_WEIGHTS.h5")

y_pred_logits = distiller_xai.predict(lstm_input_test)
y_pred = np.argmax(y_pred_logits, axis=1)
labels_pred = [id_to_label[x] for x in np.argmax(labels_test_oh, axis=1)]
for test_s, pred_label in zip(sentences_test, labels_pred):
    print(f"text: {test_s}\npred: {pred_label}\n")

text: heres what we like about dcc plc londccs upcoming dividend
pred: NEUTRAL

text: $icpt intercept pharma reports publication of positive results from phase 3 regenerate
pred: BULLISH

text: porsche is going all in on electric vehicles but the legendary 911 will be the last of the brands lineup to make
pred: NEUTRAL

text: would xiamen international port co ltd hkg3378 be valuable to income investors?
pred: NEUTRAL

text: hes in to defeat donald trump and rebuild america
pred: NEUTRAL

text: teslas stock ticks up after deutsche bank lifts price target which implies an 18 decline
pred: BULLISH

text: $fisi  financial institutions inc fisi ceo marty birmingham on q4 2019 results  earnings call transcript
pred: NEUTRAL

text: replacing pratt engines in indigo fleet by end of january a challenge ceo
pred: NEUTRAL

text: qatar began marketing us dollardenominated bonds the first persian gulf state to tap the debt markets since the
pred: NEUTRAL

text: nvidia earnings what to watch on thu