In [1]:
import pandas as pd
from transformers import TFRobertaForSequenceClassification, AutoTokenizer
# Single seed shared by every RNG seeded below (hash seed, `random`, NumPy, TensorFlow).
seed_value = 29
import os
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably only affects child processes - confirm if hash determinism matters.
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
# Print NumPy floats with two digits of precision.
np.set_printoptions(precision=2)
import tensorflow as tf
tf.random.set_seed(seed_value)
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
# NOTE(review): duplicate of the tensorflow_addons import a few lines above.
import tensorflow_addons as tfa
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.metrics import auc, roc_curve

  from .autonotebook import tqdm as notebook_tqdm
2024-02-09 17:03:22.587413: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-09 17:03:22.625170: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 17:03:22.625196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 17:03:22.626132: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-09 17:03:22.6

In [2]:
# Restrict TensorFlow to the first physical GPU when at least one is present.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
# On a machine without a GPU the list above is empty; indexing it would raise
# IndexError, so in that case the device configuration is left untouched.
logical_devices = tf.config.list_logical_devices('GPU')

2024-02-09 17:03:25.488567: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22056 MB memory:  -> device: 0, name: NVIDIA A30, pci bus id: 0000:12:00.0, compute capability: 8.0


In [3]:
# Length every tokenized text is padded/truncated to; also the size of the model's input layers.
MAX_SEQ_LEN = 200
# Model identifier handed to the `from_pretrained` calls (tokenizer and encoder).
BERT = 'vinai/bertweet-large'
# Number of target classes (width of the one-hot labels and of the final softmax layer).
N_CLASSES = 3

In [4]:
# Ordered (pattern, replacement) pairs that expand common contractions.
# NOTE(review): the patterns are not anchored on word boundaries, so they also fire
# inside longer words (e.g. "him " -> "hi am "). They are kept exactly as they were,
# presumably so the preprocessing stays in sync with already-trained checkpoints.
_CONTRACTION_RULES = (
    ("i m |im |i'm ", "i am "),
    ("ive ", "i have "),
    ("wasnt|wasn't", "was not"),
    ("werent|weren't", "were not"),
    ("dont|don't", "do not"),
    ("doesnt|doesn't", "does not"),
)


def _clean_text(text):
    """Normalise one raw text: ASCII-only, lower-case, no URLs/markup, expanded contractions."""
    # Drop every non-ASCII character, then lower-case.
    text = text.encode('ascii', 'ignore').decode('ascii').lower()
    text = re.sub(r"http\S+", "", text)                 # URLs
    text = re.sub("removed|deleted", "", text)          # "removed" / "deleted" markers
    text = re.sub(" :", "", text)
    text = re.sub("[a-zA-Z]*lt;3[a-zA-Z]*", "", text)   # leftovers of an HTML-escaped "<3"
    text = re.sub("[a-zA-Z]&[a-zA-Z]*", "", text)       # leftovers of HTML entities
    # Collapse every run of characters outside the allowed set into a single space.
    text = re.sub(r"[^a-zA-Z:.,;'!?\d]+", " ", text).strip()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text).strip()
    return text


def read_data(path):
    """Read a CSV split, clean its texts and one-hot encode its labels.

    Parameters
    ----------
    path : str
        CSV file with a ``text`` column and a ``labels`` column.

    Returns
    -------
    texts : numpy.ndarray
        1-D object array with the cleaned texts, in file order.
    enc_labels : numpy.ndarray
        Float array of shape ``(n_rows, n_distinct_labels)``; column ``j``
        corresponds to the ``j``-th distinct label in sorted order.
    """
    print(f'reading {path}')
    data = pd.read_csv(path)
    texts = data.text.map(_clean_text).to_numpy(dtype=object)
    labels = data.labels.to_numpy()
    # One-hot encode with NumPy: columns follow the sorted distinct labels, which is
    # also how OneHotEncoder orders its categories. This avoids OneHotEncoder's
    # `sparse=` keyword, which newer scikit-learn releases no longer accept.
    classes, class_idx = np.unique(labels, return_inverse=True)
    enc_labels = np.eye(len(classes))[class_idx.reshape(-1)]
    print(f'texts shape: {texts.shape}, labels shape: {enc_labels.shape}')
    return texts, enc_labels

In [5]:
def prepare_bert_input(sentences, seq_len, bert_name):
    """Tokenize `sentences` and return the arrays the encoder takes as input.

    Every text is truncated/padded to `seq_len` tokens with the tokenizer that
    belongs to `bert_name`. For RoBERTa-, BERTweet- and DistilBERT-style names the
    result is ``[input_ids, attention_mask]``; for any other name it is
    ``[input_ids, token_type_ids, attention_mask]``.
    """
    tok = AutoTokenizer.from_pretrained(bert_name)
    encoded = tok(sentences.tolist(), truncation=True, padding='max_length', max_length=seq_len)
    # These model families are fed without segment (token type) ids.
    without_token_types = (
        bert_name.startswith("roberta")
        or "bertweet" in bert_name
        or "distilbert" in bert_name
    )
    if without_token_types:
        fields = ("input_ids", "attention_mask")
    else:
        fields = ("input_ids", "token_type_ids", "attention_mask")
    return [np.array(encoded[field]) for field in fields]

In [6]:
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
nlp = English()

def compute_att_mask(ids):
    """Build the custom attention mask for a batch of encoded sentences.

    Parameters
    ----------
    ids : iterable of iterables of int
        Token ids, one sequence per sentence.

    Returns
    -------
    numpy.ndarray
        One row per sentence with 1 for tokens that are kept and 0 for tokens
        whose decoded text is a stop word, is punctuation, has at most two
        characters, or contains "<" (which the special tokens do).
    """
    tokenizer = AutoTokenizer.from_pretrained(BERT)
    # The flag depends only on the token id, so each distinct id is decoded and
    # looked up once instead of once per occurrence.
    keep_by_id = {}

    def _keep(token_id):
        token_id = int(token_id)
        flag = keep_by_id.get(token_id)
        if flag is None:
            lexeme = nlp.vocab[tokenizer.decode([token_id]).strip()]
            dropped = (lexeme.is_stop or lexeme.is_punct
                       or len(lexeme.text) <= 2 or "<" in lexeme.text)
            flag = 0 if dropped else 1
            keep_by_id[token_id] = flag
        return flag

    no_sw_att_mask = [np.array([_keep(i) for i in enc_sentence]) for enc_sentence in ids]
    return np.array(no_sw_att_mask)

In [7]:
# Load the three splits (texts are cleaned and labels one-hot encoded by read_data).
sentences_train, labels_train = read_data("../dep-det-data/train.csv")
sentences_val, labels_val = read_data("../dep-det-data/dev.csv")
sentences_test, labels_test = read_data("../dep-det-data/test.csv")

# Draw one permutation per split, in the order train -> val -> test.
perm_train = np.random.permutation(len(sentences_train))
perm_val = np.random.permutation(len(sentences_val))
perm_test = np.random.permutation(len(sentences_test))

# Shuffle texts and labels of each split with the same permutation.
sentences_train, labels_train = sentences_train[perm_train], labels_train[perm_train]
sentences_val, labels_val = sentences_val[perm_val], labels_val[perm_val]
sentences_test, labels_test = sentences_test[perm_test], labels_test[perm_test]

# Tokenize every split into [input_ids, attention_mask].
X_train, X_val, X_test = (
    prepare_bert_input(split_sentences, MAX_SEQ_LEN, BERT)
    for split_sentences in (sentences_train, sentences_val, sentences_test)
)

# Compute the custom attention mask from the input ids of every split ...
mask_train, mask_val, mask_test = (
    compute_att_mask(split_inputs[0])
    for split_inputs in (X_train, X_val, X_test)
)

# ... and append it as the third model input.
X_train = X_train[:2] + [mask_train]
X_val = X_val[:2] + [mask_val]
X_test = X_test[:2] + [mask_test]

reading ../dep-det-data/train.csv




texts shape: (6006,), labels shape: (6006, 3)
reading ../dep-det-data/dev.csv




texts shape: (1000,), labels shape: (1000, 3)
reading ../dep-det-data/test.csv




texts shape: (3245,), labels shape: (3245, 3)


In [8]:
# BERT-XDD model initialization
# Three integer inputs of length MAX_SEQ_LEN: token ids, the tokenizer's attention mask,
# and the custom mask that is applied to the attention energies further down.
roberta_model_input_ids = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_model_input_ids')
roberta_model_input_mask = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_model_attention_mask')
custom_att_mask = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='custom_att_mask')
roberta_model_inputs = [roberta_model_input_ids, roberta_model_input_mask]
# Only the encoder (`.roberta`) is wired into the graph below; the classifier head and the
# config are unpacked here but not used in this cell.
roberta_model = TFRobertaForSequenceClassification.from_pretrained(BERT, num_labels=N_CLASSES)
roberta_model_encoder, roberta_model_classifier, config = roberta_model.roberta, roberta_model.classifier, roberta_model.config

encoder_output = roberta_model_encoder(roberta_model_inputs)
# First element of the encoder output (presumably the per-token last hidden state - confirm).
hidden_state = encoder_output[0]

# Units per LSTM direction; the projection below uses twice this width.
units=256

# BiLSTM over the encoder states. Only the per-position outputs (`states`) are used below;
# the returned final states are ignored.
states, forward_h, _, backward_h, _ = layers.Bidirectional(layers.LSTM(units, return_sequences=True, return_state=True))(hidden_state)
# Attention scoring: bias-free tanh projection, then a bias-free linear layer producing one
# energy per position, flattened to one vector per sample.
hidden = layers.Dense(units*2, activation="tanh", use_bias=False)(states)
out = layers.Dense(1, activation='linear', use_bias=False)(hidden)
energy = layers.Flatten()(out)
# Additive mask: (custom_att_mask - 1) * 10000 is 0 where the mask is 1 and -10000 where it
# is 0, so masked positions receive a strongly negative energy before the softmax.
ones = tf.ones_like(custom_att_mask)
att_mask = layers.Subtract()([custom_att_mask, ones])
att_mask = att_mask*10000
att_mask = tf.cast(att_mask, "float32")
flat = layers.Add()([energy, att_mask])
# Softmax over the masked energies. The layer is named "alpha" so its output can later be
# fetched with get_layer("alpha").
# NOTE(review): _init_set_name is a private Keras method; layers.Softmax(name="alpha") looks
# like the public equivalent - confirm before changing.
normalize = layers.Softmax()
normalize._init_set_name("alpha")
alpha = normalize(flat)
# Context vector: Dot(axes=1) contracts axis 1 of hidden_state with alpha, i.e. a weighted
# sum of the encoder states (not of the BiLSTM outputs) using the attention weights.
ctx = layers.Dot(axes=1)([hidden_state, alpha])
pred = layers.Dense(N_CLASSES, activation="softmax")(ctx)

BERT_XDD_model = keras.Model(inputs=[roberta_model_input_ids, roberta_model_input_mask, custom_att_mask], outputs=pred)
BERT_XDD_model.summary()

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-02-09 17:04:02.293529: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_model_input_ids (I  [(None, 200)]                0         []                            
 nputLayer)                                                                                       
                                                                                                  
 roberta_model_attention_ma  [(None, 200)]                0         []                            
 sk (InputLayer)                                                                                  
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   3543101   ['roberta_model_input_ids[0][0
 r)                          ngAndCrossAttentions(last_   44        ]',                       

In [9]:
# Load the pre-finetuned encoder weights into BERT-XDD (the encoder is frozen in the next cell).

# Bertweet pre-finetuned (Roberta architecture)
# A second sequence-classification model is wrapped in a Keras model with its own inputs so
# that the weights file below can be loaded into it.
roberta_input_ids = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_input_ids')
roberta_input_mask = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_attention_mask')
roberta_inputs = [roberta_input_ids, roberta_input_mask]
roberta = TFRobertaForSequenceClassification.from_pretrained(BERT, num_labels=N_CLASSES)
roberta_output = roberta(roberta_inputs).logits
pre_finetuned_model = keras.Model(inputs=roberta_inputs, outputs=roberta_output)

# NOTE(review): the positional indices assume layers 0 and 1 are the two inputs and layer 2
# is the wrapped transformers model - this depends on Keras' layer ordering.
pre_finetuned_roberta_input_layer = [pre_finetuned_model.layers[0], pre_finetuned_model.layers[1]]
pre_finetuned_roberta_layer = pre_finetuned_model.layers[2]
pre_finetuned_encoder, pre_finetuned_classifier = pre_finetuned_roberta_layer.roberta, pre_finetuned_roberta_layer.classifier

pre_finetuned_model.load_weights("../1_pre-fine-tuning/bertweet.h5")
# Index 2 of BERT_XDD_model is the `roberta` encoder layer according to the model summary.
# Only the encoder weights are copied; the pre-finetuned classifier head is not used.
BERT_XDD_encoder = BERT_XDD_model.layers[2]
BERT_XDD_encoder.set_weights(pre_finetuned_encoder.get_weights())

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# train the model's head
# Freeze the encoder so that only the layers stacked on top of it are updated.
BERT_XDD_encoder.trainable = False
BERT_XDD_model.summary()

max_epochs = 6
batch_size = 16
opt = tf.optimizers.Adam()
loss = keras.losses.CategoricalCrossentropy()
best_weights_file = "BERT-XDD_TL.h5"
acc = keras.metrics.CategoricalAccuracy()
f1_macro = keras.metrics.F1Score(average='macro')
# Save weights only, and only when the validation macro-F1 improves.
m_ckpt = ModelCheckpoint(best_weights_file, monitor='val_'+f1_macro.name, mode='max', verbose=2,
                          save_weights_only=True, save_best_only=True)

BERT_XDD_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])

# Training is opt-in: with the default False this cell only builds and compiles, and the
# checkpoint already on disk is left untouched. Set to True to (re)train the model's head.
TRAIN_HEAD = False
if TRAIN_HEAD:
    BERT_XDD_model.fit(
        X_train, labels_train,
        validation_data=(X_val, labels_val),
        epochs=max_epochs,
        batch_size=batch_size,
        callbacks=[m_ckpt],
        verbose=2
    )

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_model_input_ids (I  [(None, 200)]                0         []                            
 nputLayer)                                                                                       
                                                                                                  
 roberta_model_attention_ma  [(None, 200)]                0         []                            
 sk (InputLayer)                                                                                  
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   3543101   ['roberta_model_input_ids[0][0
 r)                          ngAndCrossAttentions(last_   44        ]',                       

In [11]:
from sklearn.metrics import classification_report

# Evaluate the head-only checkpoint on the test split.
best_weights_file = "BERT-XDD_TL.h5"
BERT_XDD_model.load_weights(best_weights_file)
BERT_XDD_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])

# Predicted class probabilities -> predicted class index.
y_pred_probs = BERT_XDD_model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
# One-hot gold labels -> class index.
labels_test_decode = labels_test.argmax(axis=1)

report = classification_report(labels_test_decode, y_pred, digits=3)
print(report)

  1/102 [..............................] - ETA: 14:49

2024-02-09 17:04:23.967660: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902


              precision    recall  f1-score   support

           0      0.433     0.557     0.488       228
           1      0.781     0.631     0.698      2169
           2      0.435     0.614     0.509       848

    accuracy                          0.622      3245
   macro avg      0.550     0.601     0.565      3245
weighted avg      0.666     0.622     0.634      3245



In [12]:
## end-to-end fine-tuning

# Start from the head-only checkpoint and make every layer trainable again.
best_weights_file = "BERT-XDD_TL.h5"
BERT_XDD_model.load_weights(best_weights_file)
BERT_XDD_model.trainable = True

max_epochs = 2
batch_size = 16
opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
loss = keras.losses.CategoricalCrossentropy()
# From here on `best_weights_file` points at the fine-tuned checkpoint.
best_weights_file = "BERT-XDD_FT.h5"
acc = keras.metrics.CategoricalAccuracy()
f1_macro = keras.metrics.F1Score(average='macro')
# Save weights only, and only when the validation macro-F1 improves.
m_ckpt = ModelCheckpoint(best_weights_file, monitor='val_'+f1_macro.name, mode='max', verbose=2,
                          save_weights_only=True, save_best_only=True)

# Compile after changing `trainable` so the new setting is taken into account.
BERT_XDD_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
BERT_XDD_model.summary()

# Fine-tuning is opt-in: with the default False this cell only loads weights and compiles,
# and the checkpoint already on disk is left untouched. Set to True to run the
# end-to-end fine-tuning step.
RUN_FINE_TUNING = False
if RUN_FINE_TUNING:
    BERT_XDD_model.fit(
        X_train, labels_train,
        validation_data=(X_val, labels_val),
        epochs=max_epochs,
        batch_size=batch_size,
        callbacks=[m_ckpt],
        verbose=2
    )

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_model_input_ids (I  [(None, 200)]                0         []                            
 nputLayer)                                                                                       
                                                                                                  
 roberta_model_attention_ma  [(None, 200)]                0         []                            
 sk (InputLayer)                                                                                  
                                                                                                  
 roberta (TFRobertaMainLaye  TFBaseModelOutputWithPooli   3543101   ['roberta_model_input_ids[0][0
 r)                          ngAndCrossAttentions(last_   44        ]',                       

In [13]:
# Evaluate the fine-tuned checkpoint on the test split.
from sklearn.metrics import classification_report

best_weights_file = "BERT-XDD_FT.h5"
BERT_XDD_model.load_weights(best_weights_file)
BERT_XDD_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])

# Predicted class probabilities -> predicted class index.
y_pred_probs = BERT_XDD_model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
# One-hot gold labels -> class index.
labels_test_decode = labels_test.argmax(axis=1)

report = classification_report(labels_test_decode, y_pred, digits=3)
print(report)

              precision    recall  f1-score   support

           0      0.498     0.535     0.516       228
           1      0.771     0.722     0.745      2169
           2      0.478     0.546     0.510       848

    accuracy                          0.663      3245
   macro avg      0.582     0.601     0.590      3245
weighted avg      0.675     0.663     0.668      3245



In [14]:
# create the attention inspection model (for explainability purposes)
best_weights_file = "BERT-XDD_FT.h5"
BERT_XDD_model.load_weights(best_weights_file)
BERT_XDD_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
# weight_model is built from BERT_XDD_model's own input tensors and the output tensor of
# its "alpha" layer, so it reuses the very same layer objects and already sees the weights
# loaded above. No weight copy is needed (pairing the two models' layer lists by position
# would also be fragile), and a model that is only used for predict() needs no compile().
weight_model = keras.Model(
    inputs=[roberta_model_input_ids, roberta_model_input_mask, custom_att_mask],
    outputs=BERT_XDD_model.get_layer("alpha").output)

# Attention weights of every test sample (output of the "alpha" softmax layer).
out_p = weight_model.predict(X_test)



In [15]:
def get_label_name(arr):
    """Return the class name for the position of the largest value in `arr`.

    `arr` is a score vector (probabilities or a one-hot label); index 0 maps to
    "SEVERE_DEP", index 1 to "DEP" and index 2 to "NORMAL".
    """
    names_by_index = {0:"SEVERE_DEP", 1: "DEP", 2: "NORMAL"}
    winner = np.argmax(arr)
    return names_by_index[winner]

# Tokenizer used to turn token ids back into text for the explanations.
tokenizer = AutoTokenizer.from_pretrained(BERT)
def get_word_weights(id, top_n=200):
    """Collect the attention weights of one test sample as an explanation dict.

    Parameters
    ----------
    id : int
        Index of the sample in the test split.
    top_n : int
        Maximum number of distinct tokens to report, highest weight first.

    Returns
    -------
    dict
        "sentence" (decoded text up to the first "</s>"), "pred_label",
        "real_label" and "weights" (token text -> attention weight). When a token
        occurs several times, the largest of its weights is reported.
    """
    sample_ids = X_test[0][id]
    best_weight = {}
    for token_id, attention in zip(sample_ids, out_p[id]):
        piece = tokenizer.decode([token_id])
        if piece in best_weight:
            attention = max(best_weight[piece], attention)
        best_weight[piece] = attention
    ranked = sorted(best_weight.items(), key=lambda item: item[1], reverse=True)
    decoded = tokenizer.decode(sample_ids)
    # Cut the decoded text at the first end-of-sequence marker (padding follows it).
    cut = decoded.index("</s>")
    return {"sentence": decoded[:cut],
            "pred_label":get_label_name(y_pred_probs[id]),
            "real_label":get_label_name(labels_test[id]),
            "weights": dict(ranked[:top_n])}

In [16]:
# Print the explanation dictionary of the first ten test samples, each followed by a blank line.
for test_id in range(10):
    explanation = get_word_weights(test_id)
    print(explanation, "\n")

{'sentence': '<s>at the doctor... getting refills on pain meds for my shitty back. already made to feel like a drug addict despite taking only 1 pill per day.... anyway...the nurse chomping her gum in my ear and clickety clacking her laptop keys with her bedazzled nails squints her fake tanned eyes over her reading glasses and blurts it says i have to ask some screener questions have you lost interest in things you normally enjoy? every day, sometimes, once or twice, or never? in my head : yes. i do not have any desire to do anything. nothing sounds fun. if i didnt have responsibilities id likely spend days in my bed. out loud : nah. in the past month have you felt that your life was out of control or have you felt overwhelmed? every day, sometimes, once or twice, never? in my head : constantly our loud : never ok last one, do you ever feel hopeless or sad for no reason', 'pred_label': 'DEP', 'real_label': 'DEP', 'weights': {' pill': 0.0482861, ' despite': 0.047813658, ' taking': 0.047