In [1]:
import pandas as pd
from transformers import TFRobertaForSequenceClassification, AutoTokenizer
seed_value = 29
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
np.set_printoptions(precision=2)
import tensorflow as tf
tf.random.set_seed(seed_value)
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_addons as tfa
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.metrics import auc, roc_curve

  from .autonotebook import tqdm as notebook_tqdm
2024-02-09 16:05:40.856399: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-09 16:05:41.309664: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 16:05:41.309721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 16:05:41.344578: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-09 16:05:41.4

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[6], 'GPU')
logical_devices = tf.config.list_logical_devices('GPU')

2024-02-09 16:05:46.313744: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22287 MB memory:  -> device: 6, name: NVIDIA A30, pci bus id: 0000:c1:00.0, compute capability: 8.0


In [3]:
MAX_SEQ_LEN = 200
BERT = 'vinai/bertweet-large'
N_CLASSES = 3

In [4]:
def read_data(path):
    print(f'reading {path}')
    data = pd.read_csv(path)
    data.text = data.apply(lambda row: row.text.encode('ascii', 'ignore').decode('ascii').lower(), 1)
    data.text = data.apply(lambda row: re.sub(r"http\S+", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("removed|deleted", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub(" :", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("[a-zA-Z]*lt;3[a-zA-Z]*", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("[a-zA-Z]&[a-zA-Z]*", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("[^a-zA-Z:.,;'!?\d]+", " ", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("i m |im |i'm ", "i am ", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("ive ", "i have ", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("wasnt|wasn't", "was not", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("werent|weren't", "were not", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("dont|don't", "do not", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("doesnt|doesn't", "does not", row.text).strip(), 1)
    texts = data.text.values
    labels = data.labels.values
    encoder = ohe(sparse=False)
    labels = np.array(labels).reshape(-1, 1)
    enc_labels = encoder.fit_transform(labels)
    print(f'texts shape: {texts.shape}, labels shape: {enc_labels.shape}')
    return texts, enc_labels

In [5]:
def prepare_bert_input(sentences, seq_len, bert_name):
    tokenizer = AutoTokenizer.from_pretrained(bert_name)
    encodings = tokenizer(sentences.tolist(), truncation=True, padding='max_length', max_length=seq_len)
    if bert_name.startswith("roberta") or "bertweet" in bert_name or "distilbert" in bert_name:
        input = [np.array(encodings["input_ids"]), np.array(encodings["attention_mask"])]
    else:
        input = [np.array(encodings["input_ids"]), np.array(encodings["attention_mask"]),
               np.array(encodings["token_type_ids"])]
    return input

In [6]:
sentences_train, labels_train = read_data("../dep-det-data/train.csv")
sentences_val, labels_val = read_data("../dep-det-data/dev.csv")
sentences_test, labels_test = read_data("../dep-det-data/test.csv")

# permutation train
perm_train = np.random.permutation(len(sentences_train))
sentences_train = sentences_train[perm_train]
labels_train = labels_train[perm_train]

# permutation val
perm_val = np.random.permutation(len(sentences_val))
sentences_val = sentences_val[perm_val]
labels_val = labels_val[perm_val]

# permutation test
perm_test = np.random.permutation(len(sentences_test))
sentences_test = sentences_test[perm_test]
labels_test = labels_test[perm_test]

# prepare model input
X_train = prepare_bert_input(sentences_train, MAX_SEQ_LEN, BERT)
X_val = prepare_bert_input(sentences_val, MAX_SEQ_LEN, BERT)
X_test = prepare_bert_input(sentences_test, MAX_SEQ_LEN, BERT)

reading ../dep-det-data/train.csv




texts shape: (6006,), labels shape: (6006, 3)
reading ../dep-det-data/dev.csv




texts shape: (1000,), labels shape: (1000, 3)
reading ../dep-det-data/test.csv




texts shape: (3245,), labels shape: (3245, 3)


config.json: 100%|██████████| 614/614 [00:00<00:00, 1.92MB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 2.01MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.36MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.47MB/s]


In [7]:
# Bertweet initialization for pre-fine-tuning (it builds upon a Roberta model. See HuggingFace docs.)
roberta_input_ids = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_input_ids')
roberta_input_mask = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_attention_mask')
roberta_inputs = [roberta_input_ids, roberta_input_mask]
roberta = TFRobertaForSequenceClassification.from_pretrained(BERT, num_labels=N_CLASSES)
roberta_output = roberta(roberta_inputs).logits
pre_finetuned_model = keras.Model(inputs=roberta_inputs, outputs=roberta_output)
pre_finetuned_model.summary()

tf_model.h5: 100%|██████████| 1.63G/1.63G [01:54<00:00, 14.2MB/s]
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_input_ids (InputLa  [(None, 200)]                0         []                            
 yer)                                                                                             
                                                                                                  
 roberta_attention_mask (In  [(None, 200)]                0         []                            
 putLayer)                                                                                        
                                                                                                  
 tf_roberta_for_sequence_cl  TFSequenceClassifierOutput   3553628   ['roberta_input_ids[0][0]',   
 assification (TFRobertaFor  (loss=None, logits=(None,    19         'roberta_attention_mask[0

In [8]:
max_epochs = 6
batch_size = 16

opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
loss = keras.losses.CategoricalCrossentropy(from_logits=True)
best_weights_file = f"bertweet_preft.h5"
acc = keras.metrics.CategoricalAccuracy()
f1_macro = keras.metrics.F1Score(average='macro')
m_ckpt = ModelCheckpoint(best_weights_file, monitor='val_'+f1_macro.name, mode='max', verbose=2,
                          save_weights_only=True, save_best_only=True)

pre_finetuned_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
pre_finetuned_model.summary()

### uncomment to perform the pre-fine-tuning step ###

# pre_finetuned_model.fit(
# X_train, labels_train,
# validation_data=(X_val, labels_val),
# epochs=max_epochs,
# batch_size=batch_size,
# callbacks=[m_ckpt],
# verbose=2
# )

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_input_ids (InputLa  [(None, 200)]                0         []                            
 yer)                                                                                             
                                                                                                  
 roberta_attention_mask (In  [(None, 200)]                0         []                            
 putLayer)                                                                                        
                                                                                                  
 tf_roberta_for_sequence_cl  TFSequenceClassifierOutput   3553628   ['roberta_input_ids[0][0]',   
 assification (TFRobertaFor  (loss=None, logits=(None,    19         'roberta_attention_mask[0

In [9]:
# test the model
from sklearn.metrics import classification_report
best_weights_file = f"bertweet.h5"
pre_finetuned_model.load_weights(best_weights_file)
pre_finetuned_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
y_pred_probs = pre_finetuned_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
labels_test_decode = np.argmax(labels_test, axis=1)

report = classification_report(labels_test_decode, y_pred, digits=3)
print(report)

2024-02-09 16:08:09.403288: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


              precision    recall  f1-score   support

           0      0.435     0.539     0.481       228
           1      0.750     0.772     0.761      2169
           2      0.491     0.423     0.455       848

    accuracy                          0.664      3245
   macro avg      0.559     0.578     0.566      3245
weighted avg      0.660     0.664     0.661      3245

