In [1]:
import pandas as pd
from transformers import TFRobertaForSequenceClassification, AutoTokenizer
seed_value = 29
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
np.set_printoptions(precision=2)
import tensorflow as tf
tf.random.set_seed(seed_value)
import tensorflow.keras as keras
import tensorflow_addons as tfa
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_addons as tfa
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.metrics import auc, roc_curve

  from .autonotebook import tqdm as notebook_tqdm

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [3]:
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(physical_devices[6], 'GPU')
# logical_devices = tf.config.list_logical_devices('GPU')

physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
logical_devices = tf.config.list_logical_devices('GPU')

In [4]:
MAX_SEQ_LEN = 200
BERT = 'vinai/bertweet-large'
N_CLASSES = 3

In [5]:
def read_data(path):
    print(f'reading {path}')
    data = pd.read_csv(path)
    data.text = data.apply(lambda row: row.text.encode('ascii', 'ignore').decode('ascii').lower(), 1)
    data.text = data.apply(lambda row: re.sub(r"http\S+", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("removed|deleted", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub(" :", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("[a-zA-Z]*lt;3[a-zA-Z]*", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("[a-zA-Z]&[a-zA-Z]*", "", row.text), 1)
    data.text = data.apply(lambda row: re.sub("[^a-zA-Z:.,;'!?\d]+", " ", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("i m |im |i'm ", "i am ", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("ive ", "i have ", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("wasnt|wasn't", "was not", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("werent|weren't", "were not", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("dont|don't", "do not", row.text).strip(), 1)
    data.text = data.apply(lambda row: re.sub("doesnt|doesn't", "does not", row.text).strip(), 1)
    texts = data.text.values
    labels = data.labels.values
    encoder = ohe(sparse=False)
    labels = np.array(labels).reshape(-1, 1)
    enc_labels = encoder.fit_transform(labels)
    print(f'texts shape: {texts.shape}, labels shape: {enc_labels.shape}')
    return texts, enc_labels

In [6]:
def prepare_bert_input(sentences, seq_len, bert_name):
    tokenizer = AutoTokenizer.from_pretrained(bert_name)
    encodings = tokenizer(sentences.tolist(), truncation=True, padding='max_length', max_length=seq_len)
    if bert_name.startswith("roberta") or "bertweet" in bert_name or "distilbert" in bert_name:
        input = [np.array(encodings["input_ids"]), np.array(encodings["attention_mask"])]
    else:
        input = [np.array(encodings["input_ids"]), np.array(encodings["attention_mask"]),
               np.array(encodings["token_type_ids"])]
    return input

In [8]:
# from sklearn.preprocessing import OneHotEncoder

# def read_data(file_path):
#     # Read the CSV file
#     data = pd.read_csv(file_path)
    
#     # Extract texts and labels
#     texts = data['text'].values
#     labels = data['labels'].values
    
#     # Initialize the OneHotEncoder with the correct parameter
#     encoder = OneHotEncoder(sparse_output=False)
    
#     # Reshape labels and apply one-hot encoding
#     labels = np.array(labels).reshape(-1, 1)
#     enc_labels = encoder.fit_transform(labels)
    
#     return texts, enc_labels

sentences_train, labels_train = read_data("../dep-det-data/train.csv")
sentences_val, labels_val = read_data("../dep-det-data/dev.csv")
sentences_test, labels_test = read_data("../dep-det-data/test.csv")

# sentences_train, labels_train = read_data("train.csv")
# sentences_val, labels_val = read_data("dev.csv")
# sentences_test, labels_test = read_data("test.csv")

# permutation train
perm_train = np.random.permutation(len(sentences_train))
sentences_train = sentences_train[perm_train]
labels_train = labels_train[perm_train]

# permutation val
perm_val = np.random.permutation(len(sentences_val))
sentences_val = sentences_val[perm_val]
labels_val = labels_val[perm_val]

# permutation test
perm_test = np.random.permutation(len(sentences_test))
sentences_test = sentences_test[perm_test]
labels_test = labels_test[perm_test]

# prepare model input
X_train = prepare_bert_input(sentences_train, MAX_SEQ_LEN, BERT)
X_val = prepare_bert_input(sentences_val, MAX_SEQ_LEN, BERT)
X_test = prepare_bert_input(sentences_test, MAX_SEQ_LEN, BERT)



In [9]:
# Bertweet initialization for pre-fine-tuning (it builds upon a Roberta model. See HuggingFace docs.)
roberta_input_ids = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_input_ids')
roberta_input_mask = layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name='roberta_attention_mask')
roberta_inputs = [roberta_input_ids, roberta_input_mask]
roberta = TFRobertaForSequenceClassification.from_pretrained(BERT, num_labels=N_CLASSES)
roberta_output = roberta(roberta_inputs).logits
pre_finetuned_model = keras.Model(inputs=roberta_inputs, outputs=roberta_output)
pre_finetuned_model.summary()

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_input_ids (InputLa  [(None, 200)]                0         []                            
 yer)                                                                                             
                                                                                                  
 roberta_attention_mask (In  [(None, 200)]                0         []                            
 putLayer)                                                                                        
                                                                                                  
 tf_roberta_for_sequence_cl  TFSequenceClassifierOutput   3553628   ['roberta_input_ids[0][0]',   
 assification (TFRobertaFor  (loss=None, logits=(None,    19         'roberta_attention_mask[0

In [10]:
max_epochs = 6
batch_size = 16

opt = tfa.optimizers.RectifiedAdam(learning_rate=3e-5)
loss = keras.losses.CategoricalCrossentropy(from_logits=True)
best_weights_file = f"bertweet_preft.h5"
acc = keras.metrics.CategoricalAccuracy()
f1_macro = keras.metrics.F1Score(average='macro')
m_ckpt = ModelCheckpoint(best_weights_file, monitor='val_'+f1_macro.name, mode='max', verbose=2,
                          save_weights_only=True, save_best_only=True)

pre_finetuned_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
pre_finetuned_model.summary()

### uncomment to perform the pre-fine-tuning step ###

# pre_finetuned_model.fit(
# X_train, labels_train,
# validation_data=(X_val, labels_val),
# epochs=max_epochs,
# batch_size=batch_size,
# callbacks=[m_ckpt],
# verbose=2
# )

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 roberta_input_ids (InputLa  [(None, 200)]                0         []                            
 yer)                                                                                             
                                                                                                  
 roberta_attention_mask (In  [(None, 200)]                0         []                            
 putLayer)                                                                                        
                                                                                                  
 tf_roberta_for_sequence_cl  TFSequenceClassifierOutput   3553628   ['roberta_input_ids[0][0]',   
 assification (TFRobertaFor  (loss=None, logits=(None,    19         'roberta_attention_mask[0

In [19]:
# test the model
from sklearn.metrics import classification_report
best_weights_file = f"bertweet.h5"
pre_finetuned_model.load_weights(best_weights_file)
pre_finetuned_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
y_pred_probs = pre_finetuned_model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
labels_test_decode = np.argmax(labels_test, axis=1)

report = classification_report(labels_test_decode, y_pred, digits=3)
print(report)

# if os.path.isfile(best_weights_file):
#     pre_finetuned_model.load_weights(best_weights_file)
#     pre_finetuned_model.compile(loss=loss, optimizer=opt, metrics=[f1_macro,acc])
#     y_pred_probs = pre_finetuned_model.predict(X_test)
#     y_pred = np.argmax(y_pred_probs, axis=1)
#     labels_test_decode = np.argmax(labels_test, axis=1)

#     report = classification_report(labels_test_decode, y_pred, digits=3)
#     print(report)
# else:
#     print(f"The file {best_weights_file} does not exist.")


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'bertweet.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)