In [11]:
import time
import datasets
import pandas
import transformers
import tensorflow as tf
import keras
from keras import layers
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import numpy

train_path="data/train.csv"
dev_path="data/dev.csv"

### Pre-processing

In [12]:
# use the tokenizer from DistilRoBERTa
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base")

def tokenize(examples):
    """Converts the text of each example to "input_ids", a sequence of integers
    representing 1-hot vectors for each token in the text"""
    return tokenizer(examples["text"], truncation=True, max_length=64,
                     padding="max_length")

# load the CSVs into Huggingface datasets to allow use of the tokenizer
hf_dataset = datasets.load_dataset("csv", data_files={
    "train": train_path, "validation": dev_path})

# the labels are the names of all columns except the first
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Converts the label columns into a list of 0s and 1s"""
    # the float here is because converting hf to tf data requires a list or array of labels
    return {"labels": [float(example[l]) for l in labels]}

# convert text and labels to format expected by model
hf_dataset = hf_dataset.map(gather_labels)
hf_dataset = hf_dataset.map(tokenize, batched=True)



  0%|          | 0/2 [00:00<?, ?it/s]



### Convert to TF format

In [13]:
# split train and val into their own objects
hf_train = hf_dataset['train']
hf_dev = hf_dataset['validation']

# rename column for embeddings layer
hf_train = hf_train.rename_column("input_ids", "embedding_inputs")
hf_dev = hf_dev.rename_column("input_ids", "embedding_inputs")

### Model

In [23]:
# define grid search parameters and loop

for b in batch_size:
        
    # convert Huggingface datasets to Tensorflow datasets
    train_dataset = hf_train.to_tf_dataset(
        columns="embedding_inputs",
        label_cols="labels",
        batch_size=32,
        shuffle=True)
    dev_dataset = hf_dev.to_tf_dataset(
        columns="embedding_inputs",
        label_cols="labels",
        batch_size=32)

    # define a model with a single fully connected layer
    model = tf.keras.Sequential()
    model.add(layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=16,
        mask_zero=True))
    model.add(layers.Bidirectional(layers.GRU(64)))
    # final processing with a dense RELU layer
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(
        units=len(labels),
        activation='sigmoid'))

    # specify compilation hyperparameters
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])

    # set time for run time and model naming
    start_time = time.time()
    desc = f"batch-size_{b}"
    model_name = f"{desc}_time_{start_time}"

    # fit the model to the training data, monitoring F1 on the dev data
    print(model_name)
    model.fit(
        train_dataset,
        epochs=20,
        validation_data=dev_dataset,
        callbacks=[
            ModelCheckpoint(
                filepath=f"checkpoints/{model_name}",
                monitor="val_f1_score",
                mode="max",
                save_best_only=True),
            TensorBoard(
                log_dir=f"logs/{model_name}"),
            EarlyStopping(
                monitor="val_f1_score",
                min_delta=0.25,
                patience=5,
                start_from_epoch=2)
            ])

    time_elapsed = (time.time() - start_time) / 60
    print(f"Time Elapsed: {time_elapsed} min.")

Epoch 1/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 2/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 3/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 4/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 5/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 6/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 7/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 8/20
Epoch 9/20


INFO:tensorflow:Assets written to: checkpoints/batch-size_256_time_1701558399.5076332/assets


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Time Elapsed: 6.189360284805298 min.
