In [2]:
import time
import datasets
import pandas as pd
import transformers
import tensorflow as tf
from keras import layers
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import numpy

# Relative locations of the training and development (validation) CSV splits.
train_path = "data/train.csv"
dev_path = "data/dev.csv"

2023-12-04 17:33:59.365918: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-04 17:33:59.410895: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-04 17:33:59.626141: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 17:33:59.626176: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 17:33:59.627333: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

### Pre-processing

In [3]:
# Reuse the pretrained DistilRoBERTa tokenizer; only its vocabulary is needed,
# the embeddings themselves are trained from scratch further down.
# NOTE(review): the Embedding layer later masks token id 0 as padding. The
# RoBERTa vocabulary normally reserves id 0 for <s> and id 1 for <pad>, so
# confirm that `pad_token_id=0` really changes the id used for padding
# (e.g. inspect `tokenizer.pad_token_id` and a padded example) -- TODO confirm.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "distilroberta-base",
    pad_token_id=0,
)

def tokenize(examples):
    """Tokenize the "text" field of a batch of examples.

    Every text is truncated or padded to exactly 64 tokens. The value returned
    by the tokenizer is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        max_length=64,
        truncation=True,
    )

# Read both CSV splits into one Huggingface dataset object so that the
# tokenizer can be mapped over them.
csv_splits = {"train": train_path, "validation": dev_path}
hf_dataset = datasets.load_dataset("csv", data_files=csv_splits)

# Every column after the first one is treated as a label column.
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Collect the label columns of one example into a single "labels" list.

    The columns are read in the order given by the module-level `labels`.
    """
    # values are cast to float because converting hf to tf data requires a
    # list or array of labels
    values = []
    for column in labels:
        values.append(float(example[column]))
    return {"labels": values}

# Add the "labels" list to every example first, then tokenize the text in
# batches, so both splits end up in the format the model expects.
hf_dataset = (
    hf_dataset
    .map(gather_labels)
    .map(tokenize, batched=True)
)

Found cached dataset csv (/home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

### Convert to TF format

In [4]:
# Pull each split out of the dataset dict and rename the token-id column to
# the name used for the embedding layer's input.
hf_train = hf_dataset['train'].rename_column("input_ids", "embedding_inputs")
hf_dev = hf_dataset['validation'].rename_column("input_ids", "embedding_inputs")

# Settings shared by both Huggingface -> Tensorflow conversions.
tf_conversion_args = {
    "columns": "embedding_inputs",
    "label_cols": "labels",
    "batch_size": 32,
}

# Only the training split is shuffled; the dev split keeps its original order.
train_dataset = hf_train.to_tf_dataset(shuffle=True, **tf_conversion_args)
dev_dataset = hf_dev.to_tf_dataset(**tf_conversion_args)

### Model

In [13]:
def _build_callbacks(model_name, checkpoint):
    """Return the list of Keras callbacks for one training run.

    Args:
        model_name: Name used for the TensorBoard log directory and, when
            checkpointing, for the checkpoint directory.
        checkpoint: If truthy, the best model (by validation F1) is saved and
            a tighter early-stopping rule is used.
    """
    # F1 is a higher-is-better metric. Keras' default "auto" mode only infers
    # "max" for accuracy/AUC-style metric names, so the direction is stated
    # explicitly on every callback that monitors val_f1_score.
    tensorboard = TensorBoard(log_dir=f"logs/{model_name}")

    if checkpoint:
        return [
            ModelCheckpoint(
                filepath=f"checkpoints/{model_name}",
                monitor="val_f1_score",
                mode="max",
                save_best_only=True),
            tensorboard,
            EarlyStopping(
                monitor="val_f1_score",
                mode="max",
                min_delta=0.009,
                patience=3,
                start_from_epoch=2)]

    # NOTE(review): min_delta=0.25 means only a 0.25 jump in F1 counts as an
    # improvement -- confirm this value is intended.
    return [
        tensorboard,
        EarlyStopping(
            monitor="val_f1_score",
            mode="max",
            min_delta=0.25,
            patience=5,
            start_from_epoch=2)]


def run_model(embed_out, gru_units, dropout_rate=0.2, epochs=1, checkpoint=True,
              desc="", *, relu_units=64, deep_rnn=True, relu_layer=True,
              multi_dropout=False):
    """Build, compile and fit one embedding + bidirectional-GRU classifier.

    The model is trained on the module-level `train_dataset` and validated on
    `dev_dataset`; the vocabulary size comes from `tokenizer` and the number of
    sigmoid outputs from `labels`.

    Args:
        embed_out: Output dimension of the embedding layer.
        gru_units: Units in each GRU direction.
        dropout_rate: Rate of the dropout applied after the recurrent stack.
        epochs: Maximum number of training epochs.
        checkpoint: If truthy, save the best model under
            ``checkpoints/<model name>``.
        desc: Optional prefix for the model name; an empty string gives a name
            of the form ``e_<embed>_g_<gru>_r_<relu>``.
        relu_units: Width of the first dense ReLU layer (also part of the
            model name).
        deep_rnn: If True, stack two bidirectional GRU layers instead of one.
        relu_layer: If True, add the dense ReLU head before the output layer.
        multi_dropout: If True, also apply `dropout_rate` dropout between the
            stacked recurrent layers (only has an effect when `deep_rnn`).

    Returns:
        The `History` object returned by `model.fit`.
    """
    # `relu_units` is an explicit argument so the name never depends on a
    # same-named global left behind by another cell.
    spec = f"e_{embed_out}_g_{gru_units}_r_{relu_units}"
    model_name = f"{desc}_{spec}" if desc else spec

    model = tf.keras.Sequential()
    model.add(layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=embed_out,
        mask_zero=True))

    # Every recurrent layer returns the full sequence: a stacked GRU and the
    # pooling layer both need (batch, time, features) input, so the time axis
    # is only collapsed once, after the last recurrent layer.
    n_recurrent = 2 if deep_rnn else 1
    for depth in range(n_recurrent):
        model.add(layers.Bidirectional(
            layers.GRU(gru_units, return_sequences=True)))
        if multi_dropout and depth < n_recurrent - 1:
            model.add(layers.Dropout(dropout_rate))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dropout(dropout_rate))

    # optional dense ReLU head
    if relu_layer:
        model.add(layers.Dense(relu_units, activation='relu'))
        model.add(layers.Dropout(.2))
        model.add(layers.Dense(32, activation='relu'))
        model.add(layers.Dropout(.2))

    # one independent sigmoid per label (multi-label output)
    model.add(layers.Dense(
        units=len(labels),
        activation='sigmoid'))

    # specify compilation hyperparameters
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])

    # announce the run and start the wall-clock timer
    print("================================================================================")
    print(f"MODEL SPEC.: {model_name}")
    start_time = time.time()

    # fit the model to the training data, monitoring F1 on the dev data
    history = model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=dev_dataset,
        callbacks=_build_callbacks(model_name, checkpoint))

    time_elapsed = (time.time() - start_time) / 60
    print(f"Time Elapsed: {time_elapsed} min.")
    return history

### Layer width grid search

In [None]:
# define grid search parameters and loop
# The grids get their own names so that these module-level lists do not share
# a name with (and cannot be mistaken for) run_model's arguments.
embed_out_grid = [32, 64, 128]
gru_units_grid = [32, 64, 128, 256]
relu_units_grid = [32, 64, 128, 256]

for e in embed_out_grid:
    for g in gru_units_grid:
        for r in relu_units_grid:
            # Keyword arguments, like every other call in this notebook: passed
            # positionally, the ReLU width would land in the dropout_rate slot.
            run_model(embed_out=e, gru_units=g, relu_units=r, epochs=1)

### Testing training stability of best models

In [10]:
# Train each of the two best layer-width configurations four times, to see how
# much the results vary from run to run.
stability_widths = [(64, 32), (256, 128)]  # (gru_units, relu_units)

for gru_width, relu_width in stability_widths:
    for _ in range(4):
        run_model(
            embed_out=128,
            gru_units=gru_width,
            relu_units=relu_width,
            deep_rnn=False,
            relu_layer=True,
            epochs=1,
            desc='stability')

MODEL SPEC.: stability_e_128_g_64_r_32


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_64_r_32/assets


Time Elapsed: 1.48796306848526 min.
MODEL SPEC.: stability_e_128_g_64_r_32


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_64_r_32/assets


Time Elapsed: 2.4686445156733194 min.
MODEL SPEC.: stability_e_128_g_64_r_32


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_64_r_32/assets


Time Elapsed: 1.5821001331011455 min.
MODEL SPEC.: stability_e_128_g_64_r_32


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_64_r_32/assets


Time Elapsed: 2.4495981017748516 min.
MODEL SPEC.: stability_e_128_g_256_r_128


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_256_r_128/assets


Time Elapsed: 3.697490147749583 min.
MODEL SPEC.: stability_e_128_g_256_r_128


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_256_r_128/assets


Time Elapsed: 3.3310019612312316 min.
MODEL SPEC.: stability_e_128_g_256_r_128


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_256_r_128/assets


Time Elapsed: 3.1070512374242147 min.
MODEL SPEC.: stability_e_128_g_256_r_128


INFO:tensorflow:Assets written to: checkpoints/stability_e_128_g_256_r_128/assets


Time Elapsed: 3.1231038848559063 min.


### Test larger embedding output dimension on best models

In [14]:
# One entry per experiment: (banner, name prefix, gru_units, relu_layer).
# All of them use a 256-dimensional embedding.
big_embedding_runs = [
    ("+++++++WIDE and BIG EMBEDDING DIM+++++++", "wide", 256, True),
    ("+++++++NARROWER and BIG EMBEDDING DIM+++++++", "narrow", 128, True),
    ("+++++++NO RELU and BIG EMBEDDING DIM+++++++", "no_relu", 128, False),
]

for banner, prefix, gru_width, use_relu in big_embedding_runs:
    print(banner)
    for i in range(1):
        desc = f"{prefix}_{i}"
        run_model(
            embed_out=256,
            gru_units=gru_width,
            relu_units=128,
            deep_rnn=False,
            relu_layer=use_relu,
            epochs=1,
            checkpoint=False,
            desc=desc)
      

+++++++WIDE and BIG EMBEDDING DIM+++++++
MODEL SPEC.: wide_0_e_256_g_256_r_128
Time Elapsed: 3.9052470525105796 min.
MODEL SPEC.: wide_1_e_256_g_256_r_128
Time Elapsed: 4.458066550890605 min.
+++++++NARROWER and BIG EMBEDDING DIM+++++++
MODEL SPEC.: narrow_0_e_256_g_128_r_128
Time Elapsed: 2.4997939626375834 min.
MODEL SPEC.: narrow_1_e_256_g_128_r_128
Time Elapsed: 2.552736254533132 min.
+++++++NO RELU and BIG EMBEDDING DIM+++++++
MODEL SPEC.: no_relu_0_e_256_g_128_r_128
Time Elapsed: 2.5096387108167013 min.
MODEL SPEC.: no_relu_1_e_256_g_128_r_128
Time Elapsed: 2.804123079776764 min.


### Test Larger Embedding Dim Sizes

In [7]:
# Settings held fixed while the embedding dimension is varied.
fixed_settings = dict(
    gru_units=128,
    relu_units=128,
    deep_rnn=False,
    relu_layer=False,
    epochs=1,
    checkpoint=False)

print("+++++++LARGER EMBEDDING DIMS+++++++")
for embed_dim in (512, 1024):
    # five repeats per embedding size
    for _ in range(5):
        desc = "embed-dims-stability_"
        run_model(embed_out=embed_dim, desc=desc, **fixed_settings)

+++++++LARGER EMBEDDING DIMS+++++++
MODEL SPEC.: embed-dims-stability__e_512_g_128_r_128
Time Elapsed: 4.083845933278401 min.
MODEL SPEC.: embed-dims-stability__e_512_g_128_r_128
Time Elapsed: 4.055330149332682 min.
MODEL SPEC.: embed-dims-stability__e_512_g_128_r_128
Time Elapsed: 4.456600824991862 min.
MODEL SPEC.: embed-dims-stability__e_512_g_128_r_128
Time Elapsed: 4.452627996603648 min.
MODEL SPEC.: embed-dims-stability__e_512_g_128_r_128
Time Elapsed: 4.4465512990951535 min.
MODEL SPEC.: embed-dims-stability__e_1024_g_128_r_128
Time Elapsed: 6.954580640792846 min.
MODEL SPEC.: embed-dims-stability__e_1024_g_128_r_128
Time Elapsed: 7.166326224803925 min.
MODEL SPEC.: embed-dims-stability__e_1024_g_128_r_128
Time Elapsed: 7.18612863222758 min.
MODEL SPEC.: embed-dims-stability__e_1024_g_128_r_128
Time Elapsed: 7.19367485443751 min.
MODEL SPEC.: embed-dims-stability__e_1024_g_128_r_128
Time Elapsed: 7.217430011431376 min.


### Even Bigger Embeddings Layer

In [8]:
# Three repeats with a 2048-dimensional embedding.
repeat = 0
while repeat < 3:
    desc = "embed-dims-stability_"
    run_model(
        desc=desc,
        embed_out=2048,
        gru_units=128,
        relu_units=128,
        deep_rnn=False,
        relu_layer=False,
        checkpoint=False,
        epochs=1)
    repeat += 1

MODEL SPEC.: embed-dims-stability__e_2048_g_128_r_128
Time Elapsed: 14.460391795635223 min.
MODEL SPEC.: embed-dims-stability__e_2048_g_128_r_128
Time Elapsed: 14.465113210678101 min.
MODEL SPEC.: embed-dims-stability__e_2048_g_128_r_128
 16/788 [..............................] - ETA: 14:04 - loss: 0.3249 - f1_score: 0.0606

KeyboardInterrupt: 

### Going for it!

In [9]:
# Three full-length (10 epoch) runs of the deep recurrent variant, with
# checkpointing switched on.
deep_settings = {
    "embed_out": 512,
    "gru_units": 128,
    "relu_units": 128,
    "deep_rnn": True,
    "relu_layer": False,
    "epochs": 10,
    "checkpoint": True,
}

for _ in range(3):
    desc = "deep-stability_"
    run_model(desc=desc, **deep_settings)

MODEL SPEC.: deep-stability__e_1024_g_128_r_128
Epoch 1/10


INFO:tensorflow:Assets written to: checkpoints/deep-stability__e_1024_g_128_r_128/assets


Epoch 2/10
Epoch 3/10
Epoch 4/10
115/788 [===>..........................] - ETA: 6:59 - loss: 0.0219 - f1_score: 0.9486

KeyboardInterrupt: 

### Dropout Rate

In [14]:
# Sweep the dropout rate with every other setting held fixed.
dropout_rates = [.5, .6, .7]

for rate in dropout_rates:
    desc = "dropout_{}_multi_True".format(rate)
    run_model(
        embed_out=128,
        gru_units=64,
        relu_units=128,
        deep_rnn=True,
        dropout_rate=rate,
        multi_dropout=True,
        relu_layer=False,
        epochs=10,
        checkpoint=True,
        desc=desc)

MODEL SPEC.: dropout_0.5_multi_True_e_128_g_64_r_128
Epoch 1/10


INFO:tensorflow:Assets written to: checkpoints/dropout_0.5_multi_True_e_128_g_64_r_128/assets


Epoch 2/10


INFO:tensorflow:Assets written to: checkpoints/dropout_0.5_multi_True_e_128_g_64_r_128/assets


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

### Predict

In [None]:
# Load the checkpointed model, and the dev CSV as the frame to be filled in.
model_dir = "e_128_g_256_r_32"
model_path = "checkpoints/" + model_dir
model = tf.keras.models.load_model(model_path)
df = pd.read_csv(dev_path)

# Predict on the tf version of the validation data, then turn each sigmoid
# output into a hard 0/1 decision at the 0.5 threshold.
probabilities = model.predict(dev_dataset)
predictions = numpy.where(probabilities > 0.5, 1, 0)

# Overwrite every column after the first with the predicted labels.
df.iloc[:, 1:] = predictions
print(df.head())

# Write the data frame as submission.csv inside submission.zip.
zip_options = {"method": "zip", "archive_name": "submission.csv"}
df.to_csv("submission.zip", index=False, compression=zip_options)

