In [1]:
import time
import datasets
import pandas as pd
import transformers
import tensorflow as tf
from keras import layers
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import numpy

# locations of the training and development CSV files
train_path = "data/train.csv"
dev_path = "data/dev.csv"

2023-12-04 21:58:51.488938: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-04 21:58:51.519677: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-04 21:58:51.709010: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 21:58:51.709050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 21:58:51.710195: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

### Pre-processing

In [2]:
# DistilRoBERTa's pretrained tokenizer supplies the sub-word vocabulary.
# NOTE(review): RoBERTa vocabularies normally map "<pad>" to id 1 and "<s>"
# to id 0 -- confirm that `pad_token_id=0` really changes the id used for
# padding, because the Embedding layer's mask_zero=True treats id 0 as padding.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "distilroberta-base",
    pad_token_id=0,
)

def tokenize(examples):
    """Tokenizes the "text" field of a batch of examples.

    Each text is truncated or padded to exactly 64 tokens. The tokenizer's
    output (which includes "input_ids", the integer id of every token) is
    returned as-is.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        max_length=64,
        truncation=True,
    )
    return encoded

# read both CSV splits into Huggingface datasets so the tokenizer can be
# mapped over them
hf_dataset = datasets.load_dataset(
    "csv",
    data_files={"train": train_path, "validation": dev_path},
)

# every column after the first one is treated as a label
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Collects one example's label columns into a single "labels" list of floats."""
    # values are cast to float because converting hf to tf data requires a
    # list or array of labels
    values = []
    for name in labels:
        values.append(float(example[name]))
    return {"labels": values}

# add the "labels" column, then tokenize the text in batches
hf_dataset = hf_dataset.map(gather_labels).map(tokenize, batched=True)

Found cached dataset csv (/home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

### Convert to TF format

In [3]:
# pull out each split, renaming "input_ids" to the name used for the
# embedding layer's input
hf_train = hf_dataset["train"].rename_column("input_ids", "embedding_inputs")
hf_dev = hf_dataset["validation"].rename_column("input_ids", "embedding_inputs")

# build batched Tensorflow datasets; shuffling is requested for training only
train_dataset = hf_train.to_tf_dataset(
    batch_size=32,
    shuffle=True,
    columns="embedding_inputs",
    label_cols="labels",
)
dev_dataset = hf_dev.to_tf_dataset(
    batch_size=32,
    columns="embedding_inputs",
    label_cols="labels",
)

### Model

In [30]:
def train_model(embed_out, gru_units, dropout_rate, epochs, checkpoint, desc):
    """Builds, compiles and fits the embedding + stacked bidirectional-GRU model.

    Uses the notebook-level ``tokenizer``, ``labels``, ``train_dataset`` and
    ``dev_dataset``. Prints the model summary, the model name and the elapsed
    wall-clock time in minutes.

    Args:
        embed_out: output dimension of the embedding layer.
        gru_units: number of units in the first bidirectional GRU layer.
        dropout_rate: rate used by both dropout layers.
        epochs: maximum number of training epochs.
        checkpoint: if truthy, the model with the best validation F1 is saved
            to ``checkpoints/<model_name>``; otherwise nothing is saved.
        desc: free-text prefix used in the model name.

    Returns:
        None.
    """
    model_name = f"{desc}_e_{embed_out}_g_{gru_units}"

    # embedding -> BiGRU -> max-pool -> BiGRU -> global max-pool -> dense head
    model = tf.keras.Sequential()
    model.add(layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=embed_out,
        mask_zero=True))
    model.add(layers.Bidirectional(layers.GRU(gru_units, return_sequences=True)))
    model.add(layers.MaxPooling1D(pool_size=7))
    model.add(layers.Bidirectional(layers.GRU(32, return_sequences=True)))
    model.add(layers.GlobalMaxPool1D())
    # NOTE(review): the two hidden Dense layers have no activation argument,
    # so they are linear -- confirm this is intended.
    model.add(layers.Dense(64))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(32))
    model.add(layers.Dropout(dropout_rate))
    # one sigmoid output per label (multi-label classification)
    model.add(layers.Dense(
        units=len(labels),
        activation='sigmoid'))

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print()
    model.summary()

    # specify compilation hyperparameters
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])

    # announce the run and start the timer
    print("================================================================================")
    print(f"MODEL SPEC.: {model_name}")
    start_time = time.time()

    # set callbacks
    callbacks = [
        TensorBoard(
            log_dir=f"logs/{model_name}"),
        # mode="max" is required: in "auto" mode Keras only infers "max" for
        # metric names ending in acc/accuracy/auc, so F1 would be minimised
        EarlyStopping(
            monitor="val_f1_score",
            mode="max",
            min_delta=0.001,
            patience=3,
            start_from_epoch=10)]
    if checkpoint:
        # keep only the model with the best validation F1 seen so far
        callbacks.insert(0, ModelCheckpoint(
            filepath=f"checkpoints/{model_name}",
            monitor="val_f1_score",
            mode="max",
            save_best_only=True))

    # fit the model to the training data, monitoring F1 on the dev data
    model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=dev_dataset,
        callbacks=callbacks)

    time_elapsed = (time.time() - start_time) / 60
    print(f"Time Elapsed: {time_elapsed} min.")

### Train

In [31]:
# train the 128-dim embedding / 128-unit GRU configuration for up to 20 epochs
train_model(
    embed_out=128,
    gru_units=128,
    dropout_rate=0.7,
    epochs=20,
    checkpoint=True,
    desc="new-arch_2",
)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 128)         6433920   
                                                                 
 bidirectional_17 (Bidirect  (None, None, 256)         198144    
 ional)                                                          
                                                                 
 max_pooling1d_6 (MaxPoolin  (None, None, 256)         0         
 g1D)                                                            
                                                                 
 bidirectional_18 (Bidirect  (None, None, 64)          55680     
 ional)                                                          
                                                                 
 global_max_pooling1d_7 (Gl  (None, 64)                0         
 obalMaxPooling1D)                                   

INFO:tensorflow:Assets written to: checkpoints/new-arch_2_e_128_g_128/assets


Epoch 2/20


INFO:tensorflow:Assets written to: checkpoints/new-arch_2_e_128_g_128/assets


Epoch 3/20
Epoch 4/20

KeyboardInterrupt: 

### Predict

In [None]:
# the dev CSV provides the frame whose label columns will be overwritten
df = pd.read_csv(dev_path)

# load the saved model
# NOTE(review): this directory name does not follow the
# "{desc}_e_{embed_out}_g_{gru_units}" pattern used when checkpoints are
# written -- confirm it points at the intended checkpoint.
model_dir = "e_128_g_256_r_32"
model_path = f"checkpoints/{model_dir}"
model = tf.keras.models.load_model(model_path)

# generate predictions from model (on the tf version of validation data),
# thresholding each output at 0.5 to get 0/1 labels
probabilities = model.predict(dev_dataset)
predictions = numpy.where(probabilities > 0.5, 1, 0)

# assign predictions to label columns (all but the first) in the data frame
df.iloc[:, 1:] = predictions
print(df.head())

# write the Pandas dataframe to a zipped CSV file
df.to_csv(
    "submission.zip",
    index=False,
    compression=dict(method='zip', archive_name='submission.csv'),
)

