In [1]:
import time
import datasets
import pandas as pd
import numpy as np
import transformers
import tensorflow as tf
import tensorflow_datasets as tfds
from keras import layers
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import numpy

train_path="data/train.csv"
dev_path="data/dev.csv"

2023-12-04 12:51:24.299651: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-04 12:51:24.302542: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-04 12:51:24.335800: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 12:51:24.335834: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 12:51:24.335867: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

### Pre-processing

In [2]:
# use the tokenizer from DistilRoBERTa
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base", pad_token_id=0)

def tokenize(examples):
    """Converts the text of each example to "input_ids", a sequence of integers
    representing 1-hot vectors for each token in the text"""
    return tokenizer(examples["text"], truncation=True, max_length=64,
                     padding="max_length")

# load the CSVs into Huggingface datasets to allow use of the tokenizer
hf_dataset = datasets.load_dataset("csv", data_files={
    "train": train_path, "validation": dev_path})

# the labels are the names of all columns except the first
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Converts the label columns into a list of 0s and 1s"""
    # the float here is because converting hf to tf data requires a list or array of labels
    return {"labels": [float(example[l]) for l in labels]}

# convert text and labels to format expected by model
hf_dataset = hf_dataset.map(gather_labels)
hf_dataset = hf_dataset.map(tokenize, batched=True)

Found cached dataset csv (/home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

### Convert to TF format

In [3]:
# split train and val into their own objects
hf_train = hf_dataset['train']
hf_dev = hf_dataset['validation']

# rename column for embeddings layer
hf_train = hf_train.rename_column("input_ids", "embedding_inputs")
hf_dev = hf_dev.rename_column("input_ids", "embedding_inputs")

# convert Huggingface datasets to Tensorflow datasets
train_dataset = hf_train.to_tf_dataset(
    columns="embedding_inputs",
    label_cols="labels",
    batch_size=32,
    shuffle=True)
dev_dataset = hf_dev.to_tf_dataset(
    columns="embedding_inputs",
    label_cols="labels",
    batch_size=32)

print(train_dataset)

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), TensorSpec(shape=(None, 7), dtype=tf.float32, name=None))>


### Sample Weights

In [10]:
# extract labels as a numpy array
train_unbatched = train_dataset.unbatch()
labels_tmp = np.asarray(list(train_unbatched.map(lambda x, y: y)))
print(labels_tmp)
# convert to pandas
labels_df = pd.DataFrame(labels_tmp)
# convert labels to a string of ids
labels_df['ids'] = pd.DataFrame(labels_df.astype(int).astype(str).agg(''.join, axis=1))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
# get the number of times each id appears
weights = labels_df.groupby('ids').size().reset_index(name='n')
# calculate weights
weights['weight'] = (1 / weights['n'])
# convert to dictionary
weights = weights.set_index('ids')['weight'].to_dict()
print(weights)    

{'0000000': 7.142346975216056e-05, '0000001': 0.0019646365422396855, '0000010': 0.007407407407407408, '0000100': 0.012658227848101266, '0001000': 0.0005580357142857143, '0001001': 0.3333333333333333, '0001010': 0.5, '0010000': 0.000449034575662326, '0010001': 0.07142857142857142, '0010010': 0.1111111111111111, '0010100': 0.125, '0011000': 0.023255813953488372, '0100000': 0.00047664442326024784, '0100001': 0.1111111111111111, '0100010': 0.5, '0100100': 1.0, '0101000': 0.0196078431372549, '0101001': 1.0, '0110000': 0.012658227848101266, '0110010': 1.0, '0111000': 0.5, '1000000': 0.0002817695125387433, '1000001': 0.125, '1000010': 0.25, '1000100': 0.043478260869565216, '1001000': 0.00546448087431694, '1010000': 0.003703703703703704, '1010001': 1.0, '1011000': 0.125, '1100000': 0.012048192771084338, '1101000': 1.0}


### Model

In [6]:
# define grid search parameters and loop

# define a model with a single fully connected layer
model = tf.keras.Sequential()
model.add(layers.Embedding(
    input_dim=tokenizer.vocab_size,
    output_dim=16,
    mask_zero=True))
model.add(layers.Bidirectional(layers.GRU(64)))
# final processing with a dense RELU layer
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(
    units=len(labels),
    activation='sigmoid'))

# specify compilation hyperparameters
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])

# set time for run time and model naming
start_time = time.time()
desc = "weights-and-masks"
model_name = f"{desc}_time_{start_time}"

# fit the model to the training data, monitoring F1 on the dev data
print(model_name)
model.fit(
    train_dataset,
    epochs=50,
    validation_data=dev_dataset,
    class_weight=weights_array,
    callbacks=[
        ModelCheckpoint(
            filepath=f"checkpoints/{model_name}",
            monitor="val_f1_score",
            mode="max",
            save_best_only=True),
        TensorBoard(
            log_dir=f"logs/{model_name}"),
        EarlyStopping(
            monitor="val_f1_score",
            min_delta=0.25,
            patience=5,
            start_from_epoch=2)
        ])

time_elapsed = (time.time() - start_time) / 60
print(f"Time Elapsed: {time_elapsed} min.")

weights-and-masks_time_1701719529.3200686


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### Predict

In [None]:
# load the saved model
model_path="checkpoints/batch-size_32_time_1701555125.2780304"
model = tf.keras.models.load_model(model_path)
df = pd.read_csv(dev_path)

# generate predictions from model (on the tf version of validation data)
predictions = numpy.where(model.predict(dev_dataset) > 0.5, 1, 0)

# assign predictions to label columns in Pandas data frame
df.iloc[:, 1:] = predictions
print(df.head())

# write the Pandas dataframe to a zipped CSV file
df.to_csv("submission.zip", index=False, compression=dict(
    method='zip', archive_name=f'submission.csv'))

