In [12]:
import time
import datasets
import pandas as pd
import transformers
import tensorflow as tf
from keras import layers
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import numpy

train_path="data/train.csv"
dev_path="data/dev.csv"

### Pre-processing

In [3]:
# use the tokenizer from DistilRoBERTa
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base")

def tokenize(examples):
    """Converts the text of each example to "input_ids", a sequence of integers
    representing 1-hot vectors for each token in the text"""
    return tokenizer(examples["text"], truncation=True, max_length=64,
                     padding="max_length")

# load the CSVs into Huggingface datasets to allow use of the tokenizer
hf_dataset = datasets.load_dataset("csv", data_files={
    "train": train_path, "validation": dev_path})

# the labels are the names of all columns except the first
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Converts the label columns into a list of 0s and 1s"""
    # the float here is because converting hf to tf data requires a list or array of labels
    return {"labels": [float(example[l]) for l in labels]}

# convert text and labels to format expected by model
hf_dataset = hf_dataset.map(gather_labels)
hf_dataset = hf_dataset.map(tokenize, batched=True)

Found cached dataset csv (/home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-f4683b0b6acb8908.arrow
Loading cached processed dataset at /home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-30e6e2a577146a8a.arrow
Loading cached processed dataset at /home/reed/.cache/huggingface/datasets/csv/default-0989c6c4d599a70a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-2a31658dd9f4f20f.arrow


Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

### Convert to TF format

In [4]:
# split train and val into their own objects
hf_train = hf_dataset['train']
hf_dev = hf_dataset['validation']

# rename column for embeddings layer
hf_train = hf_train.rename_column("input_ids", "embedding_inputs")
hf_dev = hf_dev.rename_column("input_ids", "embedding_inputs")

### Calculate Sample Weights

In [None]:
def sample_weights(data):
    summed_labels = data.apply(sum, axis=0)
    total_n = summed_labels.sum()
    proportions = summed_labels / total_n
    sorted_proportions = proportions.sort_values(ascending=False)
    return sorted_proportions

### Model

In [7]:
# define grid search parameters and loop

batch_size = [1]

for b in batch_size:
        
    # convert Huggingface datasets to Tensorflow datasets
    train_dataset = hf_train.to_tf_dataset(
        columns="embedding_inputs",
        label_cols="labels",
        batch_size=32,
        shuffle=True)
    dev_dataset = hf_dev.to_tf_dataset(
        columns="embedding_inputs",
        label_cols="labels",
        batch_size=32)

    # define a model with a single fully connected layer
    model = tf.keras.Sequential()
    model.add(layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=16,
        mask_zero=True))
    model.add(layers.Bidirectional(layers.GRU(64)))
    # final processing with a dense RELU layer
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(
        units=len(labels),
        activation='sigmoid'))

    # specify compilation hyperparameters
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])

    # set time for run time and model naming
    start_time = time.time()
    desc = f"batch-size_{b}"
    model_name = f"{desc}_time_{start_time}"

    # fit the model to the training data, monitoring F1 on the dev data
    print(model_name)
    model.fit(
        train_dataset,
        epochs=20,
        validation_data=dev_dataset,
        callbacks=[
            ModelCheckpoint(
                filepath=f"checkpoints/{model_name}",
                monitor="val_f1_score",
                mode="max",
                save_best_only=True),
            TensorBoard(
                log_dir=f"logs/{model_name}"),
            EarlyStopping(
                monitor="val_f1_score",
                min_delta=0.25,
                patience=5,
                start_from_epoch=2)
            ])

    time_elapsed = (time.time() - start_time) / 60
    print(f"Time Elapsed: {time_elapsed} min.")

batch-size_1_time_1701706064.1189744
Epoch 1/20
 83/788 [==>...........................] - ETA: 41s - loss: 0.3316 - f1_score: 0.0218

KeyboardInterrupt: 

### Predict

In [14]:
# load the saved model
model_path="checkpoints/batch-size_32_time_1701555125.2780304"
model = tf.keras.models.load_model(model_path)
df = pd.read_csv(dev_path)

# generate predictions from model (on the tf version of validation data)
predictions = numpy.where(model.predict(dev_dataset) > 0.5, 1, 0)

# assign predictions to label columns in Pandas data frame
df.iloc[:, 1:] = predictions
print(df.head())

# write the Pandas dataframe to a zipped CSV file
df.to_csv("submission.zip", index=False, compression=dict(
    method='zip', archive_name=f'submission.csv'))



2023-12-04 09:12:03.243996: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-04 09:12:03.562684: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-04 09:12:03.663934: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-04 09:12:04.261578: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-04 09:12:04.739521: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but 

                                                text  admiration  amusement  \
0  Is this in New Orleans?? I really feel like th...           0          0   
1  You know the answer man, you are programmed to...           0          0   
2  The economy is heavily controlled and subsidiz...           0          0   
3  Thank you for your vote of confidence, but we ...           0          0   
4                                       There it is!           0          0   

   gratitude  love  pride  relief  remorse  
0          0     0      0       0        0  
1          0     0      0       0        0  
2          0     0      0       0        0  
3          1     0      0       0        0  
4          0     0      0       0        0  
