In [11]:
import time
import datasets
import pandas as pd
import transformers
import tensorflow as tf
from keras import layers
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import numpy

# CSV files that are loaded further down as the "train" and "validation"
# splits of the Huggingface dataset
train_path="data/train.csv"
dev_path="data/test-in.csv"

### Pre-processing

In [12]:
# use the tokenizer from DistilRoBERTa
# NOTE(review): `pad_token_id=0` is passed as an extra keyword here, and the
# Embedding layer in `train_model` is built with `mask_zero=True`. Confirm
# (e.g. by inspecting `tokenizer.pad_token_id` and a padded example) that the
# padding positions really carry id 0; otherwise padding will not be masked
# and a real token with id 0 will be.
tokenizer = transformers.AutoTokenizer.from_pretrained("distilroberta-base", pad_token_id=0)

def tokenize(examples):
    """Run the module-level tokenizer over the "text" field of `examples`.

    Every text is truncated and padded to exactly 64 positions, so all
    examples end up with the same sequence length. The tokenizer's output
    (which includes "input_ids") is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=64,
        padding="max_length",
    )
    return encoded

# load the CSVs into Huggingface datasets to allow use of the tokenizer
# (train_path becomes the "train" split, dev_path the "validation" split)
hf_dataset = datasets.load_dataset("csv", data_files={
    "train": train_path, "validation": dev_path})

# the labels are the names of all columns except the first
# (presumably the first column is the "text" column read by `tokenize` —
# confirm against the CSV header)
labels = hf_dataset["train"].column_names[1:]

def gather_labels(example):
    """Collect the per-label columns of one example into a single list.

    Returns a dict with one key, "labels", whose value holds the example's
    entry for every name in the module-level `labels`, in that order.
    """
    # values are cast to float because converting hf to tf data requires a
    # list or array of labels
    label_values = []
    for column in labels:
        label_values.append(float(example[column]))
    return {"labels": label_values}

# convert text and labels to format expected by model
# gather_labels is applied one example at a time and adds a "labels" column;
# tokenize is applied to batches and adds the tokenizer's output columns
# (including "input_ids", which a later cell renames)
hf_dataset = hf_dataset.map(gather_labels)
hf_dataset = hf_dataset.map(tokenize, batched=True)

Downloading and preparing dataset csv/default to /home/reed/.cache/huggingface/datasets/csv/default-54a5f621ffa2e0d9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):


Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/reed/.cache/huggingface/datasets/csv/default-54a5f621ffa2e0d9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  if _pandas_api.is_sparse(col):


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3109 [00:00<?, ? examples/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3109 [00:00<?, ? examples/s]

### Convert to TF format

In [13]:
# split train and val into their own objects
hf_train = hf_dataset['train']
hf_dev = hf_dataset['validation']

# rename column for embeddings layer
hf_train = hf_train.rename_column("input_ids", "embedding_inputs")
hf_dev = hf_dev.rename_column("input_ids", "embedding_inputs")

# convert Huggingface datasets to Tensorflow datasets
# (token ids are the model input, the gathered "labels" list is the target)
train_dataset = hf_train.to_tf_dataset(
    columns="embedding_inputs",
    label_cols="labels",
    batch_size=32,
    shuffle=True)
# no `shuffle` argument is passed for the dev split; the prediction cell
# writes predictions back to the CSV rows by position, so it relies on the
# row order being preserved here
dev_dataset = hf_dev.to_tf_dataset(
    columns="embedding_inputs",
    label_cols="labels",
    batch_size=32)

### Model

In [5]:
def train_model(embed_out, gru_units, dropout_rate, epochs, iter, desc):
    """Build, compile and fit an Embedding -> BiGRU -> Dropout -> Dense model.

    The model is fitted on the notebook-level `train_dataset` and validated
    on `dev_dataset`. The checkpoint with the best validation F1 is written
    to `checkpoints/<model_name>` and TensorBoard logs go to
    `logs/<model_name>`, where `<model_name>` is
    `{desc}_e_{embed_out}_g_{gru_units}`.

    Args:
        embed_out: output dimension of the embedding layer.
        gru_units: number of units of the GRU wrapped by the bidirectional layer.
        dropout_rate: dropout rate applied to the recurrent layer's output.
        epochs: maximum number of training epochs.
        iter: run index supplied by the caller. It is not used inside the
            function (and shadows the builtin of the same name); it is kept
            only so existing calls keep working.
        desc: free-form prefix for the model name.

    Returns:
        None. Results are available through the saved checkpoint and logs.
    """
    model_name = f"{desc}_e_{embed_out}_g_{gru_units}"

    # embedding -> bidirectional GRU -> dropout -> one sigmoid unit per label
    model = tf.keras.Sequential()
    model.add(layers.Embedding(
        input_dim=tokenizer.vocab_size,
        output_dim=embed_out,
        mask_zero=True))
    model.add(layers.Bidirectional(layers.GRU(gru_units)))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(
        units=len(labels),
        activation='sigmoid'))

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line)
    model.summary()

    # specify compilation hyperparameters
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.binary_crossentropy,
        metrics=[tf.keras.metrics.F1Score(average="micro", threshold=0.5)])

    # announce the run and start the wall-clock timer
    print("================================================================================")
    print(f"MODEL SPEC.: {model_name}")
    start_time = time.time()

    # set callbacks
    callbacks = [
        ModelCheckpoint(
            filepath=f"checkpoints/{model_name}",
            monitor="val_f1_score",
            mode="max",
            save_best_only=True),
        TensorBoard(
            log_dir=f"logs/{model_name}"),
        # F1 is a higher-is-better metric: without an explicit mode the
        # callback's automatic direction detection does not recognise the
        # name and would treat a *decrease* in F1 as an improvement
        EarlyStopping(
            monitor="val_f1_score",
            mode="max",
            min_delta=0.001,
            patience=3,
            start_from_epoch=10)]

    # fit the model to the training data, monitoring F1 on the dev data
    model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=dev_dataset,
        callbacks=callbacks)

    time_elapsed = (time.time() - start_time) / 60
    print(f"Time Elapsed: {time_elapsed} min.")

### Train

In [8]:
# train three runs with identical hyperparameters; only the run index in the
# model name differs, so each run gets its own checkpoint and log directory
# NOTE(review): the saved output below reports "e_1024_g_256", which does not
# match the 512 / 128 passed here — the output appears to come from an earlier
# run and should be refreshed by re-executing this cell
for i in range(3):
    train_model(
        embed_out=512,
        gru_units=128,
        dropout_rate=0.65,
        epochs=1,
        iter=i,
        desc=f"dropout_.65_iter_{i}"
    )

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 1024)        51471360  
                                                                 
 bidirectional_1 (Bidirecti  (None, 512)               1969152   
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense (Dense)               (None, 7)                 3591      
                                                                 
Total params: 53444103 (203.87 MB)
Trainable params: 53444103 (203.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
MODEL SPEC.: dropout_.65_iter_0_e_1024_g_256


INFO:tensorflow:Assets written to: checkpoints/dropout_.65_iter_0_e_1024_g_256/assets


Time Elapsed: 10.555853005250295 min.
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 1024)        51471360  
                                                                 
 bidirectional_2 (Bidirecti  (None, 512)               1969152   
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 7)                 3591      
                                                                 
Total params: 53444103 (203.87 MB)
Trainable params: 53444103 (203.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
MODEL SPEC.: dropout_.65_iter_1_e

INFO:tensorflow:Assets written to: checkpoints/dropout_.65_iter_1_e_1024_g_256/assets


Time Elapsed: 10.610922352472942 min.
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 1024)        51471360  
                                                                 
 bidirectional_3 (Bidirecti  (None, 512)               1969152   
 onal)                                                           
                                                                 
 dropout_3 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 7)                 3591      
                                                                 
Total params: 53444103 (203.87 MB)
Trainable params: 53444103 (203.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
MODEL SPEC.: dropout_.65_iter_2_e

INFO:tensorflow:Assets written to: checkpoints/dropout_.65_iter_2_e_1024_g_256/assets


Time Elapsed: 11.453673342863718 min.


### Predict

In [14]:
# load the saved model
# NOTE(review): this directory name does not match anything the training cell
# writes (its names start with "dropout_.65_iter_" for run indices 0-2) —
# confirm the checkpoint exists on disk or point this at a freshly trained run
model_dir = "dropout_0.65_iter_4_e_512_g_128"
model_path = f"checkpoints/{model_dir}"
model = tf.keras.models.load_model(model_path)

# read the same CSV that `dev_dataset` was built from (via `dev_path`) so the
# rows of the frame line up with the rows of the predictions
df = pd.read_csv(dev_path)

# generate predictions from model (on the tf version of validation data),
# turning each sigmoid output into 0/1 at a 0.5 threshold
predictions = numpy.where(model.predict(dev_dataset) > 0.5, 1, 0)

# assign predictions to label columns in Pandas data frame
# (every column after the first, matched by position)
df.iloc[:, 1:] = predictions
print(df.head())

# write the Pandas dataframe to a zipped CSV file
df.to_csv("submission.zip", index=False, compression=dict(
    method='zip', archive_name='submission.csv'))

2023-12-05 08:34:42.104983: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-05 08:34:42.780783: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-05 08:34:43.175153: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond' has 4 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-05 08:34:43.387407: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2023-12-05 08:34:43.507464: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'cond/while' has 13 outputs but the _ou

                                                text  admiration  amusement  \
0    It's wonderful because it's awful. At not with.           1          0   
1  I didn't know that, thank you for teaching me ...           0          0   
2  They got bored from haunting earth for thousan...           0          0   
3  Thank you for asking questions and recognizing...           0          0   
4                                     You’re welcome           0          0   

   gratitude  love  pride  relief  remorse  
0          0     0      0       0        0  
1          1     0      0       0        0  
2          0     0      0       0        0  
3          1     0      0       0        0  
4          1     0      0       0        0  
