<a href="https://colab.research.google.com/github/WillN202/NLU_CW/blob/main/NLU_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLU Task 2


For this task I've implemented a GRU model using TODO word embeddings. TODO WRITE ME

## Imports and Setup

In [1]:
!pip install torchmetrics
!pip install ray[tune]
from google.colab import drive
drive.mount('/content/drive')

Collecting torchmetrics
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/841.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/841.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m839.7/841.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_runtime_

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import os.path
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
import string
import re
TRAINING_DATASET_LOCATION = "/content/drive/MyDrive/train.csv"
DEV_DATASET_LOCATION = "/content/drive/MyDrive/dev.csv"
TEST_DATASET_LOCATION = "/content/drive/MyDrive/AV_trial.csv"
WORD2VEC_EMBEDDINGS = "/content/drive/MyDrive/word2vec_embeddings.model"
EPOCHS = 4

DEVICE = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
# print(DEVICE)
# torch.set_default_device(DEVICE)

## Data Loading and Pre Processing

In [3]:
def generic_preprocessor(sentence):
  sentence = sentence.lower()

  return sentence

class AVDataset(Dataset):
  def __init__(self, csv_file, pre_processor=None):
    self.samples = pd.read_csv(csv_file)
    self.pre_processor = pre_processor

  def __len__(self):
      return len(self.samples)

  def __getitem__(self, index):
    sample = self.samples.iloc[index]
    # sample[0] = re.sub(f"[{re.escape(string.punctuation)}]", "", sample[0])
    # sample[0] = re.sub(f"[{re.escape(string.punctuation)}]", "", sample[1])
    sample_text = f"{sample[0]} <sep> {sample[1]}"
    return (self.pre_processor(sample_text), sample[2])


In [8]:
training_samples = AVDataset(TRAINING_DATASET_LOCATION, pre_processor=generic_preprocessor)
dev_samples = AVDataset(DEV_DATASET_LOCATION, pre_processor=generic_preprocessor)
test_samples = AVDataset(TEST_DATASET_LOCATION, pre_processor=generic_preprocessor)

## Word Embedding Generation

In [4]:
phrases = pd.read_csv(TRAINING_DATASET_LOCATION)
phrases = phrases.loc[:, "text_1":"text_2"].to_numpy().flatten().tolist()
#phrases = [re.sub(f"[{re.escape(string.punctuation)}]", "", str(phrase)).split() for phrase in phrases]
phrases = [str(phrase).split() for phrase in phrases]

def generate_word2vec_embeddings(size):
  embeddings = Word2Vec(sentences=phrases, workers=300, min_count=1, vector_size=size)
  embeddings.wv["<UNK>"] = np.random.rand(size)
  embeddings.wv["<sep>"] = np.random.rand(size)
  embeddings.wv["<pad>"] = np.random.rand(size)

  return embeddings


embeddings_512 = generate_word2vec_embeddings(512)
embeddings_256 = generate_word2vec_embeddings(256)
embeddings_128 = generate_word2vec_embeddings(128)
embeddings_64 = generate_word2vec_embeddings(64)

## Model Creation

To reduce code content, the GRU models are defined from a base class. New linear layers are added between each class of model since this could not be done using parameters alone

In [5]:
class BaseGruRNN(torch.nn.Module):
  def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
    super(BaseGruRNN, self).__init__()
    self.vocab = vocab
    self.get_embedding = torch.nn.Embedding.from_pretrained(embeddings)
    self.GRU_Layer = torch.nn.GRU(embedding_size, hidden_size, batch_first=True, num_layers=rnn_layers, dropout=0.1, bidirectional=is_bidirectional)

  def forward(self, x, linear_layer):
    unk_embedding = self.vocab["<UNK>"]
    #x = [re.sub(f"[{re.escape(string.punctuation)}]", "", sentence) for sentence in x]
    x = [sentence.split() for sentence in x]
    x = [[self.vocab.get(word, unk_embedding) for word in sentence ] for sentence in x]
    # TODO -> instead of padding, use pack sequence instead. Note this may break the output from the lstm (woo)
    max_len = max([len(words) for words in x])
    x = [([self.vocab["<pad>"]] * (max_len -  len(words))) + words for words in x]
    input = torch.tensor(x)

    embeddings = self.get_embedding(input)
    GRU_int_results = self.GRU_Layer(embeddings)[1]
    GRU_values = GRU_int_results[0] #Get last hidden state(s)
    result = linear_layer(GRU_values)
    return result

class OneLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      self.base = super(OneLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, output_size),
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class TwoLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class ThreeLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(ThreeLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class FourLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(FourLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )


    def forward(self, x):
      return super().forward(x, self.linear_layer)


## Model Training

In [6]:
def train_and_validate_loop(model, batch_size, learning_rate, epochs, tuning=False):
  device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)
  torch.set_default_device(device)
  loss_function = torch.nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  training_loader = DataLoader(training_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  dev_loader = DataLoader(dev_samples, batch_size=batch_size, generator=torch.Generator(device=device))

  torch.set_grad_enabled(True)

  if not tuning:
    print(f"Epochs: {epochs}")

  for epoch in range(0, epochs):
      model.train()
      running_loss = 0.0

      # Training
      for index, value in enumerate(training_loader):
          optimizer.zero_grad()
          data, labels = value
          labels = labels.reshape(-1,1)
          labels = labels.type(torch.FloatTensor)
          labels = labels.to(device)
          outputs = model(data)

          loss = loss_function(outputs, labels)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()

      # Validation to ensure the model is learning
      model.eval()
      torch.set_grad_enabled(False)
      running_loss = 0
      num_correct = 0
      for index, value in enumerate(dev_loader):
          data, labels = value
          labels = labels.reshape(-1,1)
          labels = labels.type(torch.FloatTensor)
          labels = labels.to(device)

          outputs = model(data)
          loss = loss_function(outputs, labels)
          running_loss += loss

          for answer, standard in zip(outputs, labels):
              normalised_answer = torch.round(torch.sigmoid(answer[0]))
              num_correct += 1 if normalised_answer == standard[0] else 0

      torch.set_grad_enabled(True)
      batch_loss = running_loss / len(dev_loader)
      accuracy = num_correct / len(dev_loader.dataset)

      if tuning:
         train.report({"loss": int(batch_loss)})
      else:
        print(f"---------------------EPOCH {epoch+1} / {epochs}---------------------")
        print(f"Batch Loss {batch_loss}")
        print(f"Accuracy {accuracy}")


In [None]:
coded_embeddings = torch.FloatTensor(embeddings_256.wv.vectors).to(DEVICE)
vocab = embeddings_256.wv.key_to_index
model = TwoLinearLayerGruRNN(256, 1, 256, coded_embeddings, vocab).to(DEVICE)
train_and_validate_loop(model, 64, 0.00001, EPOCHS)



Epochs: 4
---------------------EPOCH 1 / 4---------------------
Batch Loss 0.6928825378417969
Accuracy 0.5178333333333334
---------------------EPOCH 2 / 4---------------------
Batch Loss 0.6927028298377991
Accuracy 0.5178333333333334
---------------------EPOCH 3 / 4---------------------
Batch Loss 0.6925799250602722
Accuracy 0.5151666666666667
---------------------EPOCH 4 / 4---------------------
Batch Loss 0.6924920082092285
Accuracy 0.5163333333333333


## Model Testing

In [None]:
model.eval()
torch.set_grad_enabled(False)
num_correct = 0

loader = DataLoader(test_samples, batch_size=64, shuffle=True, generator=torch.Generator(device=DEVICE))

for index, value in enumerate(loader):
    data, labels = value
    labels = labels.reshape(-1,1)
    labels = labels.type(torch.FloatTensor)
    labels = labels.to(DEVICE)

    outputs = model(data)

    for answer, standard in zip(outputs, labels):
        normalised_answer = torch.round(torch.sigmoid(answer[0]))
        num_correct += 1 if normalised_answer == standard[0] else 0

torch.set_grad_enabled(True)
print(f"Accuracy {num_correct / len(loader.dataset)}")

NameError: name 'model' is not defined

## Hyperparameter Selection

In [9]:
config = {
  "lr": tune.loguniform(1e-4, 1e-1),
  "batch_size": tune.choice([2, 4, 8, 16, 32, 64, 128]),
  "is_bidirectional": tune.choice([True, False]),
  "epochs": tune.choice(list(range(1,11))),
  "linear_layers": tune.choice(list(range(1,5))),
  "rnn_layers": tune.choice(list(range(1,5))),
  "embedding_size": tune.choice([64,128,256,512]),
  "hidden_size": tune.choice(list(range(32, 513, 32)))
}

def tune_training_step(config, embedding, network):
  device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)
  chosen_embedding = embedding[config["embedding_size"]]
  chosen_network = network[config["linear_layers"]]
  coded_embeddings = torch.FloatTensor(np.array(chosen_embedding.wv.vectors)).to(device)
  vocab = chosen_embedding.wv.key_to_index
  generated_model = chosen_network(config["embedding_size"], 1, config["hidden_size"], coded_embeddings, vocab, config["rnn_layers"], config["is_bidirectional"]).to(device)
  train_and_validate_loop(generated_model, config["batch_size"], config["lr"], config["epochs"], True)


embedding = {
    64: embeddings_64,
    128: embeddings_128,
    256: embeddings_256,
    512: embeddings_512
}

network = {
    1: OneLinearLayerGruRNN,
    2: TwoLinearLayerGruRNN,
    3: ThreeLinearLayerGruRNN,
    4: FourLinearLayerGruRNN
}

asha_scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=15,
        grace_period=5,
        reduction_factor=2
    )


training_wrapper = tune.with_resources(tune.with_parameters(tune_training_step, embedding=embedding, network=network), {"CPU": 1.6, "GPU": 0.2})
tuner = tune.Tuner(
    training_wrapper,
    tune_config=tune.TuneConfig(
        num_samples=15,
        scheduler=asha_scheduler,
    ),
    param_space=config
)

result = tuner.fit()


2024-04-22 18:36:00,854	INFO tune.py:633 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+---------------------------------------------------------------------------+
| Configuration for experiment     tune_training_step_2024-04-22_18-36-00   |
+---------------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator                    |
| Scheduler                        AsyncHyperBandScheduler                  |
| Number of trials                 15                                       |
+---------------------------------------------------------------------------+

View detailed results here: /root/ray_results/tune_training_step_2024-04-22_18-36-00
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-04-22_18-35-24_297637_500/artifacts/2024-04-22_18-36-01/tune_training_step_2024-04-22_18-36-00/driver_artifacts`

Trial status: 15 PENDING
Current time: 2024-04-22 18:36:11. Total running time: 10s
Logical resource usage: 8.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)

2024-04-22 20:01:05,784	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_2af4b_00006
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=11415, ip=172.28.0.12, act


Trial tune_training_step_2af4b_00006 errored after 4 iterations at 2024-04-22 20:01:05. Total running time: 1hr 25min 4s
Error file: /tmp/ray/session_2024-04-22_18-35-24_297637_500/artifacts/2024-04-22_18-36-01/tune_training_step_2024-04-22_18-36-00/driver_artifacts/tune_training_step_2af4b_00006_6_batch_size=4,embedding_size=128,epochs=9,hidden_size=256,is_bidirectional=True,linear_layers=4,lr_2024-04-22_18-36-08/error.txt
+---------------------------------------------------------+
| Trial tune_training_step_2af4b_00006 result             |
+---------------------------------------------------------+
| checkpoint_dir_name                                     |
| time_this_iter_s                                786.131 |
| time_total_s                                    4273.49 |
| training_iteration                                    4 |
| loss                                                  2 |
+---------------------------------------------------------+

Trial status: 13 TERMINATED | 

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-04-22 21:49:20,060	INFO tune.py:1021 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/tune_training_step_2024-04-22_18-36-00' in 0.0077s.
2024-04-22 21:49:20,071	ERROR tune.py:1049 -- Trials did not complete: [tune_training_step_2af4b_00006]



Trial tune_training_step_2af4b_00011 completed after 9 iterations at 2024-04-22 21:49:20. Total running time: 3hr 13min 18s
+---------------------------------------------------------+
| Trial tune_training_step_2af4b_00011 result             |
+---------------------------------------------------------+
| checkpoint_dir_name                                     |
| time_this_iter_s                                998.966 |
| time_total_s                                    11079.8 |
| training_iteration                                    9 |
| loss                                                  0 |
+---------------------------------------------------------+

Trial status: 14 TERMINATED | 1 ERROR
Current time: 2024-04-22 21:49:20. Total running time: 3hr 13min 18s
Logical resource usage: 1.6/16 CPUs, 0.2/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
# df = result.get_dataframe(filter_metric="loss", filter_mode="min")
# df.to_csv("/content/drive/MyDrive/best_model_params.csv")

print(df)

#best_result = result.get_best_result("loss", mode="min")
# with best_result.checkpoint.as_directory() as checkpoint_dir:
#     state_dict = torch.load(os.path.join(checkpoint_dir, "model.pth"))

    loss   timestamp checkpoint_dir_name   done  training_iteration  \
0      0  1713811071                None  False                   1   
1      0  1713811039                None  False                   1   
2      0  1713811191                None  False                   1   
3      0  1713811029                None  False                   1   
4      0  1713811060                None  False                   1   
5      0  1713811218                None  False                   1   
6      0  1713813672                None  False                   2   
7      0  1713811301                None  False                   1   
8      0  1713811475                None  False                   1   
9      0  1713811849                None  False                   1   
10     0  1713811531                None  False                   1   
11     0  1713813866                None  False                   1   
12     0  1713811770                None  False                   1   
13    