<a href="https://colab.research.google.com/github/WillN202/NLU_CW/blob/main/NLU_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLU Task 2


For this task I've implemented a GRU model using TODO word embeddings. TODO WRITE ME

## Imports and Setup

In [1]:
!pip install torchmetrics
!pip install ray[tune]
from google.colab import drive
drive.mount('/content/drive')

Collecting torchmetrics
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/841.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/841.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m788.5/841.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import os.path
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
import string
import re
import gensim.downloader
import tempfile
TRAINING_DATASET_LOCATION = "/content/drive/MyDrive/train.csv"
DEV_DATASET_LOCATION = "/content/drive/MyDrive/dev.csv"
TEST_DATASET_LOCATION = "/content/drive/MyDrive/AV_trial.csv"
WORD2VEC_EMBEDDINGS = "/content/drive/MyDrive/word2vec_embeddings.model"
EPOCHS = 4

DEVICE = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
# print(DEVICE)
# torch.set_default_device(DEVICE)

## Data Loading and Pre Processing

In [3]:
def generic_preprocessor(sentence):
  sentence = sentence.lower()

  return sentence

class AVDataset(Dataset):
  def __init__(self, csv_file, pre_processor=None):
    self.samples = pd.read_csv(csv_file)
    self.pre_processor = pre_processor

  def __len__(self):
      return len(self.samples)

  def __getitem__(self, index):
    sample = self.samples.iloc[index]
    # sample[0] = re.sub(f"[{re.escape(string.punctuation)}]", "", sample[0])
    # sample[0] = re.sub(f"[{re.escape(string.punctuation)}]", "", sample[1])
    sample_text = f"{sample[0]} <sep> {sample[1]}"
    return (self.pre_processor(sample_text), sample[2])


In [4]:
training_samples = AVDataset(TRAINING_DATASET_LOCATION, pre_processor=generic_preprocessor)
dev_samples = AVDataset(DEV_DATASET_LOCATION, pre_processor=generic_preprocessor)
test_samples = AVDataset(TEST_DATASET_LOCATION, pre_processor=generic_preprocessor)

## Word Embedding Generation

In [6]:
phrases = pd.read_csv(TRAINING_DATASET_LOCATION)
phrases = phrases.loc[:, "text_1":"text_2"].to_numpy().flatten().tolist()
#phrases = [re.sub(f"[{re.escape(string.punctuation)}]", "", str(phrase)).split() for phrase in phrases]
phrases = [str(phrase).split() for phrase in phrases]

def generate_word2vec_embeddings(size):
  embeddings = Word2Vec(sentences=phrases, workers=300, min_count=1, vector_size=size)
  embeddings.wv["<UNK>"] = np.random.rand(size)
  embeddings.wv["<sep>"] = np.random.rand(size)
  embeddings.wv["<pad>"] = np.random.rand(size)

  return embeddings


embeddings_512 = generate_word2vec_embeddings(512)
embeddings_256 = generate_word2vec_embeddings(256)
embeddings_128 = generate_word2vec_embeddings(128)
embeddings_64 = generate_word2vec_embeddings(64)

## Model Creation

To reduce code content, the GRU models are defined from a base class. New linear layers are added between each class of model since this could not be done using parameters alone

In [7]:
class BaseGruRNN(torch.nn.Module):
  def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
    super(BaseGruRNN, self).__init__()
    self.vocab = vocab
    self.get_embedding = torch.nn.Embedding.from_pretrained(embeddings)
    self.GRU_Layer = torch.nn.GRU(embedding_size, hidden_size, batch_first=True, num_layers=rnn_layers, dropout=0.1, bidirectional=is_bidirectional)

  def forward(self, x, linear_layer):
    unk_embedding = self.vocab["<UNK>"]
    #x = [re.sub(f"[{re.escape(string.punctuation)}]", "", sentence) for sentence in x]
    x = [sentence.split() for sentence in x]
    x = [[self.vocab.get(word, unk_embedding) for word in sentence ] for sentence in x]
    # TODO -> instead of padding, use pack sequence instead. Note this may break the output from the lstm (woo)
    max_len = max([len(words) for words in x])
    x = [([self.vocab["<pad>"]] * (max_len -  len(words))) + words for words in x]
    input = torch.tensor(x)

    embeddings = self.get_embedding(input)
    GRU_int_results = self.GRU_Layer(embeddings)[1]
    GRU_values = GRU_int_results[0] #Get last hidden state(s)
    result = linear_layer(GRU_values)
    return result

class OneLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      self.base = super(OneLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, output_size),
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class TwoLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class ThreeLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(ThreeLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class FourLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(FourLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )


    def forward(self, x):
      return super().forward(x, self.linear_layer)


## Model Training

In [21]:
def train_and_validate_loop(model, batch_size, learning_rate, epochs, tuning=False):
  device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)
  torch.set_default_device(device)
  loss_function = torch.nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  training_loader = DataLoader(training_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  dev_loader = DataLoader(dev_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  accuracy = torchmetrics.Accuracy(task="binary")
  precision = torchmetrics.Precision(task="binary")
  torch.set_grad_enabled(True)

  if not tuning:
    print(f"Epochs: {epochs}")

  for epoch in range(0, epochs):
      model.train()
      running_loss = 0.0

      # Training
      for index, value in enumerate(training_loader):
          optimizer.zero_grad()
          data, labels = value
          labels = labels.reshape(-1,1)
          labels = labels.type(torch.FloatTensor)
          labels = labels.to(device)
          outputs = model(data)

          loss = loss_function(outputs, labels)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()

      # Validation to ensure the model is learning
      model.eval()
      torch.set_grad_enabled(False)
      running_loss = 0
      num_correct = 0

      pred = torch.Tensor().to(device)
      gold_standard = torch.Tensor().to(device)
      for index, value in enumerate(dev_loader):
          data, labels = value
          labels = labels.reshape(-1,1)
          labels = labels.type(torch.FloatTensor)
          labels = labels.to(device)

          outputs = model(data)
          loss = loss_function(outputs, labels)
          running_loss += loss

          pred = torch.cat((pred, outputs))
          gold_standard = torch.cat((gold_standard, labels))

      torch.set_grad_enabled(True)
      batch_loss = running_loss / len(dev_loader)

      if tuning:
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
          checkpoint = None
          if (epoch + 1) % 2 == 0:
              # This saves the model to the trial directory
              torch.save(
                  model.state_dict(),
                  os.path.join(temp_checkpoint_dir, "model.pth")
              )
              checkpoint = train.Checkpoint.from_directory(temp_checkpoint_dir)

        if tuning:
          train.report({"precision": float(precision(pred, gold_standard))}, checkpoint=checkpoint)
        else:
          print(f"---------------------EPOCH {epoch+1} / {epochs}---------------------")
          print(f"Batch Loss {batch_loss}")
          print(f"Accuracy {accuracy(pred, gold_standard)}")
          print(f"Precision {precision(pred, gold_standard)}")


In [19]:
test_embeddings = gensim.downloader.load("word2vec-google-news-300")
test_embeddings["<UNK>"] = np.random.rand(300)
test_embeddings["<sep>"] = np.random.rand(300)
test_embeddings["<pad>"] = np.random.rand(300)
coded_embeddings = torch.FloatTensor(test_embeddings.vectors).to(DEVICE)
vocab = test_embeddings.key_to_index
# model = FourLinearLayerGruRNN(300, 1, 300, coded_embeddings, vocab, 1, is_bidirectional=True).to(DEVICE)
# train_and_validate_loop(model,64, 0.000019238, 3)

## Model Testing

In [None]:
model.eval()
torch.set_grad_enabled(False)
num_correct = 0

loader = DataLoader(test_samples, batch_size=128, shuffle=True, generator=torch.Generator(device=DEVICE))

for index, value in enumerate(loader):
    data, labels = value
    labels = labels.reshape(-1,1)
    labels = labels.type(torch.FloatTensor)
    labels = labels.to(DEVICE)

    outputs = model(data)

    for answer, standard in zip(outputs, labels):
        normalised_answer = torch.round(torch.sigmoid(answer[0]))
        num_correct += 1 if normalised_answer == standard[0] else 0

torch.set_grad_enabled(True)
print(f"Accuracy {num_correct / len(loader.dataset)}")

## Hyperparameter Selection

In [13]:
config = {
  "lr": tune.loguniform(1e-6, 1e-4),
  "batch_size": tune.choice([8, 16, 32, 64, 128]),
  "is_bidirectional": tune.choice([True, False]),
  "epochs": tune.choice(list(range(1,11))),
  "linear_layers": tune.choice(list(range(1,5))),
  "rnn_layers": tune.choice(list(range(1,5))),
  # "embedding_size": tune.choice([64,128,256,512]),
  # "hidden_size": tune.choice(list(range(32, 513, 32)))
}

def tune_training_step(config, embedding, network):
  device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)

  chosen_network = network[config["linear_layers"]]
  coded_embeddings = torch.FloatTensor(np.array(embedding.vectors)).to(device)
  vocab = embedding.key_to_index
  generated_model = chosen_network(300, 1, 300, coded_embeddings, vocab, config["rnn_layers"], config["is_bidirectional"]).to(device)
  train_and_validate_loop(generated_model, config["batch_size"], config["lr"], config["epochs"], True)


embedding = {
    64: embeddings_64,
    128: embeddings_128,
    256: embeddings_256,
    512: embeddings_512
}

network = {
    1: OneLinearLayerGruRNN,
    2: TwoLinearLayerGruRNN,
    3: ThreeLinearLayerGruRNN,
    4: FourLinearLayerGruRNN
}

asha_scheduler = ASHAScheduler(
        metric="precision",
        mode="max",
        max_t=15,
        grace_period=2,
        reduction_factor=2
    )

chosen_embedding = gensim.downloader.load("word2vec-google-news-300")
chosen_embedding["<UNK>"] = np.random.rand(300)
chosen_embedding["<sep>"] = np.random.rand(300)
chosen_embedding["<pad>"] = np.random.rand(300)

training_wrapper = tune.with_resources(tune.with_parameters(tune_training_step, embedding=chosen_embedding, network=network), {"CPU": 1.6, "GPU": 1/3})
tuner = tune.Tuner(
    training_wrapper,
    tune_config=tune.TuneConfig(
        num_samples=15,
        scheduler=asha_scheduler,
    ),
    param_space=config
)

result = tuner.fit()


2024-04-23 01:31:58,015	INFO tune.py:633 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+---------------------------------------------------------------------------+
| Configuration for experiment     tune_training_step_2024-04-23_01-31-58   |
+---------------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator                    |
| Scheduler                        AsyncHyperBandScheduler                  |
| Number of trials                 15                                       |
+---------------------------------------------------------------------------+

View detailed results here: /root/ray_results/tune_training_step_2024-04-23_01-31-58
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-04-23_01-25-07_045823_459/artifacts/2024-04-23_01-31-58/tune_training_step_2024-04-23_01-31-58/driver_artifacts`

Trial status: 15 PENDING
Current time: 2024-04-23 01:32:06. Total running time: 8s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 acceler




Trial status: 3 RUNNING | 12 PENDING
Current time: 2024-04-23 01:32:36. Total running time: 38s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-----------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status              lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers |
+-----------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00000   RUNNING    3.35349e-05            128   False                       8                 4              4 |
| tune_training_step_46942_00001   RUNNING    1.29296e-05             16   True                        7                 4              1 |
| tune_training_step_46942_00002   RUNNING    1.87382e-06              8   True                        7       

[36m(tune_training_step pid=15701)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00000_0_batch_size=128,epochs=8,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=4_2024-04-23_01-32-02/checkpoint_000000)


Trial status: 3 RUNNING | 12 PENDING
Current time: 2024-04-23 01:34:36. Total running time: 2min 38s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status              lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00000   RUNNING    3.35349e-05            128   False                       8                 4              4        2            128.222      0.504239 |
| tune_training_step_46942_00001   RUNNING    1.29296e-05             16   True

[36m(tune_training_step pid=15702)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00001_1_batch_size=16,epochs=7,is_bidirectional=True,linear_layers=4,lr=0.0000,rnn_layers=1_2024-04-23_01-32-03/checkpoint_000000)


Trial status: 3 RUNNING | 12 PENDING
Current time: 2024-04-23 01:35:36. Total running time: 3min 38s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status              lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00000   RUNNING    3.35349e-05            128   False                       8                 4              4        3            175.914      0.503842 |
| tune_training_step_46942_00001   RUNNING    1.29296e-05             16   True

[36m(tune_training_step pid=15701)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00000_0_batch_size=128,epochs=8,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=4_2024-04-23_01-32-02/checkpoint_000001)


Trial status: 3 RUNNING | 12 PENDING
Current time: 2024-04-23 01:36:37. Total running time: 4min 38s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status              lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00000   RUNNING    3.35349e-05            128   False                       8                 4              4        4            243.961      0.508567 |
| tune_training_step_46942_00001   RUNNING    1.29296e-05             16   True

[36m(tune_training_step pid=15701)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00000_0_batch_size=128,epochs=8,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=4_2024-04-23_01-32-02/checkpoint_000002)



Trial tune_training_step_46942_00001 completed after 4 iterations at 2024-04-23 01:38:57. Total running time: 6min 58s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00001 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000001 |
| time_this_iter_s                                        130.50026 |
| time_total_s                                            405.30921 |
| training_iteration                                              4 |
| precision                                                 0.50613 |
+-------------------------------------------------------------------+


[36m(tune_training_step pid=15702)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00001_1_batch_size=16,epochs=7,is_bidirectional=True,linear_layers=4,lr=0.0000,rnn_layers=1_2024-04-23_01-32-03/checkpoint_000001)



Trial tune_training_step_46942_00003 started with configuration:
+----------------------------------------------------+
| Trial tune_training_step_46942_00003 config        |
+----------------------------------------------------+
| batch_size                                      32 |
| epochs                                           9 |
| is_bidirectional                                 0 |
| linear_layers                                    2 |
| lr                                               0 |
| rnn_layers                                       3 |
+----------------------------------------------------+

Trial status: 3 RUNNING | 1 TERMINATED | 11 PENDING
Current time: 2024-04-23 01:39:07. Total running time: 7min 9s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[36m(tune_training_step pid=15701)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00000_0_batch_size=128,epochs=8,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=4_2024-04-23_01-32-02/checkpoint_000003)



Trial tune_training_step_46942_00000 completed after 8 iterations at 2024-04-23 01:40:52. Total running time: 8min 54s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00000 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000003 |
| time_this_iter_s                                          81.9004 |
| time_total_s                                            519.96103 |
| training_iteration                                              8 |
| precision                                                 0.51093 |
+-------------------------------------------------------------------+

Trial tune_training_step_46942_00004 started with configuration:
+-------------------------------------------------------+
| Trial tune_training_step_46942_00004 config           |
+-------------------------------------------------------+
| batch_si




Trial status: 2 TERMINATED | 3 RUNNING | 10 PENDING
Current time: 2024-04-23 01:41:08. Total running time: 9min 9s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        1           319.09        0.511519 |
| tune_training_step_46942_00003   RUNNING      1.39863e

[36m(tune_training_step pid=17623)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00003_3_batch_size=32,epochs=9,is_bidirectional=False,linear_layers=2,lr=0.0000,rnn_layers=3_2024-04-23_01-32-03/checkpoint_000000)



Trial status: 3 TERMINATED | 2 RUNNING | 10 PENDING
Current time: 2024-04-23 01:42:08. Total running time: 10min 9s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        1            319.09       0.511519 |
| tune_training_step_46942_00004   RUNNING      9.32319

[36m(tune_training_step pid=18166)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00004_4_batch_size=128,epochs=6,is_bidirectional=False,linear_layers=1,lr=0.0000,rnn_layers=1_2024-04-23_01-32-03/checkpoint_000000)


Trial status: 3 TERMINATED | 3 RUNNING | 9 PENDING
Current time: 2024-04-23 01:43:08. Total running time: 11min 9s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        1            319.09       0.511519 |
| tune_training_step_46942_00004   RUNNING      9.32319e-

[36m(tune_training_step pid=15703)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00002_2_batch_size=8,epochs=7,is_bidirectional=True,linear_layers=1,lr=0.0000,rnn_layers=3_2024-04-23_01-32-03/checkpoint_000000)



Trial tune_training_step_46942_00006 started with configuration:
+-------------------------------------------------------+
| Trial tune_training_step_46942_00006 config           |
+-------------------------------------------------------+
| batch_size                                         16 |
| epochs                                              8 |
| is_bidirectional                                    0 |
| linear_layers                                       2 |
| lr                                              1e-05 |
| rnn_layers                                          4 |
+-------------------------------------------------------+

Trial status: 4 TERMINATED | 3 RUNNING | 8 PENDING
Current time: 2024-04-23 01:43:38. Total running time: 11min 39s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------

[36m(tune_training_step pid=18166)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00004_4_batch_size=128,epochs=6,is_bidirectional=False,linear_layers=1,lr=0.0000,rnn_layers=1_2024-04-23_01-32-03/checkpoint_000001)


Trial status: 4 TERMINATED | 3 RUNNING | 8 PENDING
Current time: 2024-04-23 01:44:08. Total running time: 12min 9s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        2           660.996       0.511203 |
| tune_training_step_46942_00004   RUNNING      9.32319e-

[36m(tune_training_step pid=18166)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00004_4_batch_size=128,epochs=6,is_bidirectional=False,linear_layers=1,lr=0.0000,rnn_layers=1_2024-04-23_01-32-03/checkpoint_000002)



Trial tune_training_step_46942_00004 completed after 6 iterations at 2024-04-23 01:45:39. Total running time: 13min 40s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00004 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000002 |
| time_this_iter_s                                         66.90427 |
| time_total_s                                             279.7967 |
| training_iteration                                              6 |
| precision                                                 0.51035 |
+-------------------------------------------------------------------+

Trial tune_training_step_46942_00007 started with configuration:
+-----------------------------------------------------+
| Trial tune_training_step_46942_00007 config         |
+-----------------------------------------------------+
| batch_size   

[36m(tune_training_step pid=18856)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00006_6_batch_size=16,epochs=8,is_bidirectional=False,linear_layers=2,lr=0.0000,rnn_layers=4_2024-04-23_01-32-04/checkpoint_000000)



Trial tune_training_step_46942_00007 completed after 2 iterations at 2024-04-23 01:48:03. Total running time: 16min 5s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00007 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000000 |
| time_this_iter_s                                         71.76413 |
| time_total_s                                            138.69072 |
| training_iteration                                              2 |
| precision                                                     0.5 |
+-------------------------------------------------------------------+


[36m(tune_training_step pid=19542)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00007_7_batch_size=128,epochs=9,is_bidirectional=True,linear_layers=3,lr=0.0000,rnn_layers=2_2024-04-23_01-32-04/checkpoint_000000)



Trial status: 6 TERMINATED | 2 RUNNING | 7 PENDING
Current time: 2024-04-23 01:48:08. Total running time: 16min 10s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        3           945.14        0.513098 |
| tune_training_step_46942_00006   RUNNING      1.34487

[36m(tune_training_step pid=20213)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00008_8_batch_size=32,epochs=4,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=4_2024-04-23_01-32-04/checkpoint_000000)



Trial tune_training_step_46942_00008 completed after 2 iterations at 2024-04-23 01:51:20. Total running time: 19min 22s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00008 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000000 |
| time_this_iter_s                                        101.54963 |
| time_total_s                                            190.60332 |
| training_iteration                                              2 |
| precision                                                 0.50416 |
+-------------------------------------------------------------------+

Trial tune_training_step_46942_00009 started with configuration:
+-------------------------------------------------------+
| Trial tune_training_step_46942_00009 config           |
+-------------------------------------------------------+
| batch_s

[36m(tune_training_step pid=18856)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00006_6_batch_size=16,epochs=8,is_bidirectional=False,linear_layers=2,lr=0.0000,rnn_layers=4_2024-04-23_01-32-04/checkpoint_000001)



Trial tune_training_step_46942_00010 started with configuration:
+-------------------------------------------------------+
| Trial tune_training_step_46942_00010 config           |
+-------------------------------------------------------+
| batch_size                                        128 |
| epochs                                              3 |
| is_bidirectional                                    1 |
| linear_layers                                       2 |
| lr                                              6e-05 |
| rnn_layers                                          1 |
+-------------------------------------------------------+





Trial status: 8 TERMINATED | 3 RUNNING | 4 PENDING
Current time: 2024-04-23 01:52:38. Total running time: 20min 40s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        3           945.14        0.513098 |
| tune_training_step_46942_00009   RUNNING      5.28758

[36m(tune_training_step pid=15703)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00002_2_batch_size=8,epochs=7,is_bidirectional=True,linear_layers=1,lr=0.0000,rnn_layers=3_2024-04-23_01-32-03/checkpoint_000001)


Trial status: 8 TERMINATED | 3 RUNNING | 4 PENDING
Current time: 2024-04-23 01:54:39. Total running time: 22min 40s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        4          1322.04        0.51214  |
| tune_training_step_46942_00009   RUNNING      5.28758e

[36m(tune_training_step pid=21397)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00010_10_batch_size=128,epochs=3,is_bidirectional=True,linear_layers=2,lr=0.0001,rnn_layers=1_2024-04-23_01-32-05/checkpoint_000000)
[36m(tune_training_step pid=21113)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00009_9_batch_size=64,epochs=2,is_bidirectional=True,linear_layers=3,lr=0.0001,rnn_layers=2_2024-04-23_01-32-04/checkpoint_000000)



Trial tune_training_step_46942_00009 completed after 2 iterations at 2024-04-23 01:55:08. Total running time: 23min 10s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00009 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000000 |
| time_this_iter_s                                        159.77935 |
| time_total_s                                            221.59899 |
| training_iteration                                              2 |
| precision                                                 0.50153 |
+-------------------------------------------------------------------+

Trial status: 10 TERMINATED | 1 RUNNING | 4 PENDING
Current time: 2024-04-23 01:55:09. Total running time: 23min 10s
Logical resource usage: 3.2/16 CPUs, 0.6666666666666666/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------




Trial status: 10 TERMINATED | 3 RUNNING | 2 PENDING
Current time: 2024-04-23 01:55:39. Total running time: 23min 40s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        4          1322.04        0.51214  |
| tune_training_step_46942_00011   RUNNING      1.9089

[36m(tune_training_step pid=22170)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00011_11_batch_size=8,epochs=6,is_bidirectional=False,linear_layers=1,lr=0.0000,rnn_layers=1_2024-04-23_01-32-05/checkpoint_000000)



Trial tune_training_step_46942_00011 completed after 2 iterations at 2024-04-23 02:00:38. Total running time: 28min 40s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00011 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000000 |
| time_this_iter_s                                        182.64871 |
| time_total_s                                            326.38151 |
| training_iteration                                              2 |
| precision                                                  0.5019 |
+-------------------------------------------------------------------+

Trial status: 11 TERMINATED | 2 RUNNING | 2 PENDING
Current time: 2024-04-23 02:00:39. Total running time: 28min 41s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+--------------------------------------

[36m(tune_training_step pid=22235)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00012_12_batch_size=32,epochs=5,is_bidirectional=True,linear_layers=2,lr=0.0000,rnn_layers=4_2024-04-23_01-32-05/checkpoint_000000)


Trial status: 11 TERMINATED | 3 RUNNING | 1 PENDING
Current time: 2024-04-23 02:01:39. Total running time: 29min 41s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        5          1619.7         0.511322 |
| tune_training_step_46942_00012   RUNNING      1.15484

[36m(tune_training_step pid=15703)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00002_2_batch_size=8,epochs=7,is_bidirectional=True,linear_layers=1,lr=0.0000,rnn_layers=3_2024-04-23_01-32-03/checkpoint_000002)
[36m(tune_training_step pid=24395)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00014_14_batch_size=64,epochs=9,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=3_2024-04-23_01-32-05/checkpoint_000000)


Trial status: 12 TERMINATED | 3 RUNNING
Current time: 2024-04-23 02:05:39. Total running time: 33min 41s
Logical resource usage: 4.800000000000001/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        6          1985.46        0.512774 |
| tune_training_step_46942_00012   RUNNING      1.15484e-06        

[36m(tune_training_step pid=22235)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00012_12_batch_size=32,epochs=5,is_bidirectional=True,linear_layers=2,lr=0.0000,rnn_layers=4_2024-04-23_01-32-05/checkpoint_000001)



Trial tune_training_step_46942_00014 completed after 4 iterations at 2024-04-23 02:07:39. Total running time: 35min 41s
+-------------------------------------------------------------------+
| Trial tune_training_step_46942_00014 result                       |
+-------------------------------------------------------------------+
| checkpoint_dir_name                             checkpoint_000001 |
| time_this_iter_s                                         69.68783 |
| time_total_s                                            262.15322 |
| training_iteration                                              4 |
| precision                                                 0.50579 |
+-------------------------------------------------------------------+

Trial status: 13 TERMINATED | 2 RUNNING
Current time: 2024-04-23 02:07:40. Total running time: 35min 41s
Logical resource usage: 3.2/16 CPUs, 0.6666666666666666/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------

[36m(tune_training_step pid=24395)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-23_01-31-58/tune_training_step_46942_00014_14_batch_size=64,epochs=9,is_bidirectional=False,linear_layers=4,lr=0.0000,rnn_layers=3_2024-04-23_01-32-05/checkpoint_000001)


Trial status: 13 TERMINATED | 2 RUNNING
Current time: 2024-04-23 02:08:10. Total running time: 36min 11s
Logical resource usage: 3.2/16 CPUs, 0.6666666666666666/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                       status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     iter     total time (s)     precision |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_46942_00002   RUNNING      1.87382e-06              8   True                        7                 1              3        6          1985.46        0.512774 |
| tune_training_step_46942_00012   RUNNING      1.15484e-06       

2024-04-23 02:09:53,408	INFO tune.py:1021 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/tune_training_step_2024-04-23_01-31-58' in 0.0071s.



Trial tune_training_step_46942_00002 completed after 7 iterations at 2024-04-23 02:09:53. Total running time: 37min 54s
+---------------------------------------------------------+
| Trial tune_training_step_46942_00002 result             |
+---------------------------------------------------------+
| checkpoint_dir_name                                     |
| time_this_iter_s                                275.922 |
| time_total_s                                    2261.38 |
| training_iteration                                    7 |
| precision                                       0.51269 |
+---------------------------------------------------------+

Trial status: 15 TERMINATED
Current time: 2024-04-23 02:09:53. Total running time: 37min 55s
Logical resource usage: 1.6/16 CPUs, 0.3333333333333333/1 GPUs (0.0/1.0 accelerator_type:L4)
+-------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
# df = result.get_dataframe(filter_metric="loss", filter_mode="min")
# df.to_csv("/content/drive/MyDrive/best_model_params.csv")

# print(df)

best_result = result.get_best_result("precision", mode="max")
with best_result.checkpoint.as_directory() as checkpoint_dir:
    state_dict = torch.load(os.path.join(checkpoint_dir, "model.pth"))



In [32]:
model = TwoLinearLayerGruRNN(300, 1, 300, coded_embeddings, vocab, 4, is_bidirectional=True).to(DEVICE)
model.load_state_dict(state_dict)
train_and_validate_loop(model,32, 1.15484e-06, 5)

Epochs: 5


KeyboardInterrupt: 