<a href="https://colab.research.google.com/github/WillN202/NLU_CW/blob/main/NLU_Task_2_TrainedWord2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLU Task 2


For this task I've implemented a GRU model using TODO word embeddings. TODO WRITE ME

## Imports and Setup

In [3]:
!pip install torchmetrics
!pip install ray[tune]
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import os.path
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
import string
import re
import gensim.downloader
import tempfile
import random
TRAINING_DATASET_LOCATION = "/content/drive/MyDrive/train.csv"
DEV_DATASET_LOCATION = "/content/drive/MyDrive/dev.csv"
TEST_DATASET_LOCATION = "/content/drive/MyDrive/AV_trial.csv"
WORD2VEC_EMBEDDINGS = "/content/drive/MyDrive/word2vec_embeddings_tessty.model"
EPOCHS = 4

DEVICE = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
# print(DEVICE)
# torch.set_default_device(DEVICE)

## Data Loading and Pre Processing

In [6]:
def generic_preprocessor(sentence):
  sentence = sentence.lower()

  return sentence

class AVDataset(Dataset):
  def __init__(self, csv_file, pre_processor=None):
    self.samples = pd.read_csv(csv_file)
    self.pre_processor = pre_processor

  def __len__(self):
      return len(self.samples)

  def __getitem__(self, index):
    sample = self.samples.iloc[index]
    # sample[0] = re.sub(f"[{re.escape(string.punctuation)}]", "", sample[0])
    # sample[0] = re.sub(f"[{re.escape(string.punctuation)}]", "", sample[1])
    sample_text = f"{sample[0]} <sep> {sample[1]}"
    return (self.pre_processor(sample_text), sample[2])


In [7]:
training_samples = AVDataset(TRAINING_DATASET_LOCATION, pre_processor=generic_preprocessor)
dev_samples = AVDataset(DEV_DATASET_LOCATION, pre_processor=generic_preprocessor)
test_samples = AVDataset(TEST_DATASET_LOCATION, pre_processor=generic_preprocessor)

## Word Embedding Generation

In [8]:
def add_unk(phrase1, phrase2):
  phrase1 = ["<UNK>" if random.random() < 0.05 else word for word in phrase1.split()]
  phrase2 = ["<UNK>" if random.random() < 0.05 else word for word in phrase2.split()]

  return ["<pad>"] * random.randint(1,15) + phrase1 + ["<sep>"] + phrase2


def generate_word2vec_embeddings(size, phrases):
  return Word2Vec(sentences=phrases, workers=300, min_count=1, vector_size=size)

phrases = pd.read_csv(TRAINING_DATASET_LOCATION)
phrases = phrases.loc[:, "text_1":"text_2"].to_numpy().flatten().tolist()
phrases = [add_unk(str(phrases[i]), str(phrases[i+1])) for i in range(0, len(phrases), 2)]

embeddings_1024 = generate_word2vec_embeddings(1024, phrases)
embeddings_512 = generate_word2vec_embeddings(512, phrases)
embeddings_256 = generate_word2vec_embeddings(256, phrases)
embeddings_128 = generate_word2vec_embeddings(128, phrases)
embeddings_64 = generate_word2vec_embeddings(64, phrases)

## Model Creation

To reduce code content, the GRU models are defined from a base class. New linear layers are added between each class of model since this could not be done using parameters alone

In [9]:
class BaseGruRNN(torch.nn.Module):
  def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
    super(BaseGruRNN, self).__init__()
    self.vocab = vocab
    self.get_embedding = torch.nn.Embedding.from_pretrained(embeddings)
    self.GRU_Layer = torch.nn.GRU(embedding_size, hidden_size, batch_first=True, num_layers=rnn_layers, dropout=0.1, bidirectional=is_bidirectional)

  def forward(self, x, linear_layer):
    unk_embedding = self.vocab["<UNK>"]
    #x = [re.sub(f"[{re.escape(string.punctuation)}]", "", sentence) for sentence in x]
    x = [sentence.split() for sentence in x]
    x = [[self.vocab.get(word, unk_embedding) for word in sentence ] for sentence in x]
    # TODO -> instead of padding, use pack sequence instead. Note this may break the output from the lstm (woo)
    max_len = max([len(words) for words in x])
    x = [([self.vocab["<pad>"]] * (max_len -  len(words))) + words for words in x]
    input = torch.tensor(x)

    embeddings = self.get_embedding(input)
    GRU_int_results = self.GRU_Layer(embeddings)[1]
    GRU_values = GRU_int_results[-1] #Get last hidden state(s)
    result = linear_layer(GRU_values)
    return result

class OneLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      self.base = super(OneLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, output_size),
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class TwoLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super().__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class ThreeLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(ThreeLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )

    def forward(self, x):
      return super().forward(x, self.linear_layer)

class FourLinearLayerGruRNN(BaseGruRNN):
    def __init__(self, embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers=1, is_bidirectional=False):
      super(FourLinearLayerGruRNN, self).__init__(embedding_size, output_size, hidden_size, embeddings, vocab, rnn_layers, is_bidirectional)
      self.linear_layer = torch.nn.Sequential(
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, hidden_size),
          torch.nn.LeakyReLU(),
          torch.nn.Dropout(p=0.1),
          torch.nn.Linear(hidden_size, output_size)
      )


    def forward(self, x):
      return super().forward(x, self.linear_layer)


## Model Training

In [10]:
def train_and_validate_loop(model, batch_size, learning_rate, epochs, tuning=False):
  device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)
  torch.set_default_device(device)
  loss_function = torch.nn.BCEWithLogitsLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
  training_loader = DataLoader(training_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  dev_loader = DataLoader(dev_samples, batch_size=batch_size, generator=torch.Generator(device=device))
  accuracy = torchmetrics.Accuracy(task="binary")
  f1_score = torchmetrics.F1Score(task="binary")
  mcc = torchmetrics.MatthewsCorrCoef(task="binary")
  torch.set_grad_enabled(True)

  if not tuning:
    print(f"Epochs: {epochs}")

  for epoch in range(0, epochs):
      model.train()
      running_loss = 0.0

      # Training
      for index, value in enumerate(training_loader):
          optimizer.zero_grad()
          data, labels = value
          labels = labels.reshape(-1,1)
          labels = labels.type(torch.FloatTensor)
          labels = labels.to(device)
          outputs = model(data)

          loss = loss_function(outputs, labels)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()

      # Validation to ensure the model is learning
      model.eval()
      torch.set_grad_enabled(False)
      running_loss = 0
      num_correct = 0

      pred = torch.Tensor().to(device)
      gold_standard = torch.Tensor().to(device)
      for index, value in enumerate(dev_loader):
          data, labels = value
          labels = labels.reshape(-1,1)
          labels = labels.type(torch.FloatTensor)
          labels = labels.to(device)

          outputs = model(data)
          loss = loss_function(outputs, labels)
          running_loss += loss

          normalised_outputs = torch.sigmoid(outputs)
          pred = torch.cat((pred, normalised_outputs))
          gold_standard = torch.cat((gold_standard, labels))

      torch.set_grad_enabled(True)
      batch_loss = running_loss / len(dev_loader)
      f1 = f1_score(pred, gold_standard)
      acc = accuracy(pred, gold_standard)
      m_score = mcc(pred, gold_standard)

      score = (float(acc)) + (1.5 * float(m_score))
      with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
        checkpoint = None
        if (epoch + 1) % 3 == 0:
            # This saves the model to the trial directory
            torch.save(
                model.state_dict(),
                os.path.join(temp_checkpoint_dir, "model.pth")
            )
            checkpoint = train.Checkpoint.from_directory(temp_checkpoint_dir)

        if tuning:
          train.report({"score": score}, checkpoint=checkpoint)
        else:
          print(f"---------------------EPOCH {epoch+1} / {epochs}---------------------")
          print(f"Batch Loss {batch_loss}")
          print(f"Accuracy {accuracy(pred, gold_standard)}")


In [None]:
# test_embeddings = gensim.downloader.load("word2vec-google-news-300")
# test_embeddings["<UNK>"] = np.random.rand(300)
# test_embeddings["<sep>"] = np.random.rand(300)
# test_embeddings["<pad>"] = np.random.rand(300)
# coded_embeddings = torch.FloatTensor(test_embeddings.vectors).to(DEVICE)
# vocab = test_embeddings.key_to_index
# model = FourLinearLayerGruRNN(300, 1, 300, coded_embeddings, vocab, 1, is_bidirectional=True).to(DEVICE)
# train_and_validate_loop(model,64, 0.000019238, 3)

## Hyperparameter Selection

In [11]:
def tune_training_step(config, embedding, network):
  device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "cpu"
)

  chosen_network = network[config["linear_layers"]]
  embedding = embedding[config["embedding_size"]]
  coded_embeddings = torch.FloatTensor(np.array(embedding.wv.vectors)).to(device)
  vocab = embedding.wv.key_to_index
  generated_model = chosen_network(config["embedding_size"], 1, config["embedding_size"], coded_embeddings, vocab, config["rnn_layers"], config["is_bidirectional"]).to(device)
  train_and_validate_loop(generated_model, config["batch_size"], config["lr"], config["epochs"], True)


config = {
  "lr": tune.loguniform(1e-6, 1),
  "batch_size": tune.choice([8, 16, 32, 64, 128]),
  "is_bidirectional": tune.choice([True, False]),
  "epochs": tune.choice(list(range(1,15))),
  "linear_layers": tune.choice(list(range(1,5))),
  "rnn_layers": tune.choice(list(range(1,10))),
  "embedding_size": tune.choice([64,128,256,512, 1024]),
  "hidden_size": tune.choice(list(range(128, 1025, 128)))
}


embedding = {
    64: embeddings_64,
    128: embeddings_128,
    256: embeddings_256,
    512: embeddings_512,
    1024: embeddings_1024
}

network = {
    1: OneLinearLayerGruRNN,
    2: TwoLinearLayerGruRNN,
    3: ThreeLinearLayerGruRNN,
    4: FourLinearLayerGruRNN
}

asha_scheduler = ASHAScheduler(
        metric="score",
        mode="max",
        max_t=14,
        grace_period=6,
        reduction_factor=2
    )

chosen_search_alg = HyperOptSearch(metric="score", mode="max")

training_wrapper = tune.with_resources(tune.with_parameters(tune_training_step, embedding=embedding, network=network), {"CPU": 1.6, "GPU": 0.25})
tuner = tune.Tuner(
    training_wrapper,
    tune_config=tune.TuneConfig(
        num_samples=35,
        search_alg=chosen_search_alg,
        scheduler=asha_scheduler,
    ),
    param_space=config
)

result = tuner.fit()


  self.pid = _posixsubprocess.fork_exec(
2024-04-24 00:49:18,547	INFO worker.py:1749 -- Started a local Ray instance.
2024-04-24 00:49:20,590	INFO tune.py:263 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
2024-04-24 00:49:20,593	INFO tune.py:633 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+---------------------------------------------------------------------------+
| Configuration for experiment     tune_training_step_2024-04-24_00-49-15   |
+---------------------------------------------------------------------------+
| Search algorithm                 SearchGenerator                          |
| Scheduler                        AsyncHyperBandScheduler                  |
| Number of trials                 35                                       |
+---------------------------------------------------------------------------+

View detailed results here: /root/ray_results/tune_training_step_2024-04-24_00-49-15
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts`

Trial status: 1 PENDING
Current time: 2024-04-24 00:49:21. Total running time: 0s
Logical resource usage: 0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----




Trial tune_training_step_d8a71156 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_d8a71156 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   128 |
| epochs                                             1 |
| hidden_size                                      512 |
| is_bidirectional                                   1 |
| linear_layers                                      3 |
| lr                                           0.00092 |
| rnn_layers                                         5 |
+------------------------------------------------------+

Trial tune_training_step_58b81384 started with configuration:
+----------------------------------------------------+
| Trial tune_training_step_58b81384 config           |
+----------------------------------------------------+
| batch_size             




Trial tune_training_step_ef5c4940 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_ef5c4940 config             |
+------------------------------------------------------+
| batch_size                                        32 |
| embedding_size                                   128 |
| epochs                                             8 |
| hidden_size                                     1024 |
| is_bidirectional                                   1 |
| linear_layers                                      1 |
| lr                                           0.00782 |
| rnn_layers                                         8 |
+------------------------------------------------------+

Trial status: 4 RUNNING | 1 PENDING
Current time: 2024-04-24 00:49:51. Total running time: 30s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------




Trial status: 4 RUNNING | 1 TERMINATED | 1 PENDING
Current time: 2024-04-24 00:50:52. Total running time: 1min 30s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_43f8d0bb   RUNNING      0.00126034              16   False                       3                 4              1                512            

[36m(tune_training_step pid=15435)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_58b81384_3_batch_size=128,embedding_size=512,epochs=7,hidden_size=1024,is_bidirectional=True,linear_layers=4,lr_2024-04-24_00-49-32/checkpoint_000000)


Trial status: 4 RUNNING | 1 TERMINATED | 1 PENDING
Current time: 2024-04-24 00:52:52. Total running time: 3min 31s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_43f8d0bb   RUNNING      0.00126034              16   False                       3                 4              1                512             

[36m(tune_training_step pid=15435)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_58b81384_3_batch_size=128,embedding_size=512,epochs=7,hidden_size=1024,is_bidirectional=True,linear_layers=4,lr_2024-04-24_00-49-32/checkpoint_000001)
[36m(tune_training_step pid=15521)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_ef5c4940_4_batch_size=32,embedding_size=128,epochs=8,hidden_size=1024,is_bidirectional=True,linear_layers=1,lr=_2024-04-24_00-49-38/checkpoint_000000)


Trial status: 4 RUNNING | 1 TERMINATED | 1 PENDING
Current time: 2024-04-24 00:55:22. Total running time: 6min 1s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_43f8d0bb   RUNNING      0.00126034              16   False                       3                 4              1                512             6

[36m(tune_training_step pid=15245)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_43f8d0bb_1_batch_size=16,embedding_size=512,epochs=3,hidden_size=640,is_bidirectional=False,linear_layers=4,lr=_2024-04-24_00-49-21/checkpoint_000000)



Trial tune_training_step_43f8d0bb completed after 3 iterations at 2024-04-24 00:56:24. Total running time: 7min 3s
+----------------------------------------------------------------+
| Trial tune_training_step_43f8d0bb result                       |
+----------------------------------------------------------------+
| checkpoint_dir_name                          checkpoint_000000 |
| time_this_iter_s                                     146.16194 |
| time_total_s                                         417.73367 |
| training_iteration                                           3 |
| score                                                  0.49802 |
+----------------------------------------------------------------+

Trial tune_training_step_8280f8a4 started with configuration:
+---------------------------------------------------+
| Trial tune_training_step_8280f8a4 config          |
+---------------------------------------------------+
| batch_size                                     32 |
| 

[36m(tune_training_step pid=17426)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_8280f8a4_7_batch_size=32,embedding_size=64,epochs=4,hidden_size=1024,is_bidirectional=True,linear_layers=3,lr=0_2024-04-24_00-56-18/checkpoint_000000)


Trial status: 3 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 00:58:52. Total running time: 9min 31s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_ef5c4940   RUNNING      0.00781718              32   True                        8                 1              8                128            1

[36m(tune_training_step pid=15521)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_ef5c4940_4_batch_size=32,embedding_size=128,epochs=8,hidden_size=1024,is_bidirectional=True,linear_layers=1,lr=_2024-04-24_00-49-38/checkpoint_000001)


Trial status: 4 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:00:52. Total running time: 11min 31s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_ef5c4940   RUNNING      0.00781718              32   True                        8                 1              8                128            

[36m(tune_training_step pid=17317)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2d98e058_6_batch_size=128,embedding_size=512,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-50-45/checkpoint_000000)


Trial status: 4 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:02:23. Total running time: 13min 1s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_ef5c4940   RUNNING      0.00781718              32   True                        8                 1              8                128            1

[36m(tune_training_step pid=17317)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2d98e058_6_batch_size=128,embedding_size=512,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-50-45/checkpoint_000001)


Trial status: 5 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:07:53. Total running time: 18min 32s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=18253)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d31a7767_8_batch_size=8,embedding_size=64,epochs=13,hidden_size=512,is_bidirectional=True,linear_layers=2,lr=0._2024-04-24_00-56-30/checkpoint_000000)


Trial status: 5 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:10:23. Total running time: 21min 2s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            1

[36m(tune_training_step pid=19585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_e0866910_9_batch_size=64,embedding_size=256,epochs=7,hidden_size=384,is_bidirectional=False,linear_layers=1,lr=_2024-04-24_00-59-34/checkpoint_000000)


Trial status: 5 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:11:23. Total running time: 22min 2s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            1

[36m(tune_training_step pid=17317)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2d98e058_6_batch_size=128,embedding_size=512,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-50-45/checkpoint_000002)


Trial status: 5 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:13:54. Total running time: 24min 33s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=19585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_e0866910_9_batch_size=64,embedding_size=256,epochs=7,hidden_size=384,is_bidirectional=False,linear_layers=1,lr=_2024-04-24_00-59-34/checkpoint_000001)


Trial status: 5 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:17:24. Total running time: 28min 3s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            1

[36m(tune_training_step pid=15848)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_c99a90ba_5_batch_size=8,embedding_size=1024,epochs=12,hidden_size=1024,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-49-44/checkpoint_000000)


Trial status: 5 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:18:24. Total running time: 29min 3s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            1

[36m(tune_training_step pid=17317)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2d98e058_6_batch_size=128,embedding_size=512,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-50-45/checkpoint_000003)



Trial tune_training_step_2d98e058 completed after 12 iterations at 2024-04-24 01:19:15. Total running time: 29min 54s
+----------------------------------------------------------------+
| Trial tune_training_step_2d98e058 result                       |
+----------------------------------------------------------------+
| checkpoint_dir_name                          checkpoint_000003 |
| time_this_iter_s                                     114.53455 |
| time_total_s                                        1377.16928 |
| training_iteration                                          12 |
| score                                                    0.892 |
+----------------------------------------------------------------+

Trial tune_training_step_e0866910 completed after 7 iterations at 2024-04-24 01:19:18. Total running time: 29min 56s
+------------------------------------------------------+
| Trial tune_training_step_e0866910 result             |
+---------------------------------------------




Trial status: 7 TERMINATED | 3 RUNNING | 1 PENDING
Current time: 2024-04-24 01:19:24. Total running time: 30min 3s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=18253)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d31a7767_8_batch_size=8,embedding_size=64,epochs=13,hidden_size=512,is_bidirectional=True,linear_layers=2,lr=0._2024-04-24_00-56-30/checkpoint_000001)


Trial status: 7 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:20:54. Total running time: 31min 33s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=23369)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_82a91684_10_batch_size=8,embedding_size=128,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=3,lr=_2024-04-24_01-04-36/checkpoint_000000)


Trial status: 7 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:24:55. Total running time: 35min 34s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=18253)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d31a7767_8_batch_size=8,embedding_size=64,epochs=13,hidden_size=512,is_bidirectional=True,linear_layers=2,lr=0._2024-04-24_00-56-30/checkpoint_000002)


Trial status: 7 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:28:55. Total running time: 39min 34s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=23369)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_82a91684_10_batch_size=8,embedding_size=128,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=3,lr=_2024-04-24_01-04-36/checkpoint_000001)


Trial status: 7 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:30:55. Total running time: 41min 34s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            

[36m(tune_training_step pid=23369)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_82a91684_10_batch_size=8,embedding_size=128,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=3,lr=_2024-04-24_01-04-36/checkpoint_000002)


Trial status: 7 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:36:26. Total running time: 47min 4s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_c99a90ba   RUNNING      0.0625229                8   True                       12                 2              1               1024            1

[36m(tune_training_step pid=18253)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d31a7767_8_batch_size=8,embedding_size=64,epochs=13,hidden_size=512,is_bidirectional=True,linear_layers=2,lr=0._2024-04-24_00-56-30/checkpoint_000003)



Trial tune_training_step_2a5f3f91 started with configuration:
+----------------------------------------------------+
| Trial tune_training_step_2a5f3f91 config           |
+----------------------------------------------------+
| batch_size                                       8 |
| embedding_size                                 256 |
| epochs                                          10 |
| hidden_size                                    896 |
| is_bidirectional                                 0 |
| linear_layers                                    4 |
| lr                                           2e-05 |
| rnn_layers                                       2 |
+----------------------------------------------------+

Trial status: 8 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:37:26. Total running time: 48min 5s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------

[36m(tune_training_step pid=15848)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_c99a90ba_5_batch_size=8,embedding_size=1024,epochs=12,hidden_size=1024,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-49-44/checkpoint_000001)



Trial tune_training_step_aaaac0c2 started with configuration:
+--------------------------------------------------+
| Trial tune_training_step_aaaac0c2 config         |
+--------------------------------------------------+
| batch_size                                    32 |
| embedding_size                               128 |
| epochs                                         1 |
| hidden_size                                  896 |
| is_bidirectional                               0 |
| linear_layers                                  1 |
| lr                                             0 |
| rnn_layers                                     4 |
+--------------------------------------------------+

Trial status: 9 TERMINATED | 4 RUNNING | 1 PENDING
Current time: 2024-04-24 01:39:26. Total running time: 50min 5s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------

2024-04-24 01:40:09,913	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_2f0bcead
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=28860, ip=172.28.0.12, 


Trial tune_training_step_2f0bcead errored after 0 iterations at 2024-04-24 01:40:09. Total running time: 50min 48s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_2f0bcead_14_batch_size=64,embedding_size=512,epochs=1,hidden_size=512,is_bidirectional=True,linear_layers=1,lr=_2024-04-24_01-39-14/error.txt

Trial tune_training_step_6cdb1b3f started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_6cdb1b3f config             |
+------------------------------------------------------+
| batch_size                                         8 |
| embedding_size                                  1024 |
| epochs                                            14 |
| hidden_size                                      384 |
| is_bidirectional                                   1 |
| linear_layers                                      1 |
|




Trial status: 10 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:40:26. Total running time: 51min 5s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_82a91684   RUNNING      1.46706e-05              8   True                       12                 3              1                128 

[36m(tune_training_step pid=23369)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_82a91684_10_batch_size=8,embedding_size=128,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=3,lr=_2024-04-24_01-04-36/checkpoint_000003)



Trial tune_training_step_08bcd4e7 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_08bcd4e7 config             |
+------------------------------------------------------+
| batch_size                                        16 |
| embedding_size                                    64 |
| epochs                                             4 |
| hidden_size                                     1024 |
| is_bidirectional                                   0 |
| linear_layers                                      4 |
| lr                                           0.02627 |
| rnn_layers                                         7 |
+------------------------------------------------------+

Trial status: 11 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:41:56. Total running time: 52min 35s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------

[36m(tune_training_step pid=29406)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_08bcd4e7_16_batch_size=16,embedding_size=64,epochs=4,hidden_size=1024,is_bidirectional=False,linear_layers=4,lr_2024-04-24_01-40-15/checkpoint_000000)


Trial status: 11 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:45:57. Total running time: 56min 35s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               1024 




Trial status: 12 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:47:57. Total running time: 58min 36s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               1024

[36m(tune_training_step pid=30950)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d1562b69_17_batch_size=128,embedding_size=128,epochs=10,hidden_size=1024,is_bidirectional=False,linear_layers=2_2024-04-24_01-41-35/checkpoint_000000)


Trial status: 12 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:48:27. Total running time: 59min 6s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               1024  

[36m(tune_training_step pid=28071)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2a5f3f91_12_batch_size=8,embedding_size=256,epochs=10,hidden_size=896,is_bidirectional=False,linear_layers=4,lr_2024-04-24_01-19-27/checkpoint_000000)


Trial status: 12 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:48:57. Total running time: 59min 36s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               1024 

[36m(tune_training_step pid=30950)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d1562b69_17_batch_size=128,embedding_size=128,epochs=10,hidden_size=1024,is_bidirectional=False,linear_layers=2_2024-04-24_01-41-35/checkpoint_000001)


Trial status: 12 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:49:27. Total running time: 1hr 0min 6s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               102

[36m(tune_training_step pid=30950)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_d1562b69_17_batch_size=128,embedding_size=128,epochs=10,hidden_size=1024,is_bidirectional=False,linear_layers=2_2024-04-24_01-41-35/checkpoint_000002)


Trial status: 12 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:49:57. Total running time: 1hr 0min 36s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10




Trial status: 13 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:50:27. Total running time: 1hr 1min 6s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10

[36m(tune_training_step pid=31731)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_ee2f70cb_18_batch_size=128,embedding_size=256,epochs=8,hidden_size=768,is_bidirectional=True,linear_layers=2,lr_2024-04-24_01-47-28/checkpoint_000000)


Trial status: 13 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:52:27. Total running time: 1hr 3min 6s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               102

[36m(tune_training_step pid=31731)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_ee2f70cb_18_batch_size=128,embedding_size=256,epochs=8,hidden_size=768,is_bidirectional=True,linear_layers=2,lr_2024-04-24_01-47-28/checkpoint_000001)


Trial status: 13 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:53:57. Total running time: 1hr 4min 36s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10




Trial status: 14 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 01:55:28. Total running time: 1hr 6min 7s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10

[36m(tune_training_step pid=28071)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2a5f3f91_12_batch_size=8,embedding_size=256,epochs=10,hidden_size=896,is_bidirectional=False,linear_layers=4,lr_2024-04-24_01-19-27/checkpoint_000001)


Trial status: 14 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 02:01:58. Total running time: 1hr 12min 37s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               1

[36m(tune_training_step pid=29001)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_6cdb1b3f_15_batch_size=8,embedding_size=1024,epochs=14,hidden_size=384,is_bidirectional=True,linear_layers=1,lr_2024-04-24_01-39-55/checkpoint_000000)


Trial status: 14 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 02:03:28. Total running time: 1hr 14min 7s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10

[36m(tune_training_step pid=32982)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_762d2324_19_batch_size=8,embedding_size=256,epochs=7,hidden_size=640,is_bidirectional=False,linear_layers=3,lr=_2024-04-24_01-50-18/checkpoint_000000)


Trial status: 14 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 02:06:29. Total running time: 1hr 17min 7s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10

[36m(tune_training_step pid=28071)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2a5f3f91_12_batch_size=8,embedding_size=256,epochs=10,hidden_size=896,is_bidirectional=False,linear_layers=4,lr_2024-04-24_01-19-27/checkpoint_000002)


Trial status: 14 TERMINATED | 4 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 02:14:29. Total running time: 1hr 25min 8s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               10

[36m(tune_training_step pid=32982)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_762d2324_19_batch_size=8,embedding_size=256,epochs=7,hidden_size=640,is_bidirectional=False,linear_layers=3,lr=_2024-04-24_01-50-18/checkpoint_000001)



Trial status: 15 TERMINATED | 3 RUNNING | 1 ERROR | 1 PENDING
Current time: 2024-04-24 02:17:00. Total running time: 1hr 27min 38s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5               

2024-04-24 02:19:03,236	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_86f0eda5
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39147, ip=172.28.0.12, 


Trial tune_training_step_86f0eda5 errored after 0 iterations at 2024-04-24 02:19:03. Total running time: 1hr 29min 42s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_86f0eda5_21_batch_size=32,embedding_size=1024,epochs=4,hidden_size=512,is_bidirectional=False,linear_layers=1,l_2024-04-24_02-17-02/error.txt

Trial tune_training_step_7b11e5a4 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_7b11e5a4 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                            10 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:19:11,186	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_7b11e5a4
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39240, ip=172.28.0.12, 


Trial tune_training_step_7b11e5a4 errored after 0 iterations at 2024-04-24 02:19:11. Total running time: 1hr 29min 50s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_7b11e5a4_22_batch_size=128,embedding_size=512,epochs=10,hidden_size=128,is_bidirectional=False,linear_layers=2,_2024-04-24_02-18-59/error.txt

Trial tune_training_step_62ac763c started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_62ac763c config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                            11 |
| hidden_size                                      256 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:19:20,110	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_62ac763c
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39333, ip=172.28.0.12, 


Trial tune_training_step_62ac763c errored after 0 iterations at 2024-04-24 02:19:20. Total running time: 1hr 29min 58s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_62ac763c_23_batch_size=128,embedding_size=512,epochs=11,hidden_size=256,is_bidirectional=False,linear_layers=2,_2024-04-24_02-19-08/error.txt

Trial tune_training_step_27d15d68 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_27d15d68 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             9 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:19:28,292	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_27d15d68
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39463, ip=172.28.0.12, 


Trial tune_training_step_27d15d68 errored after 0 iterations at 2024-04-24 02:19:28. Total running time: 1hr 30min 7s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_27d15d68_24_batch_size=128,embedding_size=512,epochs=9,hidden_size=128,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-19-16/error.txt

Trial status: 16 TERMINATED | 3 RUNNING | 5 ERROR | 1 PENDING
Current time: 2024-04-24 02:19:30. Total running time: 1hr 30min 9s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_s

2024-04-24 02:19:36,135	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_4ebeb881
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39554, ip=172.28.0.12, 


Trial tune_training_step_4ebeb881 errored after 0 iterations at 2024-04-24 02:19:36. Total running time: 1hr 30min 15s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_4ebeb881_25_batch_size=128,embedding_size=512,epochs=11,hidden_size=256,is_bidirectional=False,linear_layers=2,_2024-04-24_02-19-25/error.txt

Trial tune_training_step_8fe06dfc started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_8fe06dfc config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             2 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:19:44,259	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_8fe06dfc
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39645, ip=172.28.0.12, 


Trial tune_training_step_8fe06dfc errored after 0 iterations at 2024-04-24 02:19:44. Total running time: 1hr 30min 23s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_8fe06dfc_26_batch_size=128,embedding_size=512,epochs=2,hidden_size=128,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-19-33/error.txt

Trial tune_training_step_404edc2b started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_404edc2b config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             9 |
| hidden_size                                      768 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:19:52,150	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_404edc2b
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39736, ip=172.28.0.12, 


Trial tune_training_step_404edc2b errored after 0 iterations at 2024-04-24 02:19:52. Total running time: 1hr 30min 31s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_404edc2b_27_batch_size=128,embedding_size=512,epochs=9,hidden_size=768,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-19-41/error.txt

Trial tune_training_step_905be566 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_905be566 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             6 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:20:00,312	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_905be566
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39828, ip=172.28.0.12, 


Trial tune_training_step_905be566 errored after 0 iterations at 2024-04-24 02:20:00. Total running time: 1hr 30min 39s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_905be566_28_batch_size=128,embedding_size=512,epochs=6,hidden_size=128,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-19-49/error.txt

Trial status: 16 TERMINATED | 3 RUNNING | 9 ERROR | 1 PENDING
Current time: 2024-04-24 02:20:00. Total running time: 1hr 30min 39s
Logical resource usage: 6.4/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding

2024-04-24 02:20:08,119	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_2c7abee9
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=39919, ip=172.28.0.12, 


Trial tune_training_step_2c7abee9 errored after 0 iterations at 2024-04-24 02:20:08. Total running time: 1hr 30min 46s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_2c7abee9_29_batch_size=128,embedding_size=512,epochs=9,hidden_size=768,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-19-57/error.txt

Trial tune_training_step_bc38a942 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_bc38a942 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                            11 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:20:16,293	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_bc38a942
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=40012, ip=172.28.0.12, 


Trial tune_training_step_bc38a942 errored after 0 iterations at 2024-04-24 02:20:16. Total running time: 1hr 30min 55s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_bc38a942_30_batch_size=128,embedding_size=512,epochs=11,hidden_size=128,is_bidirectional=False,linear_layers=2,_2024-04-24_02-20-05/error.txt

Trial tune_training_step_b64136ed started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_b64136ed config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                            10 |
| hidden_size                                      768 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:20:24,194	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_b64136ed
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=40101, ip=172.28.0.12, 


Trial tune_training_step_b64136ed errored after 0 iterations at 2024-04-24 02:20:24. Total running time: 1hr 31min 3s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_b64136ed_31_batch_size=128,embedding_size=512,epochs=10,hidden_size=768,is_bidirectional=False,linear_layers=2,_2024-04-24_02-20-13/error.txt

Trial tune_training_step_5b3543ab started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_5b3543ab config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                            11 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2 

2024-04-24 02:20:32,360	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_5b3543ab
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=40190, ip=172.28.0.12, 


Trial tune_training_step_5b3543ab errored after 0 iterations at 2024-04-24 02:20:32. Total running time: 1hr 31min 11s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_5b3543ab_32_batch_size=128,embedding_size=512,epochs=11,hidden_size=128,is_bidirectional=False,linear_layers=2,_2024-04-24_02-20-21/error.txt

Trial tune_training_step_d06455d1 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_d06455d1 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             5 |
| hidden_size                                      256 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:20:40,254	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_d06455d1
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=40278, ip=172.28.0.12, 


Trial tune_training_step_d06455d1 errored after 0 iterations at 2024-04-24 02:20:40. Total running time: 1hr 31min 19s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_d06455d1_33_batch_size=128,embedding_size=512,epochs=5,hidden_size=256,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-20-29/error.txt

Trial tune_training_step_2b191b98 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_2b191b98 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             2 |
| hidden_size                                      128 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:20:48,198	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_2b191b98
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=40369, ip=172.28.0.12, 


Trial tune_training_step_2b191b98 errored after 0 iterations at 2024-04-24 02:20:48. Total running time: 1hr 31min 27s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_2b191b98_34_batch_size=128,embedding_size=512,epochs=2,hidden_size=128,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-20-37/error.txt

Trial tune_training_step_3da457e6 started with configuration:
+------------------------------------------------------+
| Trial tune_training_step_3da457e6 config             |
+------------------------------------------------------+
| batch_size                                       128 |
| embedding_size                                   512 |
| epochs                                             5 |
| hidden_size                                      256 |
| is_bidirectional                                   0 |
| linear_layers                                      2

2024-04-24 02:20:56,117	ERROR tune_controller.py:1332 -- Trial task failed for trial tune_training_step_3da457e6
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=40462, ip=172.28.0.12, 


Trial tune_training_step_3da457e6 errored after 0 iterations at 2024-04-24 02:20:56. Total running time: 1hr 31min 34s
Error file: /tmp/ray/session_2024-04-24_00-49-15_723019_212/artifacts/2024-04-24_00-49-20/tune_training_step_2024-04-24_00-49-15/driver_artifacts/tune_training_step_3da457e6_35_batch_size=128,embedding_size=512,epochs=5,hidden_size=256,is_bidirectional=False,linear_layers=2,l_2024-04-24_02-20-45/error.txt

Trial status: 16 TERMINATED | 3 RUNNING | 16 ERROR
Current time: 2024-04-24 02:21:00. Total running time: 1hr 31min 39s
Logical resource usage: 4.800000000000001/16 CPUs, 0.75/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embed

[36m(tune_training_step pid=29001)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_6cdb1b3f_15_batch_size=8,embedding_size=1024,epochs=14,hidden_size=384,is_bidirectional=True,linear_layers=1,lr_2024-04-24_01-39-55/checkpoint_000001)


Trial status: 16 TERMINATED | 3 RUNNING | 16 ERROR
Current time: 2024-04-24 02:27:01. Total running time: 1hr 37min 40s
Logical resource usage: 4.800000000000001/16 CPUs, 0.75/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5            

[36m(tune_training_step pid=29001)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_6cdb1b3f_15_batch_size=8,embedding_size=1024,epochs=14,hidden_size=384,is_bidirectional=True,linear_layers=1,lr_2024-04-24_01-39-55/checkpoint_000002)


Trial status: 16 TERMINATED | 3 RUNNING | 16 ERROR
Current time: 2024-04-24 02:52:35. Total running time: 2hr 3min 14s
Logical resource usage: 4.800000000000001/16 CPUs, 0.75/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5             

[36m(tune_training_step pid=23448)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_5e105fdc_11_batch_size=8,embedding_size=1024,epochs=4,hidden_size=256,is_bidirectional=True,linear_layers=2,lr=_2024-04-24_01-19-21/checkpoint_000000)


Trial status: 16 TERMINATED | 3 RUNNING | 16 ERROR
Current time: 2024-04-24 03:11:08. Total running time: 2hr 21min 47s
Logical resource usage: 4.800000000000001/16 CPUs, 0.75/1 GPUs (0.0/1.0 accelerator_type:L4)
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                    status                lr     batch_size   is_bidirectional       epochs     linear_layers     rnn_layers     embedding_size     hidden_size     iter     total time (s)      score |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| tune_training_step_5e105fdc   RUNNING      0.0994704                8   True                        4                 2              5            

[36m(tune_training_step pid=29001)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_6cdb1b3f_15_batch_size=8,embedding_size=1024,epochs=14,hidden_size=384,is_bidirectional=True,linear_layers=1,lr_2024-04-24_01-39-55/checkpoint_000003)
[36m(tune_training_step pid=38589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_eeedd63c_20_batch_size=16,embedding_size=1024,epochs=3,hidden_size=128,is_bidirectional=True,linear_layers=4,lr_2024-04-24_01-55-03/checkpoint_000000)



Trial tune_training_step_eeedd63c completed after 3 iterations at 2024-04-24 03:18:05. Total running time: 2hr 28min 44s
+----------------------------------------------------------------+
| Trial tune_training_step_eeedd63c result                       |
+----------------------------------------------------------------+
| checkpoint_dir_name                          checkpoint_000000 |
| time_this_iter_s                                    1222.73959 |
| time_total_s                                        3663.81842 |
| training_iteration                                           3 |
| score                                                  0.49802 |
+----------------------------------------------------------------+

Trial status: 17 TERMINATED | 2 RUNNING | 16 ERROR
Current time: 2024-04-24 03:18:09. Total running time: 2hr 28min 48s
Logical resource usage: 3.2/16 CPUs, 0.5/1 GPUs (0.0/1.0 accelerator_type:L4)
+---------------------------------------------------------------------------

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-04-24 03:33:43,874	INFO tune.py:1021 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/tune_training_step_2024-04-24_00-49-15' in 0.0163s.
2024-04-24 03:33:43,893	ERROR tune.py:1049 -- Trials did not complete: [tune_training_step_2f0bcead, tune_training_step_86f0eda5, tune_training_step_7b11e5a4, tune_training_step_62ac763c, tune_training_step_27d15d68, tune_training_step_4ebeb881, tune_training_step_8fe06dfc, tune_training_step_404edc2b, tune_training_step_905be566, tune_training_step_2c7abee9, tune_training_step_bc38a942, tune_training_step_b64136ed, tune_training_step_5b3543ab, tune_training_step_d06455d1, tune_training_step_2b191b98, tune_trai


Trial tune_training_step_5e105fdc completed after 4 iterations at 2024-04-24 03:33:43. Total running time: 2hr 44min 22s
+------------------------------------------------------+
| Trial tune_training_step_5e105fdc result             |
+------------------------------------------------------+
| checkpoint_dir_name                                  |
| time_this_iter_s                             1363.02 |
| time_total_s                                 8056.74 |
| training_iteration                                 4 |
| score                                        0.50198 |
+------------------------------------------------------+

Trial status: 19 TERMINATED | 16 ERROR
Current time: 2024-04-24 03:33:43. Total running time: 2hr 44min 22s
Logical resource usage: 1.6/16 CPUs, 0.25/1 GPUs (0.0/1.0 accelerator_type:L4)
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Model Evaluation

In [12]:
best_result = result.get_best_result("score", mode="max")
with best_result.checkpoint.as_directory() as checkpoint_dir:
    state_dict = torch.load(os.path.join(checkpoint_dir, "model.pth"))

print(best_result)

Result(
  metrics={'score': 0.8920048773288727},
  path='/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2d98e058_6_batch_size=128,embedding_size=512,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-50-45',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/root/ray_results/tune_training_step_2024-04-24_00-49-15/tune_training_step_2d98e058_6_batch_size=128,embedding_size=512,epochs=12,hidden_size=128,is_bidirectional=True,linear_layers=2,lr_2024-04-24_00-50-45/checkpoint_000003)
)


In [16]:
chosen_embedding = embeddings_512
coded_embeddings = torch.FloatTensor(chosen_embedding.wv.vectors).to(DEVICE)
vocab = chosen_embedding.wv.key_to_index
model = TwoLinearLayerGruRNN(512, 1, 512, coded_embeddings, vocab, 2, is_bidirectional=True).to(DEVICE)
model.load_state_dict(state_dict)


<All keys matched successfully>

In [21]:
model.eval()
torch.set_default_device(DEVICE)
torch.set_grad_enabled(False)
num_correct = 0
loader = DataLoader(test_samples, batch_size=64, generator=torch.Generator(device=DEVICE))

answers = torch.Tensor().to(DEVICE)
gold_standard = torch.Tensor().to(DEVICE)

for index, value in enumerate(loader):
    data, labels = value
    labels = labels.reshape(-1,1)
    labels = labels.type(torch.FloatTensor)
    labels = labels.to(DEVICE)

    outputs = model(data)

    answers = torch.cat((answers, outputs))
    gold_standard = torch.cat((gold_standard, labels))


torch.set_grad_enabled(True)
answers = torch.sigmoid(answers)
accuracy = torchmetrics.Accuracy(task="binary")
mcc = torchmetrics.MatthewsCorrCoef(task="binary")
f1_score = torchmetrics.F1Score(task="binary")
print(f"Accuracy {accuracy(answers, gold_standard)}")
print(f"MCC {mcc(answers, gold_standard)}")
print(f"F1 {f1_score(answers, gold_standard)}")


Accuracy 0.9399999976158142
MCC 0.8864052295684814
F1 0.936170220375061
