Google drive

In [11]:
from google.colab import drive
drive.mount('/content/drive')

path_to_drive = "/content/drive/MyDrive/Colab Notebooks"

Mounted at /content/drive


## spec

In [1]:
!nvidia-smi

Mon Sep  6 07:02:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Get GitHub repository files and install python packages

In [2]:
!git clone https://github.com/Team-M1/badwords-classifier-train -b WJ
%cd badwords-classifier-train
!pip install -r requirements.txt

Cloning into 'badwords-classifier-train'...
remote: Enumerating objects: 257, done.[K
remote: Counting objects: 100% (257/257), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 257 (delta 142), reused 154 (delta 40), pack-reused 0[K
Receiving objects: 100% (257/257), 2.95 MiB | 13.25 MiB/s, done.
Resolving deltas: 100% (142/142), done.
/content/badwords-classifier-train
Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 5.5 MB/s 
Collecting torchmetrics
  Downloading torchmetrics-0.5.1-py3-none-any.whl (282 kB)
[K     |████████████████████████████████| 282 kB 35.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 33.5 MB/s 
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.4.5-py3-none-any.whl (919 kB)
[K     |████████████████████████████████| 919 kB 20.3 MB/s 
[?25hCollec

## model and tokenizer



In [3]:
model_config = {
    "num_labels": 3,
    "id2label": {0: 0, 1: 1, 2: 2},
    "label2id": {0: 0, 1: 1, 2: 2}
}

In [4]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer

# model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator", **model_config)
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/458 [00:00<?, ?B/s]

## datasets

In [5]:
from data_loader import get_data_loaders

train_data, val_data, test_data = get_data_loaders(tokenizer, return_loader=False)

Using custom data configuration default-cfa12129877ac758


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-cfa12129877ac758/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-cfa12129877ac758/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


  0%|          | 0/40242 [00:00<?, ?ex/s]

  0%|          | 0/4472 [00:00<?, ?ex/s]

  0%|          | 0/5000 [00:00<?, ?ex/s]

## metric

In [6]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro")

    return {"accuracy": accuracy, "f1": f1}

## Do Train

In [7]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
from transformers import set_seed

set_seed(42)

In [12]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir=f'{path_to_drive}/koelectra-small-v3-discriminator-test',
    seed=42,
    num_train_epochs=200,
    learning_rate=1e-4,
    weight_decay=0.0,
    gradient_accumulation_steps=1,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    # checkpoint
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=1000,

    # evaluation
    evaluation_strategy='steps',
    eval_steps=1000,
    metric_for_best_model="f1",

    # early stopping
    load_best_model_at_end=True
)

In [11]:
from transformers.optimization import Adafactor, AdafactorSchedule

optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
scheduler = AdafactorSchedule(optimizer)

In [13]:
from trainer import ImbalancedSamplerTrainer

def model_init():
    return ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator", **model_config, return_dict=True)

trainer = ImbalancedSamplerTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    # optimizers=(optimizer, scheduler)
)

loading configuration file https://huggingface.co/monologg/koelectra-small-v3-discriminator/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/bd0f09888c5a5619ddb9de81d4a9936a94e5f45064f9a23ba6d39241ceebce02.d2485d28e5c07ca60bfa4fe84af673e0df83401e5c56bcdd991878cb4966eb34
Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": 0,
    "1": 1,
    "2": 2
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "0": 0,
    "1": 1,
    "2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary

Downloading:   0%|          | 0.00/56.6M [00:00<?, ?B/s]

storing https://huggingface.co/monologg/koelectra-small-v3-discriminator/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/052bc3ecf8c8484f1519650794b32b6d2c70750bcba71d1f763951514b5cf0c8.84bb33167d2e89d46f4cde129b2f4c447618ac33ac46f2012ee9e5e706fec112
creating metadata file for /root/.cache/huggingface/transformers/052bc3ecf8c8484f1519650794b32b6d2c70750bcba71d1f763951514b5cf0c8.84bb33167d2e89d46f4cde129b2f4c447618ac33ac46f2012ee9e5e706fec112
loading weights file https://huggingface.co/monologg/koelectra-small-v3-discriminator/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/052bc3ecf8c8484f1519650794b32b6d2c70750bcba71d1f763951514b5cf0c8.84bb33167d2e89d46f4cde129b2f4c447618ac33ac46f2012ee9e5e706fec112
Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.

# install ray

In [14]:
!pip install "ray[tune]" transformers datasets scipy sklearn torch

Collecting ray[tune]
  Downloading ray-1.6.0-cp37-cp37m-manylinux2014_x86_64.whl (49.6 MB)
[K     |████████████████████████████████| 49.6 MB 6.2 kB/s 
Collecting redis>=3.5.0
  Downloading redis-3.5.3-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 476 kB/s 
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 55.3 MB/s 
Installing collected packages: redis, tensorboardX, ray
Successfully installed ray-1.6.0 redis-3.5.3 tensorboardX-2.4


# haperparameter search

In [None]:
trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    n_trials=10 # number of trials
)

No `resources_per_trial` arg was passed into `hyperparameter_search`. Setting it to a default value of 1 CPU and 1 GPU for each trial.


== Status ==
Memory usage on this node: 2.2/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/_objective_2021-09-06_07-07-38
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+------------------------+----------+-------+-----------------+--------------------+-------------------------------+----------+
| Trial name             | status   | loc   |   learning_rate |   num_train_epochs |   per_device_train_batch_size |     seed |
|------------------------+----------+-------+-----------------+--------------------+-------------------------------+----------|
| _objective_1e810_00000 | RUNNING  |       |     5.61152e-06 |                  5 |                            64 |  8.15396 |
| _objective_1e810_00001 | PENDING  |       |     1.56207e-05 |                  2 |                            16 |  7.08379 |
| _objective_1e810_00002 | PENDING  |       |  

[2m[36m(pid=478)[0m Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
[2m[36m(pid=478)[0m - This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(pid=478)[0m - This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(pid=478)[0m Some weights of ElectraForSequenceClassification were not initialized from

Result for _objective_1e810_00000:
  {}
  


[2m[36m(pid=477)[0m Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
[2m[36m(pid=477)[0m - This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(pid=477)[0m - This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(pid=477)[0m Some weights of ElectraForSequenceClassification were not initialized from

[2m[36m(pid=477)[0m {'loss': 0.9483, 'learning_rate': 1.4068557976550074e-05, 'epoch': 0.2}


 10%|▉         | 501/5032 [05:32<49:50,  1.52it/s]
 10%|▉         | 502/5032 [05:33<49:58,  1.51it/s]
 10%|▉         | 503/5032 [05:34<50:05,  1.51it/s]
 10%|█         | 504/5032 [05:34<50:00,  1.51it/s]
 10%|█         | 505/5032 [05:35<50:10,  1.50it/s]
 10%|█         | 506/5032 [05:36<50:24,  1.50it/s]
 10%|█         | 507/5032 [05:36<50:12,  1.50it/s]
 10%|█         | 508/5032 [05:37<50:06,  1.50it/s]
 10%|█         | 509/5032 [05:38<50:09,  1.50it/s]
 10%|█         | 510/5032 [05:38<50:04,  1.51it/s]
 10%|█         | 511/5032 [05:39<50:06,  1.50it/s]
 10%|█         | 512/5032 [05:40<50:08,  1.50it/s]
 10%|█         | 513/5032 [05:40<49:49,  1.51it/s]
 10%|█         | 514/5032 [05:41<50:03,  1.50it/s]
 10%|█         | 515/5032 [05:42<49:57,  1.51it/s]
 10%|█         | 516/5032 [05:42<49:52,  1.51it/s]
 10%|█         | 517/5032 [05:43<49:55,  1.51it/s]
 10%|█         | 518/5032 [05:44<49:39,  1.51it/s]
 10%|█         | 519/5032 [05:44<49:39,  1.51it/s]
 10%|█         | 520/5032 [05:4

[2m[36m(pid=477)[0m {'loss': 0.6894, 'learning_rate': 1.2516422277460261e-05, 'epoch': 0.4}


[2m[36m(pid=477)[0m 
  0%|          | 0/140 [00:00<?, ?it/s][A
[2m[36m(pid=477)[0m 
  1%|▏         | 2/140 [00:00<00:30,  4.50it/s][A
[2m[36m(pid=477)[0m 
  2%|▏         | 3/140 [00:00<00:42,  3.24it/s][A
[2m[36m(pid=477)[0m 
  3%|▎         | 4/140 [00:01<00:48,  2.81it/s][A
[2m[36m(pid=477)[0m 
  4%|▎         | 5/140 [00:01<00:51,  2.61it/s][A
[2m[36m(pid=477)[0m 
  4%|▍         | 6/140 [00:02<00:53,  2.50it/s][A
[2m[36m(pid=477)[0m 
  5%|▌         | 7/140 [00:02<00:54,  2.44it/s][A
[2m[36m(pid=477)[0m 
  6%|▌         | 8/140 [00:03<00:54,  2.40it/s][A
[2m[36m(pid=477)[0m 
  6%|▋         | 9/140 [00:03<00:55,  2.37it/s][A
[2m[36m(pid=477)[0m 
  7%|▋         | 10/140 [00:03<00:55,  2.36it/s][A
[2m[36m(pid=477)[0m 
  8%|▊         | 11/140 [00:04<00:55,  2.34it/s][A
[2m[36m(pid=477)[0m 
  9%|▊         | 12/140 [00:04<00:55,  2.33it/s][A
[2m[36m(pid=477)[0m 
  9%|▉         | 13/140 [00:05<00:54,  2.32it/s][A
[2m[36m(pid=477)[0m 
 10%|

Result for _objective_1e810_00001:
  date: 2021-09-06_07-19-56
  done: false
  epoch: 0.4
  eval_accuracy: 0.8477191413237924
  eval_f1: 0.6094269080977907
  eval_loss: 0.4467000663280487
  eval_runtime: 60.7869
  eval_samples_per_second: 73.569
  eval_steps_per_second: 2.303
  experiment_id: ac070bd67d2642f097c5da25ca5c43d8
  hostname: c91990d96aa3
  iterations_since_restore: 1
  node_ip: 172.28.0.2
  objective: 1.457146049421583
  pid: 477
  time_since_restore: 727.6657283306122
  time_this_iter_s: 727.6657283306122
  time_total_s: 727.6657283306122
  timestamp: 1630912796
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 1e810_00001
  
== Status ==
Memory usage on this node: 3.3/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/_objective_2021-09-06_07-07-38
Number of trials: 10/10 (1 ERROR, 8 PENDING, 1 RUNNING)
+---------------

[2m[36m(pid=477)[0m 
[2m[36m(pid=477)[0m 100%|██████████| 140/140 [01:00<00:00,  2.47it/s][A                                                   
[2m[36m(pid=477)[0m                                                  [A 20%|█▉        | 1000/5032 [12:03<44:29,  1.51it/s]
[2m[36m(pid=477)[0m 100%|██████████| 140/140 [01:00<00:00,  2.47it/s][A
                                                 [A
 20%|█▉        | 1001/5032 [12:05<21:29:39, 19.20s/it]
 20%|█▉        | 1002/5032 [12:06<15:16:02, 13.64s/it]
 20%|█▉        | 1003/5032 [12:06<10:54:19,  9.74s/it]
 20%|█▉        | 1004/5032 [12:07<7:51:08,  7.02s/it] 
 20%|█▉        | 1005/5032 [12:08<5:43:23,  5.12s/it]
 20%|█▉        | 1006/5032 [12:08<4:13:33,  3.78s/it]
 20%|██        | 1007/5032 [12:09<3:10:44,  2.84s/it]
 20%|██        | 1008/5032 [12:10<2:27:43,  2.20s/it]
 20%|██        | 1009/5032 [12:10<1:57:52,  1.76s/it]
 20%|██        | 1010/5032 [12:11<1:37:09,  1.45s/it]
 20%|██        | 1011/5032 [12:12<1:21:16,

[2m[36m(pid=477)[0m {'loss': 0.568, 'learning_rate': 1.0964286578370445e-05, 'epoch': 0.6}


 30%|██▉       | 1501/5032 [17:35<39:03,  1.51it/s]
 30%|██▉       | 1502/5032 [17:36<39:02,  1.51it/s]
 30%|██▉       | 1503/5032 [17:36<38:55,  1.51it/s]
 30%|██▉       | 1504/5032 [17:37<38:55,  1.51it/s]
 30%|██▉       | 1505/5032 [17:38<38:57,  1.51it/s]
 30%|██▉       | 1506/5032 [17:38<39:00,  1.51it/s]
 30%|██▉       | 1507/5032 [17:39<38:51,  1.51it/s]
 30%|██▉       | 1508/5032 [17:40<38:47,  1.51it/s]
 30%|██▉       | 1509/5032 [17:40<38:42,  1.52it/s]
 30%|███       | 1510/5032 [17:41<38:36,  1.52it/s]
 30%|███       | 1511/5032 [17:42<38:41,  1.52it/s]
 30%|███       | 1512/5032 [17:42<38:44,  1.51it/s]
 30%|███       | 1513/5032 [17:43<38:45,  1.51it/s]
 30%|███       | 1514/5032 [17:44<38:45,  1.51it/s]
 30%|███       | 1515/5032 [17:44<38:44,  1.51it/s]
 30%|███       | 1516/5032 [17:45<38:42,  1.51it/s]
 30%|███       | 1517/5032 [17:46<38:44,  1.51it/s]
 30%|███       | 1518/5032 [17:46<38:44,  1.51it/s]
 30%|███       | 1519/5032 [17:47<38:47,  1.51it/s]
 30%|███    

[2m[36m(pid=477)[0m {'loss': 0.4901, 'learning_rate': 9.412150879280633e-06, 'epoch': 0.79}


[2m[36m(pid=477)[0m 
  0%|          | 0/140 [00:00<?, ?it/s][A
[2m[36m(pid=477)[0m 
  1%|▏         | 2/140 [00:00<00:30,  4.52it/s][A
[2m[36m(pid=477)[0m 
  2%|▏         | 3/140 [00:00<00:42,  3.23it/s][A
[2m[36m(pid=477)[0m 
  3%|▎         | 4/140 [00:01<00:48,  2.80it/s][A
[2m[36m(pid=477)[0m 
  4%|▎         | 5/140 [00:01<00:51,  2.61it/s][A
[2m[36m(pid=477)[0m 
  4%|▍         | 6/140 [00:02<00:53,  2.50it/s][A
[2m[36m(pid=477)[0m 
  5%|▌         | 7/140 [00:02<00:54,  2.43it/s][A
[2m[36m(pid=477)[0m 
  6%|▌         | 8/140 [00:03<00:55,  2.39it/s][A
[2m[36m(pid=477)[0m 
  6%|▋         | 9/140 [00:03<00:55,  2.37it/s][A
[2m[36m(pid=477)[0m 
  7%|▋         | 10/140 [00:03<00:55,  2.34it/s][A
[2m[36m(pid=477)[0m 
  8%|▊         | 11/140 [00:04<00:55,  2.33it/s][A
[2m[36m(pid=477)[0m 
  9%|▊         | 12/140 [00:04<00:55,  2.31it/s][A
[2m[36m(pid=477)[0m 
  9%|▉         | 13/140 [00:05<00:55,  2.31it/s][A
[2m[36m(pid=477)[0m 
 10%|

Result for _objective_1e810_00001:
  date: 2021-09-06_07-31-58
  done: false
  epoch: 0.79
  eval_accuracy: 0.8506261180679785
  eval_f1: 0.6306653433942117
  eval_loss: 0.4035838842391968
  eval_runtime: 60.7622
  eval_samples_per_second: 73.598
  eval_steps_per_second: 2.304
  experiment_id: ac070bd67d2642f097c5da25ca5c43d8
  hostname: c91990d96aa3
  iterations_since_restore: 2
  node_ip: 172.28.0.2
  objective: 1.4812914614621902
  pid: 477
  time_since_restore: 1449.84845662117
  time_this_iter_s: 722.1827282905579
  time_total_s: 1449.84845662117
  timestamp: 1630913518
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 1e810_00001
  
== Status ==
Memory usage on this node: 3.3/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/_objective_2021-09-06_07-07-38
Number of trials: 10/10 (1 ERROR, 8 PENDING, 1 RUNNING)
+---------------

[2m[36m(pid=477)[0m 
[2m[36m(pid=477)[0m 100%|██████████| 140/140 [01:00<00:00,  2.49it/s][A                                                   
[2m[36m(pid=477)[0m                                                  [A 40%|███▉      | 2000/5032 [24:05<33:21,  1.51it/s]
[2m[36m(pid=477)[0m 100%|██████████| 140/140 [01:00<00:00,  2.49it/s][A
                                                 [A
 40%|███▉      | 2001/5032 [24:07<16:08:10, 19.17s/it]
 40%|███▉      | 2002/5032 [24:08<11:27:22, 13.61s/it]
 40%|███▉      | 2003/5032 [24:08<8:10:54,  9.72s/it] 
 40%|███▉      | 2004/5032 [24:09<5:53:26,  7.00s/it]
 40%|███▉      | 2005/5032 [24:10<4:17:16,  5.10s/it]
 40%|███▉      | 2006/5032 [24:10<3:10:04,  3.77s/it]
 40%|███▉      | 2007/5032 [24:11<2:22:59,  2.84s/it]
 40%|███▉      | 2008/5032 [24:12<1:50:04,  2.18s/it]
 40%|███▉      | 2009/5032 [24:12<1:27:47,  1.74s/it]
 40%|███▉      | 2010/5032 [24:13<1:12:33,  1.44s/it]
 40%|███▉      | 2011/5032 [24:14<1:00:46, 

[2m[36m(pid=477)[0m {'loss': 0.4424, 'learning_rate': 7.860015180190818e-06, 'epoch': 0.99}


 50%|████▉     | 2501/5032 [29:38<27:55,  1.51it/s]
 50%|████▉     | 2502/5032 [29:39<27:48,  1.52it/s]
 50%|████▉     | 2503/5032 [29:39<27:44,  1.52it/s]
 50%|████▉     | 2504/5032 [29:40<27:42,  1.52it/s]
 50%|████▉     | 2505/5032 [29:41<27:49,  1.51it/s]
 50%|████▉     | 2506/5032 [29:41<27:47,  1.52it/s]
 50%|████▉     | 2507/5032 [29:42<27:45,  1.52it/s]
 50%|████▉     | 2508/5032 [29:43<27:42,  1.52it/s]
 50%|████▉     | 2509/5032 [29:43<27:37,  1.52it/s]
 50%|████▉     | 2510/5032 [29:44<29:26,  1.43it/s]
 50%|████▉     | 2511/5032 [29:45<28:52,  1.45it/s]
 50%|████▉     | 2512/5032 [29:45<28:32,  1.47it/s]
 50%|████▉     | 2513/5032 [29:46<28:17,  1.48it/s]
 50%|████▉     | 2514/5032 [29:47<28:06,  1.49it/s]
 50%|████▉     | 2515/5032 [29:47<28:02,  1.50it/s]
 50%|█████     | 2516/5032 [29:48<21:34,  1.94it/s]
 50%|█████     | 2517/5032 [29:48<23:44,  1.77it/s]
 50%|█████     | 2518/5032 [29:49<24:55,  1.68it/s]
 50%|█████     | 2519/5032 [29:50<25:44,  1.63it/s]
 50%|█████  

[2m[36m(pid=477)[0m {'loss': 0.4188, 'learning_rate': 6.307879481101004e-06, 'epoch': 1.19}


[2m[36m(pid=477)[0m 
  0%|          | 0/140 [00:00<?, ?it/s][A
[2m[36m(pid=477)[0m 
  1%|▏         | 2/140 [00:00<00:31,  4.44it/s][A
[2m[36m(pid=477)[0m 
  2%|▏         | 3/140 [00:00<00:43,  3.16it/s][A
[2m[36m(pid=477)[0m 
  3%|▎         | 4/140 [00:01<00:48,  2.78it/s][A
[2m[36m(pid=477)[0m 
  4%|▎         | 5/140 [00:01<00:52,  2.59it/s][A
[2m[36m(pid=477)[0m 
  4%|▍         | 6/140 [00:02<00:54,  2.48it/s][A
[2m[36m(pid=477)[0m 
  5%|▌         | 7/140 [00:02<00:55,  2.42it/s][A
[2m[36m(pid=477)[0m 
  6%|▌         | 8/140 [00:03<00:55,  2.38it/s][A
[2m[36m(pid=477)[0m 
  6%|▋         | 9/140 [00:03<00:55,  2.35it/s][A
[2m[36m(pid=477)[0m 
  7%|▋         | 10/140 [00:03<00:55,  2.33it/s][A
[2m[36m(pid=477)[0m 
  8%|▊         | 11/140 [00:04<00:55,  2.32it/s][A
[2m[36m(pid=477)[0m 
  9%|▊         | 12/140 [00:04<00:55,  2.31it/s][A
[2m[36m(pid=477)[0m 
  9%|▉         | 13/140 [00:05<00:55,  2.31it/s][A
[2m[36m(pid=477)[0m 
 10%|

Result for _objective_1e810_00001:
  date: 2021-09-06_07-44-03
  done: false
  epoch: 1.19
  eval_accuracy: 0.8423524150268337
  eval_f1: 0.6287487526967918
  eval_loss: 0.41695380210876465
  eval_runtime: 61.2026
  eval_samples_per_second: 73.069
  eval_steps_per_second: 2.287
  experiment_id: ac070bd67d2642f097c5da25ca5c43d8
  hostname: c91990d96aa3
  iterations_since_restore: 3
  node_ip: 172.28.0.2
  objective: 1.4711011677236254
  pid: 477
  time_since_restore: 2174.494803905487
  time_this_iter_s: 724.646347284317
  time_total_s: 2174.494803905487
  timestamp: 1630914243
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 1e810_00001
  
== Status ==
Memory usage on this node: 3.3/12.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/2 CPUs, 1.0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/_objective_2021-09-06_07-07-38
Number of trials: 10/10 (1 ERROR, 8 PENDING, 1 RUNNING)
+-------------

[2m[36m(pid=477)[0m 
[2m[36m(pid=477)[0m 100%|██████████| 140/140 [01:00<00:00,  2.45it/s][A                                                   
[2m[36m(pid=477)[0m                                                  [A 60%|█████▉    | 3000/5032 [36:10<22:37,  1.50it/s]
[2m[36m(pid=477)[0m 100%|██████████| 140/140 [01:00<00:00,  2.45it/s][A
                                                 [A
 60%|█████▉    | 3001/5032 [36:12<10:54:35, 19.34s/it]
 60%|█████▉    | 3002/5032 [36:12<7:44:46, 13.74s/it] 
 60%|█████▉    | 3003/5032 [36:13<5:32:01,  9.82s/it]
 60%|█████▉    | 3004/5032 [36:14<3:59:01,  7.07s/it]
 60%|█████▉    | 3005/5032 [36:14<2:53:54,  5.15s/it]
 60%|█████▉    | 3006/5032 [36:15<2:08:37,  3.81s/it]
 60%|█████▉    | 3007/5032 [36:16<1:37:37,  2.89s/it]
 60%|█████▉    | 3008/5032 [36:17<1:15:15,  2.23s/it]
 60%|█████▉    | 3009/5032 [36:17<1:00:27,  1.79s/it]
 60%|█████▉    | 3010/5032 [36:18<49:57,  1.48s/it]  
 60%|█████▉    | 3011/5032 [36:19<41:47,  1.