### Accelerate memory estimates

In [1]:
!accelerate estimate-memory ibm-granite/granite-7b-base

Loading pretrained config for `ibm-granite/granite-7b-base` from `transformers`...
┌────────────────────────────────────────────────────┐
│Memory Usage for loading `ibm-granite/granite-7b-base`│
├───────┬─────────────┬──────────┬───────────────────┤
│ dtype │Largest Layer│Total Size│Training using Adam│
├───────┼─────────────┼──────────┼───────────────────┤
│float32│  774.03 MB  │ 24.68 GB │      98.71 GB     │
│float16│  387.02 MB  │ 12.34 GB │      49.35 GB     │
│  int8 │  193.51 MB  │ 6.17 GB  │      24.68 GB     │
│  int4 │   96.75 MB  │ 3.08 GB  │      12.34 GB     │
└───────┴─────────────┴──────────┴───────────────────┘


In [2]:
!accelerate estimate-memory databricks/dolly-v2-3b

Loading pretrained config for `databricks/dolly-v2-3b` from `transformers`...
┌────────────────────────────────────────────────────┐
│ Memory Usage for loading `databricks/dolly-v2-3b`  │
├───────┬─────────────┬──────────┬───────────────────┤
│ dtype │Largest Layer│Total Size│Training using Adam│
├───────┼─────────────┼──────────┼───────────────────┤
│float32│  491.02 MB  │ 9.88 GB  │      39.54 GB     │
│float16│  245.51 MB  │ 4.94 GB  │      19.77 GB     │
│  int8 │  122.75 MB  │ 2.47 GB  │      9.88 GB      │
│  int4 │   61.38 MB  │ 1.24 GB  │      4.94 GB      │
└───────┴─────────────┴──────────┴───────────────────┘


In [1]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, AdamW, get_scheduler
from transformers import DataCollatorWithPadding
from tqdm.auto import tqdm
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "databricks/dolly-v2-3b"
#model_path = "ibm-granite/granite-3b-code-base"
#model_path = "ibm-granite/granite-7b-base"



In [3]:
model = AutoModelForCausalLM.from_pretrained(model_path)
#optimizer = AdamW(model.parameters(), lr=3e-5)

import bitsandbytes as bnb
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)



In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_prompt(data_point):
    prompt = f""" Given a question and the possible answer choices, give the index of the right choice.
    ### Question
    {data_point['question']}
    ### Choices
    {data_point['choices']}
    ### Answer
    {data_point['answer']}
    """
    result = tokenizer(prompt,
                     padding="max_length",
                     max_length=512,
                     truncation=True)
    # Why is this necessary?
    result["labels"] = result["input_ids"].copy()
    return result

dataset = load_dataset("cais/mmlu", 'all')
small_train_dataset = dataset['auxiliary_train'].shuffle(seed=42).select(range(1000))
small_eval_dataset = dataset['validation'].shuffle(seed=42).select(range(200))

# I see a very weird behavior (scalar input IDs instead of max length vector) when I do batched=True in the map funciton
# Anyhow that's only for speeding up the map task
small_train_dataset_tokenized = small_train_dataset.map(tokenize_prompt,
                                                        remove_columns=['question', 'subject', 'choices', 'answer'])
small_eval_dataset_tokenized = small_eval_dataset.map(tokenize_prompt,
                                                      remove_columns=['question', 'subject', 'choices', 'answer'])

small_train_dataset_tokenized.set_format("torch")
small_eval_dataset_tokenized.set_format("torch")

data_collator = DataCollatorForLanguageModeling(tokenizer,
                                               mlm=False)

train_dataloader = DataLoader(small_train_dataset_tokenized, shuffle=True, batch_size=2, collate_fn=data_collator)
eval_dataloader = DataLoader(small_eval_dataset_tokenized, batch_size=2, collate_fn=data_collator)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
accelerator = Accelerator(mixed_precision="fp16")

train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)


In [7]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
eval_interval = 100
global_step = 0

progress_bar = tqdm(range(num_training_steps))
model.gradient_checkpointing_enable()
model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        print(loss)
        accelerator.backward(loss)
        accelerator.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        global_step += 1
        
        if global_step % eval_interval == 0:
            model.eval()
            eval_loss = 0
            for eval_batch in eval_dl:
                with torch.no_grad():
                    eval_outputs = model(**eval_batch)
                    eval_loss += eval_outputs.loss.item()
            eval_loss /= len(eval_dl)
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{global_step}], Eval Loss: {eval_loss}")
            
            model.train()

tensor(5.3460, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▎   | 956/1500 [08:28<04:06,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0163, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 957/1500 [08:28<04:06,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.3863, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 958/1500 [08:29<04:05,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(8.2036, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 959/1500 [08:29<04:04,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2566, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 960/1500 [08:30<04:04,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6892, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 961/1500 [08:30<04:03,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6882, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 962/1500 [08:31<04:03,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1809, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 963/1500 [08:31<04:02,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5856, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 964/1500 [08:31<04:02,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4742, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 965/1500 [08:32<04:01,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1772, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 966/1500 [08:32<04:00,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9228, device='cuda:0', grad_fn=<NllLossBackward0>)


 64%|██████▍   | 967/1500 [08:33<04:00,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1764, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 968/1500 [08:33<04:00,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0625, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 969/1500 [08:34<03:59,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8032, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 970/1500 [08:34<03:58,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.5435, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 971/1500 [08:35<03:58,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2723, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 972/1500 [08:35<03:58,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5319, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 973/1500 [08:36<03:58,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6549, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▍   | 974/1500 [08:36<03:57,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5907, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 975/1500 [08:36<03:57,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2701, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 976/1500 [08:37<03:57,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9935, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 977/1500 [08:37<03:56,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7441, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 978/1500 [08:38<03:55,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2211, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 979/1500 [08:38<03:55,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3278, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 980/1500 [08:39<03:55,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4572, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 981/1500 [08:39<03:54,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6133, device='cuda:0', grad_fn=<NllLossBackward0>)


 65%|██████▌   | 982/1500 [08:40<03:53,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7962, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 983/1500 [08:40<03:53,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0723, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 984/1500 [08:40<03:52,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8443, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 985/1500 [08:41<03:52,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9955, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 986/1500 [08:41<03:52,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0227, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 987/1500 [08:42<03:51,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5683, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 988/1500 [08:42<03:51,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6575, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 989/1500 [08:43<03:50,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0486, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 990/1500 [08:43<03:50,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3025, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 991/1500 [08:44<03:49,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0617, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 992/1500 [08:44<03:49,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5833, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▌   | 993/1500 [08:45<03:48,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8425, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▋   | 994/1500 [08:45<03:48,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6242, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▋   | 995/1500 [08:45<03:47,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7657, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▋   | 996/1500 [08:46<03:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5093, device='cuda:0', grad_fn=<NllLossBackward0>)


 66%|██████▋   | 997/1500 [08:46<03:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.2256, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 998/1500 [08:47<03:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4180, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 999/1500 [08:47<03:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3922, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1000/1500 [08:48<03:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch [2/3], Step [1000], Eval Loss: 6.240267558097839
tensor(6.0116, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1001/1500 [08:57<25:06,  3.02s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0450, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1002/1500 [08:57<18:40,  2.25s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.9843, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1003/1500 [08:58<14:10,  1.71s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5909, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1004/1500 [08:58<11:01,  1.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6195, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1005/1500 [08:59<08:48,  1.07s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8161, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1006/1500 [08:59<07:16,  1.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0874, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1007/1500 [08:59<06:11,  1.33it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3155, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1008/1500 [09:00<05:26,  1.51it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9602, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1009/1500 [09:00<04:54,  1.67it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6339, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1010/1500 [09:01<04:31,  1.80it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3021, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1011/1500 [09:01<04:16,  1.91it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3693, device='cuda:0', grad_fn=<NllLossBackward0>)


 67%|██████▋   | 1012/1500 [09:02<04:04,  1.99it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6367, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1013/1500 [09:02<03:56,  2.06it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5315, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1014/1500 [09:03<03:51,  2.10it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0755, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1015/1500 [09:03<03:47,  2.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3769, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1016/1500 [09:03<03:44,  2.16it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0000, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1017/1500 [09:04<03:42,  2.17it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4651, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1018/1500 [09:04<03:40,  2.19it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4833, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1019/1500 [09:05<03:39,  2.19it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2489, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1020/1500 [09:05<03:38,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7252, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1021/1500 [09:06<03:37,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9124, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1022/1500 [09:06<03:36,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8264, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1023/1500 [09:07<03:35,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9535, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1024/1500 [09:07<03:34,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3988, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1025/1500 [09:08<03:34,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5197, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1026/1500 [09:08<03:33,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6790, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 1027/1500 [09:08<03:33,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7859, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▊   | 1028/1500 [09:09<03:33,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2641, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▊   | 1029/1500 [09:09<03:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2889, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▊   | 1030/1500 [09:10<03:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1881, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▊   | 1031/1500 [09:10<03:31,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1580, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1032/1500 [09:11<03:31,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6404, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1033/1500 [09:11<03:31,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4557, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1034/1500 [09:12<03:30,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8423, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1035/1500 [09:12<03:30,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7891, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1036/1500 [09:13<03:29,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7345, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1037/1500 [09:13<03:29,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4994, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1038/1500 [09:13<03:28,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3379, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1039/1500 [09:14<03:28,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1937, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1040/1500 [09:14<03:28,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3048, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1041/1500 [09:15<03:27,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1559, device='cuda:0', grad_fn=<NllLossBackward0>)


 69%|██████▉   | 1042/1500 [09:15<03:27,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2173, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1043/1500 [09:16<03:26,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2712, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1044/1500 [09:16<03:25,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1411, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1045/1500 [09:17<03:25,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9578, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1046/1500 [09:17<03:25,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.8152, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1047/1500 [09:17<03:24,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0650, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1048/1500 [09:18<03:23,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0463, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|██████▉   | 1049/1500 [09:18<03:23,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1139, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1050/1500 [09:19<03:23,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7347, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1051/1500 [09:19<03:23,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8557, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1052/1500 [09:20<03:22,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7917, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1053/1500 [09:20<03:22,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9690, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1054/1500 [09:21<03:21,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0637, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1055/1500 [09:21<03:21,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5513, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1056/1500 [09:22<03:21,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1345, device='cuda:0', grad_fn=<NllLossBackward0>)


 70%|███████   | 1057/1500 [09:22<03:20,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7557, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1058/1500 [09:22<03:20,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9989, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1059/1500 [09:23<03:19,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8874, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1060/1500 [09:23<03:18,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5648, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1061/1500 [09:24<03:18,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1325, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1062/1500 [09:24<03:17,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6176, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1063/1500 [09:25<03:17,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9696, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1064/1500 [09:25<03:17,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3009, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1065/1500 [09:26<03:16,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0638, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1066/1500 [09:26<03:16,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.6089, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1067/1500 [09:27<03:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2886, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████   | 1068/1500 [09:27<03:14,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2268, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████▏  | 1069/1500 [09:27<03:14,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7925, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████▏  | 1070/1500 [09:28<03:13,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4549, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████▏  | 1071/1500 [09:28<03:13,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6375, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████▏  | 1072/1500 [09:29<03:12,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4358, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1073/1500 [09:29<03:12,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6575, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1074/1500 [09:30<03:11,  2.23it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1513, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1075/1500 [09:30<03:11,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1520, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1076/1500 [09:31<03:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8915, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1077/1500 [09:31<03:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3602, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1078/1500 [09:31<03:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2256, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1079/1500 [09:32<03:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8454, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1080/1500 [09:32<03:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.6213, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1081/1500 [09:33<03:08,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6378, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1082/1500 [09:33<03:08,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7414, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1083/1500 [09:34<03:07,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2635, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1084/1500 [09:34<03:07,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5608, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1085/1500 [09:35<03:07,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4789, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1086/1500 [09:35<03:06,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0672, device='cuda:0', grad_fn=<NllLossBackward0>)


 72%|███████▏  | 1087/1500 [09:36<03:05,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3044, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1088/1500 [09:36<03:05,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8345, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1089/1500 [09:36<03:05,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7953, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1090/1500 [09:37<03:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0421, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1091/1500 [09:37<03:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4816, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1092/1500 [09:38<03:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7285, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1093/1500 [09:38<03:03,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3095, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1094/1500 [09:39<03:03,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.9585, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1095/1500 [09:39<03:02,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0280, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1096/1500 [09:40<03:02,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7989, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1097/1500 [09:40<03:02,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7082, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1098/1500 [09:41<03:01,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5729, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1099/1500 [09:41<03:01,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9145, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1100/1500 [09:41<03:00,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch [3/3], Step [1100], Eval Loss: 6.126164798736572
tensor(6.2174, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1101/1500 [09:50<20:02,  3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4947, device='cuda:0', grad_fn=<NllLossBackward0>)


 73%|███████▎  | 1102/1500 [09:51<14:53,  2.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6984, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▎  | 1103/1500 [09:51<11:17,  1.71s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2671, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▎  | 1104/1500 [09:52<08:46,  1.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3550, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▎  | 1105/1500 [09:52<07:01,  1.07s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4051, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▎  | 1106/1500 [09:53<05:47,  1.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0459, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1107/1500 [09:53<04:56,  1.33it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3754, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1108/1500 [09:54<04:19,  1.51it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0068, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1109/1500 [09:54<03:54,  1.67it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8526, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1110/1500 [09:54<03:36,  1.80it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8063, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1111/1500 [09:55<03:23,  1.91it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1593, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1112/1500 [09:55<03:15,  1.99it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3663, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1113/1500 [09:56<03:08,  2.05it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5168, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1114/1500 [09:56<03:03,  2.10it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2899, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1115/1500 [09:57<03:00,  2.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7897, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1116/1500 [09:57<02:58,  2.15it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3623, device='cuda:0', grad_fn=<NllLossBackward0>)


 74%|███████▍  | 1117/1500 [09:58<02:56,  2.17it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1643, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1118/1500 [09:58<02:54,  2.18it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2133, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1119/1500 [09:59<02:53,  2.19it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0917, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1120/1500 [09:59<02:52,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3841, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1121/1500 [09:59<02:52,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2146, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1122/1500 [10:00<02:51,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7627, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1123/1500 [10:00<02:50,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8051, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▍  | 1124/1500 [10:01<02:50,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0253, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1125/1500 [10:01<02:49,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5855, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1126/1500 [10:02<02:48,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1993, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1127/1500 [10:02<02:48,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5548, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1128/1500 [10:03<02:47,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2643, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1129/1500 [10:03<02:47,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6008, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1130/1500 [10:03<02:47,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2830, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1131/1500 [10:04<02:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6263, device='cuda:0', grad_fn=<NllLossBackward0>)


 75%|███████▌  | 1132/1500 [10:04<02:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3717, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1133/1500 [10:05<02:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9334, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1134/1500 [10:05<02:45,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7938, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1135/1500 [10:06<02:44,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9097, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1136/1500 [10:06<02:44,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.4735, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1137/1500 [10:07<02:43,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5409, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1138/1500 [10:07<02:42,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7398, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1139/1500 [10:08<02:42,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7507, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1140/1500 [10:08<02:42,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4748, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1141/1500 [10:08<02:42,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3038, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1142/1500 [10:09<02:41,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4688, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▌  | 1143/1500 [10:09<02:41,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2996, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▋  | 1144/1500 [10:10<02:40,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4175, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▋  | 1145/1500 [10:10<02:40,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6375, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▋  | 1146/1500 [10:11<02:39,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0360, device='cuda:0', grad_fn=<NllLossBackward0>)


 76%|███████▋  | 1147/1500 [10:11<02:39,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.9277, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1148/1500 [10:12<02:39,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7271, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1149/1500 [10:12<02:38,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0707, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1150/1500 [10:13<02:37,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4915, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1151/1500 [10:13<02:37,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1761, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1152/1500 [10:13<02:36,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.9639, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1153/1500 [10:14<02:36,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0499, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1154/1500 [10:14<02:36,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2745, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1155/1500 [10:15<02:35,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.5134, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1156/1500 [10:15<02:35,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2265, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1157/1500 [10:16<02:34,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9434, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1158/1500 [10:16<02:34,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2630, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1159/1500 [10:17<02:33,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.1457, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1160/1500 [10:17<02:33,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0662, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1161/1500 [10:17<02:32,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0589, device='cuda:0', grad_fn=<NllLossBackward0>)


 77%|███████▋  | 1162/1500 [10:18<02:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4745, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1163/1500 [10:18<02:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2778, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1164/1500 [10:19<02:31,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9618, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1165/1500 [10:19<02:31,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7622, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1166/1500 [10:20<02:30,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1861, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1167/1500 [10:20<02:30,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6838, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1168/1500 [10:21<02:29,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2138, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1169/1500 [10:21<02:29,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6193, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1170/1500 [10:22<02:29,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8653, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1171/1500 [10:22<02:28,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6599, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1172/1500 [10:22<02:27,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1126, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1173/1500 [10:23<02:27,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4455, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1174/1500 [10:23<02:27,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2782, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1175/1500 [10:24<02:26,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7090, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1176/1500 [10:24<02:25,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5817, device='cuda:0', grad_fn=<NllLossBackward0>)


 78%|███████▊  | 1177/1500 [10:25<02:25,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2815, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▊  | 1178/1500 [10:25<02:24,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6446, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▊  | 1179/1500 [10:26<02:24,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5553, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▊  | 1180/1500 [10:26<02:24,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1662, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▊  | 1181/1500 [10:26<02:23,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6560, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1182/1500 [10:27<02:23,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3598, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1183/1500 [10:27<02:22,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0608, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1184/1500 [10:28<02:22,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6285, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1185/1500 [10:28<02:21,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6921, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1186/1500 [10:29<02:21,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8612, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1187/1500 [10:29<02:21,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.1182, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1188/1500 [10:30<02:20,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7465, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1189/1500 [10:30<02:20,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4328, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1190/1500 [10:31<02:19,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6991, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1191/1500 [10:31<02:19,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6506, device='cuda:0', grad_fn=<NllLossBackward0>)


 79%|███████▉  | 1192/1500 [10:31<02:19,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6648, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1193/1500 [10:32<02:18,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7727, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1194/1500 [10:32<02:18,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6289, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1195/1500 [10:33<02:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2228, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1196/1500 [10:33<02:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9706, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1197/1500 [10:34<02:16,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5891, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1198/1500 [10:34<02:16,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5588, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|███████▉  | 1199/1500 [10:35<02:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6172, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1200/1500 [10:35<02:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch [3/3], Step [1200], Eval Loss: 6.043583638668061
tensor(6.4844, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1201/1500 [10:44<14:59,  3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4381, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1202/1500 [10:45<11:08,  2.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4559, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1203/1500 [10:45<08:26,  1.71s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0265, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1204/1500 [10:45<06:33,  1.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.8698, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1205/1500 [10:46<05:14,  1.07s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6509, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1206/1500 [10:46<04:19,  1.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2986, device='cuda:0', grad_fn=<NllLossBackward0>)


 80%|████████  | 1207/1500 [10:47<03:40,  1.33it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4351, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1208/1500 [10:47<03:13,  1.51it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4310, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1209/1500 [10:48<02:54,  1.66it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9128, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1210/1500 [10:48<02:41,  1.80it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2328, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1211/1500 [10:49<02:31,  1.90it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1592, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1212/1500 [10:49<02:25,  1.98it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9892, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1213/1500 [10:49<02:20,  2.05it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9736, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1214/1500 [10:50<02:16,  2.10it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7992, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1215/1500 [10:50<02:13,  2.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9878, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1216/1500 [10:51<02:11,  2.16it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9938, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1217/1500 [10:51<02:10,  2.17it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5354, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████  | 1218/1500 [10:52<02:08,  2.19it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4428, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████▏ | 1219/1500 [10:52<02:07,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3240, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████▏ | 1220/1500 [10:53<02:06,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5353, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████▏ | 1221/1500 [10:53<02:06,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6019, device='cuda:0', grad_fn=<NllLossBackward0>)


 81%|████████▏ | 1222/1500 [10:54<02:05,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4083, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1223/1500 [10:54<02:05,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9307, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1224/1500 [10:54<02:04,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9783, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1225/1500 [10:55<02:04,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8883, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1226/1500 [10:55<02:03,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6767, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1227/1500 [10:56<02:03,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9820, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1228/1500 [10:56<02:02,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9500, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1229/1500 [10:57<02:02,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0937, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1230/1500 [10:57<02:01,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4191, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1231/1500 [10:58<02:01,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0786, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1232/1500 [10:58<02:01,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1001, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1233/1500 [10:59<02:00,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4135, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1234/1500 [10:59<02:00,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9556, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1235/1500 [10:59<01:59,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0921, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1236/1500 [11:00<01:59,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2590, device='cuda:0', grad_fn=<NllLossBackward0>)


 82%|████████▏ | 1237/1500 [11:00<01:58,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3316, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1238/1500 [11:01<01:58,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0382, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1239/1500 [11:01<01:57,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2552, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1240/1500 [11:02<01:57,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1064, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1241/1500 [11:02<01:56,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6689, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1242/1500 [11:03<01:56,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3869, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1243/1500 [11:03<01:55,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5423, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1244/1500 [11:03<01:55,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2194, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1245/1500 [11:04<01:55,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5030, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1246/1500 [11:04<01:54,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4342, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1247/1500 [11:05<01:54,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3764, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1248/1500 [11:05<01:53,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7347, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1249/1500 [11:06<01:53,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5001, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1250/1500 [11:06<01:52,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0212, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1251/1500 [11:07<01:52,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0319, device='cuda:0', grad_fn=<NllLossBackward0>)


 83%|████████▎ | 1252/1500 [11:07<01:51,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1230, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▎ | 1253/1500 [11:08<01:51,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9684, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▎ | 1254/1500 [11:08<01:50,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2598, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▎ | 1255/1500 [11:08<01:50,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.1634, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▎ | 1256/1500 [11:09<01:50,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5338, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1257/1500 [11:09<01:49,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0799, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1258/1500 [11:10<01:49,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4938, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1259/1500 [11:10<01:48,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1085, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1260/1500 [11:11<01:48,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0350, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1261/1500 [11:11<01:47,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7382, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1262/1500 [11:12<01:47,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2630, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1263/1500 [11:12<01:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6038, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1264/1500 [11:12<01:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0908, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1265/1500 [11:13<01:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4572, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1266/1500 [11:13<01:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9243, device='cuda:0', grad_fn=<NllLossBackward0>)


 84%|████████▍ | 1267/1500 [11:14<01:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9964, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1268/1500 [11:14<01:44,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7416, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1269/1500 [11:15<01:44,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8251, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1270/1500 [11:15<01:43,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6234, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1271/1500 [11:16<01:43,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5432, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1272/1500 [11:16<01:42,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2467, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1273/1500 [11:17<01:42,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9800, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▍ | 1274/1500 [11:17<01:41,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2030, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1275/1500 [11:17<01:41,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5365, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1276/1500 [11:18<01:40,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9657, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1277/1500 [11:18<01:40,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.5032, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1278/1500 [11:19<01:39,  2.23it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9063, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1279/1500 [11:19<01:39,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7852, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1280/1500 [11:20<01:39,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.4618, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1281/1500 [11:20<01:38,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8097, device='cuda:0', grad_fn=<NllLossBackward0>)


 85%|████████▌ | 1282/1500 [11:21<01:38,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9387, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1283/1500 [11:21<01:37,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9514, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1284/1500 [11:21<01:37,  2.23it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6836, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1285/1500 [11:22<01:36,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2316, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1286/1500 [11:22<01:36,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4304, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1287/1500 [11:23<01:36,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0864, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1288/1500 [11:23<01:35,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4988, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1289/1500 [11:24<01:35,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0349, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1290/1500 [11:24<01:34,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5310, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1291/1500 [11:25<01:34,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8768, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1292/1500 [11:25<01:33,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0908, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▌ | 1293/1500 [11:26<01:33,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7269, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▋ | 1294/1500 [11:26<01:32,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7390, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▋ | 1295/1500 [11:26<01:32,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2892, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▋ | 1296/1500 [11:27<01:31,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5203, device='cuda:0', grad_fn=<NllLossBackward0>)


 86%|████████▋ | 1297/1500 [11:27<01:31,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6154, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1298/1500 [11:28<01:31,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7000, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1299/1500 [11:28<01:30,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4017, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1300/1500 [11:29<01:30,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch [3/3], Step [1300], Eval Loss: 5.924816319942474
tensor(5.4987, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1301/1500 [11:38<09:58,  3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6864, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1302/1500 [11:38<07:23,  2.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7469, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1303/1500 [11:39<05:35,  1.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9172, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1304/1500 [11:39<04:20,  1.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2423, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1305/1500 [11:39<03:27,  1.07s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9937, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1306/1500 [11:40<02:51,  1.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2864, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1307/1500 [11:40<02:25,  1.33it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6532, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1308/1500 [11:41<02:07,  1.51it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4305, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1309/1500 [11:41<01:54,  1.67it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0729, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1310/1500 [11:42<01:45,  1.80it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6425, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1311/1500 [11:42<01:38,  1.91it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8574, device='cuda:0', grad_fn=<NllLossBackward0>)


 87%|████████▋ | 1312/1500 [11:43<01:34,  1.99it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3324, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1313/1500 [11:43<01:30,  2.06it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8727, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1314/1500 [11:44<01:28,  2.11it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0776, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1315/1500 [11:44<01:26,  2.14it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7136, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1316/1500 [11:44<01:25,  2.16it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0931, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1317/1500 [11:45<01:24,  2.18it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3572, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1318/1500 [11:45<01:22,  2.19it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7014, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1319/1500 [11:46<01:22,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1598, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1320/1500 [11:46<01:21,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6028, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1321/1500 [11:47<01:21,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0048, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1322/1500 [11:47<01:20,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2927, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1323/1500 [11:48<01:20,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.9789, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1324/1500 [11:48<01:19,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.4880, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1325/1500 [11:49<01:18,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4802, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1326/1500 [11:49<01:18,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5706, device='cuda:0', grad_fn=<NllLossBackward0>)


 88%|████████▊ | 1327/1500 [11:49<01:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9483, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▊ | 1328/1500 [11:50<01:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3238, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▊ | 1329/1500 [11:50<01:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5860, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▊ | 1330/1500 [11:51<01:16,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1063, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▊ | 1331/1500 [11:51<01:16,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8608, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1332/1500 [11:52<01:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6687, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1333/1500 [11:52<01:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8040, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1334/1500 [11:53<01:14,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6068, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1335/1500 [11:53<01:14,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0686, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1336/1500 [11:53<01:14,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9208, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1337/1500 [11:54<01:13,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2274, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1338/1500 [11:54<01:13,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9788, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1339/1500 [11:55<01:12,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2334, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1340/1500 [11:55<01:12,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4792, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1341/1500 [11:56<01:11,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6276, device='cuda:0', grad_fn=<NllLossBackward0>)


 89%|████████▉ | 1342/1500 [11:56<01:11,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9986, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1343/1500 [11:57<01:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.1901, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1344/1500 [11:57<01:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7654, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1345/1500 [11:58<01:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7874, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1346/1500 [11:58<01:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6886, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1347/1500 [11:58<01:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4383, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1348/1500 [11:59<01:08,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6945, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 1349/1500 [11:59<01:08,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4772, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1350/1500 [12:00<01:07,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1121, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1351/1500 [12:00<01:07,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.8906, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1352/1500 [12:01<01:06,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.3824, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1353/1500 [12:01<01:06,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9262, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1354/1500 [12:02<01:05,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2093, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1355/1500 [12:02<01:05,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6993, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1356/1500 [12:03<01:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7442, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|█████████ | 1357/1500 [12:03<01:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9443, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1358/1500 [12:03<01:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0422, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1359/1500 [12:04<01:03,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8085, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1360/1500 [12:04<01:03,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5455, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1361/1500 [12:05<01:02,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4661, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1362/1500 [12:05<01:02,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9696, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1363/1500 [12:06<01:01,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.7386, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1364/1500 [12:06<01:01,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7566, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1365/1500 [12:07<01:00,  2.23it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9066, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1366/1500 [12:07<01:00,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4476, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1367/1500 [12:07<01:00,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6998, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████ | 1368/1500 [12:08<00:59,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0364, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████▏| 1369/1500 [12:08<00:59,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0708, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████▏| 1370/1500 [12:09<00:58,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9865, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████▏| 1371/1500 [12:09<00:58,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(3.1535, device='cuda:0', grad_fn=<NllLossBackward0>)


 91%|█████████▏| 1372/1500 [12:10<00:57,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8839, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1373/1500 [12:10<00:57,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2969, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1374/1500 [12:11<00:56,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7581, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1375/1500 [12:11<00:56,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4595, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1376/1500 [12:12<00:56,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4443, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1377/1500 [12:12<00:55,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1264, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1378/1500 [12:12<00:55,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3879, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1379/1500 [12:13<00:54,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0597, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1380/1500 [12:13<00:54,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4693, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1381/1500 [12:14<00:53,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8299, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1382/1500 [12:14<00:53,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6995, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1383/1500 [12:15<00:52,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7629, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1384/1500 [12:15<00:52,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1286, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1385/1500 [12:16<00:51,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3631, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1386/1500 [12:16<00:51,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4179, device='cuda:0', grad_fn=<NllLossBackward0>)


 92%|█████████▏| 1387/1500 [12:16<00:50,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7955, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1388/1500 [12:17<00:50,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0150, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1389/1500 [12:17<00:50,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8441, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1390/1500 [12:18<00:49,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4937, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1391/1500 [12:18<00:49,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7891, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1392/1500 [12:19<00:48,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5092, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1393/1500 [12:19<00:48,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9679, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1394/1500 [12:20<00:47,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8632, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1395/1500 [12:20<00:47,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1291, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1396/1500 [12:21<00:46,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7146, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1397/1500 [12:21<00:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0628, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1398/1500 [12:21<00:46,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1895, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1399/1500 [12:22<00:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2428, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1400/1500 [12:22<00:45,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch [3/3], Step [1400], Eval Loss: 5.872209646701813
tensor(5.0230, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1401/1500 [12:31<04:57,  3.01s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5128, device='cuda:0', grad_fn=<NllLossBackward0>)


 93%|█████████▎| 1402/1500 [12:32<03:39,  2.24s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0625, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▎| 1403/1500 [12:32<02:45,  1.70s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0700, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▎| 1404/1500 [12:33<02:07,  1.33s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6565, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▎| 1405/1500 [12:33<01:41,  1.06s/it]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1906, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▎| 1406/1500 [12:34<01:22,  1.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0105, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1407/1500 [12:34<01:10,  1.33it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7745, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1408/1500 [12:35<01:00,  1.51it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0937, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1409/1500 [12:35<00:54,  1.67it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7248, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1410/1500 [12:35<00:50,  1.80it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4582, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1411/1500 [12:36<00:46,  1.91it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9493, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1412/1500 [12:36<00:44,  1.99it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7824, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1413/1500 [12:37<00:42,  2.05it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5514, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1414/1500 [12:37<00:41,  2.10it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0533, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1415/1500 [12:38<00:39,  2.13it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6348, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1416/1500 [12:38<00:38,  2.16it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3857, device='cuda:0', grad_fn=<NllLossBackward0>)


 94%|█████████▍| 1417/1500 [12:39<00:38,  2.18it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6806, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1418/1500 [12:39<00:37,  2.19it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0967, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1419/1500 [12:39<00:36,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0498, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1420/1500 [12:40<00:36,  2.20it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5195, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1421/1500 [12:40<00:35,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7598, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1422/1500 [12:41<00:35,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5007, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1423/1500 [12:41<00:34,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.3301, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▍| 1424/1500 [12:42<00:34,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.8412, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1425/1500 [12:42<00:33,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9193, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1426/1500 [12:43<00:33,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.5042, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1427/1500 [12:43<00:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8888, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1428/1500 [12:44<00:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7145, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1429/1500 [12:44<00:32,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3594, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1430/1500 [12:44<00:31,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0041, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1431/1500 [12:45<00:31,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.1262, device='cuda:0', grad_fn=<NllLossBackward0>)


 95%|█████████▌| 1432/1500 [12:45<00:30,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8166, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1433/1500 [12:46<00:30,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0632, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1434/1500 [12:46<00:29,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2172, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1435/1500 [12:47<00:29,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9184, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1436/1500 [12:47<00:28,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7510, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1437/1500 [12:48<00:28,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.0703, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1438/1500 [12:48<00:27,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4776, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1439/1500 [12:48<00:27,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9481, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1440/1500 [12:49<00:27,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.0531, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1441/1500 [12:49<00:26,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.1464, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1442/1500 [12:50<00:26,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.8583, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▌| 1443/1500 [12:50<00:25,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4571, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▋| 1444/1500 [12:51<00:25,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5713, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▋| 1445/1500 [12:51<00:24,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2370, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▋| 1446/1500 [12:52<00:24,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3259, device='cuda:0', grad_fn=<NllLossBackward0>)


 96%|█████████▋| 1447/1500 [12:52<00:23,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1476, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1448/1500 [12:53<00:23,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.5082, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1449/1500 [12:53<00:23,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.6395, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1450/1500 [12:53<00:22,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5758, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1451/1500 [12:54<00:22,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7060, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1452/1500 [12:54<00:21,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1605, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1453/1500 [12:55<00:21,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0222, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1454/1500 [12:55<00:20,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.9512, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1455/1500 [12:56<00:20,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7553, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1456/1500 [12:56<00:19,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(4.7122, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1457/1500 [12:57<00:19,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4333, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1458/1500 [12:57<00:18,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7200, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1459/1500 [12:58<00:18,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3308, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1460/1500 [12:58<00:18,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4862, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1461/1500 [12:58<00:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.1049, device='cuda:0', grad_fn=<NllLossBackward0>)


 97%|█████████▋| 1462/1500 [12:59<00:17,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4457, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1463/1500 [12:59<00:16,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1152, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1464/1500 [13:00<00:16,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1427, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1465/1500 [13:00<00:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6667, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1466/1500 [13:01<00:15,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4879, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1467/1500 [13:01<00:14,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7207, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1468/1500 [13:02<00:14,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3136, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1469/1500 [13:02<00:13,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4238, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1470/1500 [13:02<00:13,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.1794, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1471/1500 [13:03<00:13,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.4709, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1472/1500 [13:03<00:12,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4474, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1473/1500 [13:04<00:12,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.1990, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1474/1500 [13:04<00:11,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.7285, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1475/1500 [13:05<00:11,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.2442, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1476/1500 [13:05<00:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0209, device='cuda:0', grad_fn=<NllLossBackward0>)


 98%|█████████▊| 1477/1500 [13:06<00:10,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8314, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▊| 1478/1500 [13:06<00:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5489, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▊| 1479/1500 [13:07<00:09,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5357, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▊| 1480/1500 [13:07<00:08,  2.23it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4168, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▊| 1481/1500 [13:07<00:08,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6412, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1482/1500 [13:08<00:08,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2439, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1483/1500 [13:08<00:07,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.6840, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1484/1500 [13:09<00:07,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.3584, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1485/1500 [13:09<00:06,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8616, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1486/1500 [13:10<00:06,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5594, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1487/1500 [13:10<00:05,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.7092, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1488/1500 [13:11<00:05,  2.21it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4136, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1489/1500 [13:11<00:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8363, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1490/1500 [13:11<00:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.3923, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1491/1500 [13:12<00:04,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2198, device='cuda:0', grad_fn=<NllLossBackward0>)


 99%|█████████▉| 1492/1500 [13:12<00:03,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0042, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1493/1500 [13:13<00:03,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.8719, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1494/1500 [13:13<00:02,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.2269, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1495/1500 [13:14<00:02,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4843, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1496/1500 [13:14<00:01,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(6.0695, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1497/1500 [13:15<00:01,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5587, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1498/1500 [13:15<00:00,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.4624, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|█████████▉| 1499/1500 [13:16<00:00,  2.22it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(5.5589, device='cuda:0', grad_fn=<NllLossBackward0>)


100%|██████████| 1500/1500 [13:16<00:00,  2.22it/s]

Epoch [3/3], Step [1500], Eval Loss: 5.842406475543976


In [8]:
model.eval()
eval_loss = 0
for eval_batch in eval_dl:
    with torch.no_grad():
        eval_outputs = model(**eval_batch)
        eval_loss += eval_outputs.loss.item()
eval_loss /= len(eval_dl)
print(f"Epoch [{epoch+1}/{num_epochs}], Step [{global_step}], Eval Loss: {eval_loss}")

Epoch [3/3], Step [1500], Eval Loss: 5.842406475543976


In [None]:
## Exploding gradients

In [11]:
model.eval()
eval_loss = 0
for eval_batch in train_dl:
    with torch.no_grad():
        eval_outputs = model(**eval_batch)
        eval_loss += eval_outputs.loss.item()
    break
        #print(eval_outputs.loss.item())

eval_loss /= len(eval_dl)
eval_loss

## This works too so it's not the train_dl as well

0.02767714262008667

In [9]:
model.train()
train_loss = 0
for train_batch in train_dl:
    outputs = model(**train_batch)
    train_loss = outputs.loss.item()
    break
        #print(eval_outputs.loss.item())

train_loss

## THis works too. It's basically a problem with the model.gradient_checkpointing
##  but without that, training doesn't fit on the gpu

3.0101101398468018

In [10]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/8a/5d/047cde25131eef3a38d03317fa7d25d6f60ce6e8ccfd24ac88b3e309fc00/scikit_learn-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_learn-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.6.0 from https://files.pythonhosted.org/packages/9b/00/ce54410e344b3a6032cd42ed53fe425cf57a66d28e337670292bbb419ebc/scipy-1.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scipy-1.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m499.1 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information 

In [26]:
batch["labels"].cpu().numpy().shape

(2, 512)

### Testing

In [28]:
from sklearn.metrics import accuracy_score

model.eval()
predictions, references = [], []

for batch in eval_dl:
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
    decoded_preds = tokenizer.batch_decode(preds.cpu().numpy(), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["input_ids"].cpu().numpy(), skip_special_tokens=True)

    predictions.extend(decoded_preds)
    references.extend(decoded_labels)


In [33]:
references[0]

" Given a question and the possible answer choices, give the index of the right choice.\n    ### Question\n    Lewis Henry Morgan's stages of human culture are known as:\n    ### Choices\n    ['Three-Age system.', 'Natural selection.', 'Multilineal evolution.', 'Unilineal evolution.']\n    ### Answer\n    3\n    "

In [42]:
print(predictions[6])

 a question and the possible answer choices, give the index of the right choice.
    ### Question
    ###.. to the right.. The     the right. the...I.. the.. the the have.. the the the the. a the the right. the right be. the right be. the the. the right. the the right be. the. a. right.. the the school to. a to the. a right the... the a... the to a the.. the. the the and. a the the to the right. the. the first... the have.. the. the.. a the right.I.. the.. the. the to the to the the first and.. to the.. the be of The be.. the right..... the the the.. the right... be to school be of the right. the be... the to. the the and... the right.. the right to. The   . be. the school. the and was.... the. the..   . the right. and. right.. the. the right. to
    ### Choices
    ###'t first. be the the right... to's.. right.. ''t first. be the school... to's.. right.. ''t first. to the be... ''t first. to right.. the right and. the right..'t
    ### Cho
    ###
    ###..............................

## Deepspeed world

In [None]:
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)