In [1]:
from peft import (
    get_peft_config,
    get_peft_model,
    PromptTuningInit,
    PromptTuningConfig,
    TaskType,
)
import torch
from datasets import load_dataset
import os
from tqdm.auto import tqdm

device = "cuda"
model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not.",
    tokenizer_name_or_path=tokenizer_name_or_path,
)

dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
text_column = "Tweet text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 50
batch_size = 8

In [2]:
dataset = load_dataset("ought/raft", dataset_name)
dataset["train"][0]

{'Tweet text': '@HMRCcustomers No this is my first job', 'ID': 0, 'Label': 2}

In [3]:
classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=4,
)
dataset["train"][0]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max(
    [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
)
print(target_max_length)

3


In [5]:
def preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label: " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [-100] * (
            max_length - len(sample_input_ids)
        ) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:max_length]
        )
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset (num_proc=4):   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=4):   0%|          | 0/3399 [00:00<?, ? examples/s]

In [7]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset = processed_dataset["train"]
eval_dataset = processed_dataset["test"]

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

In [10]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

loading configuration file config.json from cache at /home/nevermore/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/a2845d7e13dd12efae154a9f1c63fcc2e0cc4b05/config.json
Model config BloomConfig {
  "_name_or_path": "bigscience/bloomz-560m",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "seq_length": 2048,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "transformers_version": "4.45.2",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

loading 

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0015


In [12]:
from transformers import get_linear_schedule_with_warmup

model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=0: train_ppl=tensor(1.2581e+12, device='cuda:0') train_epoch_loss=tensor(27.8606, device='cuda:0') eval_ppl=tensor(4500.4170, device='cuda:0') eval_epoch_loss=tensor(8.4119, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=1: train_ppl=tensor(3050.1570, device='cuda:0') train_epoch_loss=tensor(8.0229, device='cuda:0') eval_ppl=tensor(5142.6089, device='cuda:0') eval_epoch_loss=tensor(8.5453, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=2: train_ppl=tensor(707.2935, device='cuda:0') train_epoch_loss=tensor(6.5614, device='cuda:0') eval_ppl=tensor(6265.5386, device='cuda:0') eval_epoch_loss=tensor(8.7428, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=3: train_ppl=tensor(319.5354, device='cuda:0') train_epoch_loss=tensor(5.7669, device='cuda:0') eval_ppl=tensor(10755.7568, device='cuda:0') eval_epoch_loss=tensor(9.2832, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=4: train_ppl=tensor(231.5135, device='cuda:0') train_epoch_loss=tensor(5.4446, device='cuda:0') eval_ppl=tensor(12836.2891, device='cuda:0') eval_epoch_loss=tensor(9.4600, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=5: train_ppl=tensor(184.7994, device='cuda:0') train_epoch_loss=tensor(5.2193, device='cuda:0') eval_ppl=tensor(12361.5977, device='cuda:0') eval_epoch_loss=tensor(9.4223, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=6: train_ppl=tensor(145.3695, device='cuda:0') train_epoch_loss=tensor(4.9793, device='cuda:0') eval_ppl=tensor(11881.6621, device='cuda:0') eval_epoch_loss=tensor(9.3828, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=7: train_ppl=tensor(107.2547, device='cuda:0') train_epoch_loss=tensor(4.6752, device='cuda:0') eval_ppl=tensor(11421.9609, device='cuda:0') eval_epoch_loss=tensor(9.3433, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=8: train_ppl=tensor(79.5611, device='cuda:0') train_epoch_loss=tensor(4.3765, device='cuda:0') eval_ppl=tensor(15976.3145, device='cuda:0') eval_epoch_loss=tensor(9.6789, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=9: train_ppl=tensor(62.2457, device='cuda:0') train_epoch_loss=tensor(4.1311, device='cuda:0') eval_ppl=tensor(27199.4688, device='cuda:0') eval_epoch_loss=tensor(10.2110, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=10: train_ppl=tensor(46.6520, device='cuda:0') train_epoch_loss=tensor(3.8427, device='cuda:0') eval_ppl=tensor(25573.7207, device='cuda:0') eval_epoch_loss=tensor(10.1493, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=11: train_ppl=tensor(36.0147, device='cuda:0') train_epoch_loss=tensor(3.5839, device='cuda:0') eval_ppl=tensor(20323.6875, device='cuda:0') eval_epoch_loss=tensor(9.9195, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=12: train_ppl=tensor(28.3991, device='cuda:0') train_epoch_loss=tensor(3.3464, device='cuda:0') eval_ppl=tensor(41547.8750, device='cuda:0') eval_epoch_loss=tensor(10.6346, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=13: train_ppl=tensor(26.4741, device='cuda:0') train_epoch_loss=tensor(3.2762, device='cuda:0') eval_ppl=tensor(53079.9102, device='cuda:0') eval_epoch_loss=tensor(10.8796, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=14: train_ppl=tensor(19.8123, device='cuda:0') train_epoch_loss=tensor(2.9863, device='cuda:0') eval_ppl=tensor(78092.6875, device='cuda:0') eval_epoch_loss=tensor(11.2657, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=15: train_ppl=tensor(15.4446, device='cuda:0') train_epoch_loss=tensor(2.7373, device='cuda:0') eval_ppl=tensor(117470.6172, device='cuda:0') eval_epoch_loss=tensor(11.6739, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=16: train_ppl=tensor(12.8025, device='cuda:0') train_epoch_loss=tensor(2.5496, device='cuda:0') eval_ppl=tensor(81354.7812, device='cuda:0') eval_epoch_loss=tensor(11.3066, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=17: train_ppl=tensor(8.8371, device='cuda:0') train_epoch_loss=tensor(2.1790, device='cuda:0') eval_ppl=tensor(136329.1406, device='cuda:0') eval_epoch_loss=tensor(11.8228, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=18: train_ppl=tensor(6.9569, device='cuda:0') train_epoch_loss=tensor(1.9397, device='cuda:0') eval_ppl=tensor(108162.3594, device='cuda:0') eval_epoch_loss=tensor(11.5914, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=19: train_ppl=tensor(6.3554, device='cuda:0') train_epoch_loss=tensor(1.8493, device='cuda:0') eval_ppl=tensor(45707.8906, device='cuda:0') eval_epoch_loss=tensor(10.7300, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=20: train_ppl=tensor(4.3381, device='cuda:0') train_epoch_loss=tensor(1.4674, device='cuda:0') eval_ppl=tensor(47502.1797, device='cuda:0') eval_epoch_loss=tensor(10.7685, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=21: train_ppl=tensor(3.4596, device='cuda:0') train_epoch_loss=tensor(1.2412, device='cuda:0') eval_ppl=tensor(60014.0859, device='cuda:0') eval_epoch_loss=tensor(11.0023, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=22: train_ppl=tensor(2.9005, device='cuda:0') train_epoch_loss=tensor(1.0649, device='cuda:0') eval_ppl=tensor(62069.0117, device='cuda:0') eval_epoch_loss=tensor(11.0360, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=23: train_ppl=tensor(2.4541, device='cuda:0') train_epoch_loss=tensor(0.8978, device='cuda:0') eval_ppl=tensor(41490.7773, device='cuda:0') eval_epoch_loss=tensor(10.6332, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=24: train_ppl=tensor(1.9883, device='cuda:0') train_epoch_loss=tensor(0.6873, device='cuda:0') eval_ppl=tensor(34396.9609, device='cuda:0') eval_epoch_loss=tensor(10.4457, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=25: train_ppl=tensor(1.7437, device='cuda:0') train_epoch_loss=tensor(0.5560, device='cuda:0') eval_ppl=tensor(30264.8809, device='cuda:0') eval_epoch_loss=tensor(10.3177, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=26: train_ppl=tensor(1.5927, device='cuda:0') train_epoch_loss=tensor(0.4655, device='cuda:0') eval_ppl=tensor(44139.3750, device='cuda:0') eval_epoch_loss=tensor(10.6951, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=27: train_ppl=tensor(1.5831, device='cuda:0') train_epoch_loss=tensor(0.4594, device='cuda:0') eval_ppl=tensor(32781.7695, device='cuda:0') eval_epoch_loss=tensor(10.3976, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=28: train_ppl=tensor(1.5250, device='cuda:0') train_epoch_loss=tensor(0.4220, device='cuda:0') eval_ppl=tensor(35397.5430, device='cuda:0') eval_epoch_loss=tensor(10.4744, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=29: train_ppl=tensor(1.4842, device='cuda:0') train_epoch_loss=tensor(0.3949, device='cuda:0') eval_ppl=tensor(31921.3027, device='cuda:0') eval_epoch_loss=tensor(10.3710, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=30: train_ppl=tensor(1.4356, device='cuda:0') train_epoch_loss=tensor(0.3616, device='cuda:0') eval_ppl=tensor(19438.8809, device='cuda:0') eval_epoch_loss=tensor(9.8750, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=31: train_ppl=tensor(1.6354, device='cuda:0') train_epoch_loss=tensor(0.4919, device='cuda:0') eval_ppl=tensor(15435.2646, device='cuda:0') eval_epoch_loss=tensor(9.6444, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=32: train_ppl=tensor(1.5915, device='cuda:0') train_epoch_loss=tensor(0.4647, device='cuda:0') eval_ppl=tensor(8225.7373, device='cuda:0') eval_epoch_loss=tensor(9.0150, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=33: train_ppl=tensor(1.4918, device='cuda:0') train_epoch_loss=tensor(0.4000, device='cuda:0') eval_ppl=tensor(10591.4688, device='cuda:0') eval_epoch_loss=tensor(9.2678, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=34: train_ppl=tensor(1.4034, device='cuda:0') train_epoch_loss=tensor(0.3389, device='cuda:0') eval_ppl=tensor(11791.3906, device='cuda:0') eval_epoch_loss=tensor(9.3751, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=35: train_ppl=tensor(1.3221, device='cuda:0') train_epoch_loss=tensor(0.2792, device='cuda:0') eval_ppl=tensor(16070.0781, device='cuda:0') eval_epoch_loss=tensor(9.6847, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=36: train_ppl=tensor(1.3105, device='cuda:0') train_epoch_loss=tensor(0.2704, device='cuda:0') eval_ppl=tensor(21831.3281, device='cuda:0') eval_epoch_loss=tensor(9.9911, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=37: train_ppl=tensor(1.3518, device='cuda:0') train_epoch_loss=tensor(0.3014, device='cuda:0') eval_ppl=tensor(19502.0508, device='cuda:0') eval_epoch_loss=tensor(9.8783, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=38: train_ppl=tensor(1.2902, device='cuda:0') train_epoch_loss=tensor(0.2548, device='cuda:0') eval_ppl=tensor(21095.4551, device='cuda:0') eval_epoch_loss=tensor(9.9568, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=39: train_ppl=tensor(1.2705, device='cuda:0') train_epoch_loss=tensor(0.2394, device='cuda:0') eval_ppl=tensor(22574.6074, device='cuda:0') eval_epoch_loss=tensor(10.0246, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=40: train_ppl=tensor(1.2272, device='cuda:0') train_epoch_loss=tensor(0.2047, device='cuda:0') eval_ppl=tensor(30997.6348, device='cuda:0') eval_epoch_loss=tensor(10.3417, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=41: train_ppl=tensor(1.2084, device='cuda:0') train_epoch_loss=tensor(0.1893, device='cuda:0') eval_ppl=tensor(31153.2520, device='cuda:0') eval_epoch_loss=tensor(10.3467, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=42: train_ppl=tensor(1.2093, device='cuda:0') train_epoch_loss=tensor(0.1901, device='cuda:0') eval_ppl=tensor(36221.8047, device='cuda:0') eval_epoch_loss=tensor(10.4974, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=43: train_ppl=tensor(1.2124, device='cuda:0') train_epoch_loss=tensor(0.1926, device='cuda:0') eval_ppl=tensor(38890.2773, device='cuda:0') eval_epoch_loss=tensor(10.5685, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=44: train_ppl=tensor(1.2051, device='cuda:0') train_epoch_loss=tensor(0.1866, device='cuda:0') eval_ppl=tensor(40902.2500, device='cuda:0') eval_epoch_loss=tensor(10.6189, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=45: train_ppl=tensor(1.1930, device='cuda:0') train_epoch_loss=tensor(0.1765, device='cuda:0') eval_ppl=tensor(38627.5078, device='cuda:0') eval_epoch_loss=tensor(10.5617, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=46: train_ppl=tensor(1.1838, device='cuda:0') train_epoch_loss=tensor(0.1687, device='cuda:0') eval_ppl=tensor(42601.3516, device='cuda:0') eval_epoch_loss=tensor(10.6596, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=47: train_ppl=tensor(1.1951, device='cuda:0') train_epoch_loss=tensor(0.1782, device='cuda:0') eval_ppl=tensor(46084.5312, device='cuda:0') eval_epoch_loss=tensor(10.7382, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=48: train_ppl=tensor(1.1899, device='cuda:0') train_epoch_loss=tensor(0.1739, device='cuda:0') eval_ppl=tensor(45721.7969, device='cuda:0') eval_epoch_loss=tensor(10.7303, device='cuda:0')


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/425 [00:00<?, ?it/s]

epoch=49: train_ppl=tensor(1.1931, device='cuda:0') train_epoch_loss=tensor(0.1766, device='cuda:0') eval_ppl=tensor(44892.4414, device='cuda:0') eval_epoch_loss=tensor(10.7120, device='cuda:0')
