# Prompt tuning with Hugging Face
Most of the code is adapted from the [Hugging Face PEFT task guide on prompt tuning for causal language modeling](https://huggingface.co/docs/peft/task_guides/clm-prompt-tuning).

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
import os
import wandb

from torch.utils.data import DataLoader
from tqdm import tqdm

import numpy as np

# Sets the W&B notebook-name environment variable before wandb.init is called
# in the next cell.
# NOTE(review): the value must match this notebook's real file name — confirm.
os.environ["WANDB_NOTEBOOK_NAME"] = "prompt_tunning.ipynb"

In [2]:
# --- Experiment configuration -------------------------------------------------
dataset_name = "twitter_complaints"
model_name_or_path = "bigscience/bloomz-560m"
run_i = 1  # index of the first wandb run; later runs are named run_<run_i + k>

# Prompt-tuning setup: 8 trainable virtual tokens, initialised from the
# instruction text below, are used with the causal language model.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=model_name_or_path,
)

# Everything a reader might want to tune; logged to wandb and read back through
# wandb.config in the following cells.
config = {
    "dataset_hf_account": "ought/raft",
    "dataset_name": dataset_name,
    # Fall back to CPU so the notebook still runs on a machine without a GPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "learning_rate": 3e-2,
    "architecture": "LM",
    "model_name_or_path": model_name_or_path,
    "tokenizer_name_or_path": model_name_or_path,
    "max_length": 64,  # fixed token length of every preprocessed sample
    "num_epochs": 50,
    "batch_size": 8,
    "peft_config_peft_type": peft_config.peft_type,
    "peft_config_task_type": peft_config.task_type,
    # "/" in the model id would be treated as a directory separator, so flatten it.
    "checkpoint_name": f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace("/", "_"),
}

# Opens the first wandb run; the training cell below trains inside this run.
wandb.init(project="bloomz-560m_prompt_tunning_casual_lm", config=config, group="bloomz-560m_prompt_tunning_casual_lm", name=f"run_{run_i}")

# Dataset column holding the raw tweet and the column that will hold the
# human-readable label text (added in a later cell).
text_column = "Tweet text"
label_column = "text_label"

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jovyan/.netrc


In [3]:
# Download the selected subset of the RAFT benchmark and peek at the first
# training row to check the column layout.
dataset = load_dataset(path="ought/raft", name=dataset_name)
dataset["train"][0]

Downloading builder script:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/56.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/662k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.91k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/327k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/917k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/54.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/196k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.58k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/412k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.64k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/412k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.38k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/11 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'Tweet text': '@HMRCcustomers No this is my first job', 'ID': 0, 'Label': 2}

In [4]:
# Human-readable class names: the dataset's label names with "_" turned into
# spaces, indexed by the integer stored in the "Label" column.
classes = [
    label_name.replace("_", " ")
    for label_name in dataset["train"].features["Label"].names
]


def _add_text_label(batch):
    """Return the new "text_label" column (class name per row) for a batch."""
    return {"text_label": [classes[label_id] for label_id in batch["Label"]]}


dataset = dataset.map(_add_text_label, batched=True, num_proc=1)
dataset["train"][0]


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

## Preprocessing

In [5]:
# Load the tokenizer named in the wandb config.
tokenizer = AutoTokenizer.from_pretrained(wandb.config["tokenizer_name_or_path"])

# Padding is required later on; when the tokenizer has no pad token, reuse EOS.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Number of tokens in the longest class name.
target_max_length = max(
    len(tokenizer(class_label)["input_ids"]) for class_label in classes
)
target_max_length

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

3

In [6]:
def preprocess_function(examples):
    """Turn a batch of raw rows into fixed-length causal-LM training features.

    For every row the prompt ``"<text_column> : <tweet> Label : "`` and the
    label text are tokenized separately and joined into a single sequence
    ``prompt + label + pad_token``.  Three aligned rows are produced:

    * ``input_ids``      - the joined sequence, left-padded with the pad token;
    * ``attention_mask`` - 1 for real tokens, 0 for the left padding;
    * ``labels``         - -100 over the prompt and the padding, and the real
      token ids over the label part (including its trailing pad token), so
      only the label positions carry target ids.

    Rows longer than ``wandb.config["max_length"]`` are truncated from the
    LEFT: the label sits at the end of the sequence, so cutting the tail (as
    the previous implementation did) removed every target token and left the
    row without any training signal.

    Args:
        examples: batch mapping as handed over by ``Dataset.map(batched=True)``;
            the columns named by ``text_column`` and ``label_column`` are read.

    Returns:
        The tokenizer output for the prompts with ``input_ids`` and
        ``attention_mask`` replaced and ``labels`` added.  Every row is a 1-D
        ``torch`` tensor of length ``wandb.config["max_length"]``.
    """
    max_length = wandb.config["max_length"]
    pad_id = tokenizer.pad_token_id

    prompts = [f"{text_column} : {x} Label : " for x in examples[text_column]]  # better prompting
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(prompts)
    target_encodings = tokenizer(targets)

    input_rows, mask_rows, label_rows = [], [], []
    for prompt_ids, target_ids in zip(model_inputs["input_ids"], target_encodings["input_ids"]):
        # The label is terminated with a pad token that is part of the target.
        target_ids = target_ids + [pad_id]
        input_ids = prompt_ids + target_ids
        # Prompt positions get -100 so that only the label positions hold target ids.
        label_ids = [-100] * len(prompt_ids) + target_ids

        # Drop the oldest prompt tokens when the row is too long; this keeps
        # the label at the end of the sequence intact.
        overflow = max(0, len(input_ids) - max_length)
        input_ids = input_ids[overflow:]
        label_ids = label_ids[overflow:]

        # Left-pad every row up to exactly max_length.
        n_pad = max_length - len(input_ids)
        input_rows.append(torch.tensor([pad_id] * n_pad + input_ids))
        mask_rows.append(torch.tensor([0] * n_pad + [1] * len(input_ids)))
        label_rows.append(torch.tensor([-100] * n_pad + label_ids))

    model_inputs["input_ids"] = input_rows
    model_inputs["attention_mask"] = mask_rows
    model_inputs["labels"] = label_rows
    return model_inputs

In [7]:
# Tokenize every split. All original columns are dropped so that only the
# features returned by preprocess_function remain, and the on-disk cache is
# bypassed so changes to the preprocessing always take effect.
raw_column_names = dataset["train"].column_names
processed_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=raw_column_names,
    load_from_cache_file=False,
    num_proc=1,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [8]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(
    collate_fn=default_data_collator,
    batch_size=wandb.config["batch_size"],
    pin_memory=True,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **loader_kwargs)

In [11]:
n_runs = 10
# Repeat the whole training n_runs times, one wandb run per repetition.
# A wandb run is already open when this cell starts (the configuration cell
# calls wandb.init for run_<run_i>), so every iteration trains inside the run
# that is currently active and opens the next run only if another repetition
# follows.  Previously the loop trained n_runs - 1 times and then opened one
# extra run that was closed again without any training.
for r in range(1, n_runs + 1):
    run_index = run_i + r - 1  # matches the active wandb run name "run_<run_index>"

    # Fresh base model and freshly initialised prompt for every repetition.
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model = get_peft_model(model, peft_config)
    wandb.watch(model, log_freq=100)

    model.print_trainable_parameters()

    optimizer = torch.optim.AdamW(model.parameters(), lr=wandb.config["learning_rate"])
    # Linear decay to zero over all optimisation steps, without warm-up.
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * wandb.config["num_epochs"]),
    )

    # One checkpoint file per run.  The same path is used for writing and for
    # the wandb upload; previously the upload pointed at a file name that was
    # never written, and all runs overwrote a single local file.
    checkpoint_stem, checkpoint_ext = os.path.splitext(wandb.config["checkpoint_name"])
    checkpoint_path = f"{checkpoint_stem}_run{run_index}{checkpoint_ext}"

    min_eval_epoch_loss = np.inf
    model = model.to(wandb.config["device"])

    for epoch in range(wandb.config["num_epochs"]):
        # ---- training pass ----
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(wandb.config["device"]) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # ---- evaluation pass ----
        # NOTE(review): the recorded output shows the eval loss rising while the
        # train loss falls; verify that the "test" split carries real labels
        # before relying on this metric for checkpoint selection.
        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(wandb.config["device"]) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            # Greedy (arg-max) token predictions decoded back to text.
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)

        # Keep the checkpoint with the lowest mean eval loss seen in this run.
        if eval_epoch_loss < min_eval_epoch_loss:
            min_eval_epoch_loss = eval_epoch_loss

            # NOTE(review): model.state_dict() presumably contains the frozen
            # base-model weights as well; saving only the prompt parameters
            # would be much smaller — confirm before changing the format.
            torch.save({
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "loss": min_eval_epoch_loss,
                }, checkpoint_path)

            wandb.save(checkpoint_path)

        # Perplexity is exp(mean loss); the losses are averaged over batches.
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
        wandb.log({"epoch": epoch, "train_ppl": train_ppl, "train_epoch_loss": train_epoch_loss, "eval_ppl": eval_ppl, "eval_epoch_loss": eval_epoch_loss})

    # Close this run and, if another repetition follows, open its run.
    wandb.finish()
    if r < n_runs:
        wandb.init(project="bloomz-560m_prompt_tunning_casual_lm", config=config, group="bloomz-560m_prompt_tunning_casual_lm", name=f"run_{run_i+r}")

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:01<00:00,  6.15it/s]
100%|██████████| 425/425 [00:24<00:00, 17.03it/s]


epoch=0: train_ppl=tensor(8.3367e+17, device='cuda:0') train_epoch_loss=tensor(41.2646, device='cuda:0') eval_ppl=tensor(15351.5586, device='cuda:0') eval_epoch_loss=tensor(9.6390, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.96it/s]


epoch=1: train_ppl=tensor(397190.7188, device='cuda:0') train_epoch_loss=tensor(12.8922, device='cuda:0') eval_ppl=tensor(8629.9424, device='cuda:0') eval_epoch_loss=tensor(9.0630, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.80it/s]


epoch=2: train_ppl=tensor(122973.3359, device='cuda:0') train_epoch_loss=tensor(11.7197, device='cuda:0') eval_ppl=tensor(6201.0703, device='cuda:0') eval_epoch_loss=tensor(8.7325, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.67it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=3: train_ppl=tensor(18153.1895, device='cuda:0') train_epoch_loss=tensor(9.8066, device='cuda:0') eval_ppl=tensor(7777.5581, device='cuda:0') eval_epoch_loss=tensor(8.9590, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=4: train_ppl=tensor(3274.8696, device='cuda:0') train_epoch_loss=tensor(8.0940, device='cuda:0') eval_ppl=tensor(4911.7930, device='cuda:0') eval_epoch_loss=tensor(8.4994, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.33it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=5: train_ppl=tensor(811.9193, device='cuda:0') train_epoch_loss=tensor(6.6994, device='cuda:0') eval_ppl=tensor(6605.7725, device='cuda:0') eval_epoch_loss=tensor(8.7957, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=6: train_ppl=tensor(349.5655, device='cuda:0') train_epoch_loss=tensor(5.8567, device='cuda:0') eval_ppl=tensor(12908.9023, device='cuda:0') eval_epoch_loss=tensor(9.4657, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.65it/s]


epoch=7: train_ppl=tensor(282.3629, device='cuda:0') train_epoch_loss=tensor(5.6432, device='cuda:0') eval_ppl=tensor(18044.6543, device='cuda:0') eval_epoch_loss=tensor(9.8006, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=8: train_ppl=tensor(243.4718, device='cuda:0') train_epoch_loss=tensor(5.4950, device='cuda:0') eval_ppl=tensor(16669.2520, device='cuda:0') eval_epoch_loss=tensor(9.7213, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.38it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(194.8473, device='cuda:0') train_epoch_loss=tensor(5.2722, device='cuda:0') eval_ppl=tensor(18875.3047, device='cuda:0') eval_epoch_loss=tensor(9.8456, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=10: train_ppl=tensor(165.3395, device='cuda:0') train_epoch_loss=tensor(5.1080, device='cuda:0') eval_ppl=tensor(28455.3945, device='cuda:0') eval_epoch_loss=tensor(10.2561, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=11: train_ppl=tensor(143.3752, device='cuda:0') train_epoch_loss=tensor(4.9655, device='cuda:0') eval_ppl=tensor(29776.0117, device='cuda:0') eval_epoch_loss=tensor(10.3015, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=12: train_ppl=tensor(106.9004, device='cuda:0') train_epoch_loss=tensor(4.6719, device='cuda:0') eval_ppl=tensor(36787.4414, device='cuda:0') eval_epoch_loss=tensor(10.5129, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.69it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=13: train_ppl=tensor(104.7922, device='cuda:0') train_epoch_loss=tensor(4.6520, device='cuda:0') eval_ppl=tensor(20367.9844, device='cuda:0') eval_epoch_loss=tensor(9.9217, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=14: train_ppl=tensor(72.7380, device='cuda:0') train_epoch_loss=tensor(4.2869, device='cuda:0') eval_ppl=tensor(53271.8086, device='cuda:0') eval_epoch_loss=tensor(10.8832, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=15: train_ppl=tensor(53.4192, device='cuda:0') train_epoch_loss=tensor(3.9782, device='cuda:0') eval_ppl=tensor(85101.2344, device='cuda:0') eval_epoch_loss=tensor(11.3516, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.45it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=16: train_ppl=tensor(44.3690, device='cuda:0') train_epoch_loss=tensor(3.7925, device='cuda:0') eval_ppl=tensor(100129.1797, device='cuda:0') eval_epoch_loss=tensor(11.5142, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=17: train_ppl=tensor(34.3122, device='cuda:0') train_epoch_loss=tensor(3.5355, device='cuda:0') eval_ppl=tensor(77048.8047, device='cuda:0') eval_epoch_loss=tensor(11.2522, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=18: train_ppl=tensor(26.7739, device='cuda:0') train_epoch_loss=tensor(3.2874, device='cuda:0') eval_ppl=tensor(99527.0234, device='cuda:0') eval_epoch_loss=tensor(11.5082, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.70it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=19: train_ppl=tensor(22.4255, device='cuda:0') train_epoch_loss=tensor(3.1102, device='cuda:0') eval_ppl=tensor(145401.7188, device='cuda:0') eval_epoch_loss=tensor(11.8873, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.30it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=20: train_ppl=tensor(29.2556, device='cuda:0') train_epoch_loss=tensor(3.3761, device='cuda:0') eval_ppl=tensor(141533.8750, device='cuda:0') eval_epoch_loss=tensor(11.8603, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=21: train_ppl=tensor(22.8127, device='cuda:0') train_epoch_loss=tensor(3.1273, device='cuda:0') eval_ppl=tensor(112792.1172, device='cuda:0') eval_epoch_loss=tensor(11.6333, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=22: train_ppl=tensor(17.5944, device='cuda:0') train_epoch_loss=tensor(2.8676, device='cuda:0') eval_ppl=tensor(229141.3750, device='cuda:0') eval_epoch_loss=tensor(12.3421, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=23: train_ppl=tensor(17.4579, device='cuda:0') train_epoch_loss=tensor(2.8598, device='cuda:0') eval_ppl=tensor(126170.9297, device='cuda:0') eval_epoch_loss=tensor(11.7454, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=24: train_ppl=tensor(12.7798, device='cuda:0') train_epoch_loss=tensor(2.5479, device='cuda:0') eval_ppl=tensor(79957.8672, device='cuda:0') eval_epoch_loss=tensor(11.2893, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=25: train_ppl=tensor(11.3607, device='cuda:0') train_epoch_loss=tensor(2.4302, device='cuda:0') eval_ppl=tensor(72763.6641, device='cuda:0') eval_epoch_loss=tensor(11.1950, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=26: train_ppl=tensor(8.7195, device='cuda:0') train_epoch_loss=tensor(2.1656, device='cuda:0') eval_ppl=tensor(172356.7188, device='cuda:0') eval_epoch_loss=tensor(12.0573, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.46it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=27: train_ppl=tensor(7.2746, device='cuda:0') train_epoch_loss=tensor(1.9844, device='cuda:0') eval_ppl=tensor(429537.2500, device='cuda:0') eval_epoch_loss=tensor(12.9705, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=28: train_ppl=tensor(6.2448, device='cuda:0') train_epoch_loss=tensor(1.8317, device='cuda:0') eval_ppl=tensor(813246.4375, device='cuda:0') eval_epoch_loss=tensor(13.6088, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=29: train_ppl=tensor(6.5383, device='cuda:0') train_epoch_loss=tensor(1.8777, device='cuda:0') eval_ppl=tensor(764394.0625, device='cuda:0') eval_epoch_loss=tensor(13.5468, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
100%|██████████| 425/425 [00:25<00:00, 16.71it/s]


epoch=30: train_ppl=tensor(4.9109, device='cuda:0') train_epoch_loss=tensor(1.5915, device='cuda:0') eval_ppl=tensor(482998.3750, device='cuda:0') eval_epoch_loss=tensor(13.0878, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.71it/s]


epoch=31: train_ppl=tensor(4.0801, device='cuda:0') train_epoch_loss=tensor(1.4061, device='cuda:0') eval_ppl=tensor(724874.9375, device='cuda:0') eval_epoch_loss=tensor(13.4938, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.39it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=32: train_ppl=tensor(3.4566, device='cuda:0') train_epoch_loss=tensor(1.2403, device='cuda:0') eval_ppl=tensor(1073653.2500, device='cuda:0') eval_epoch_loss=tensor(13.8866, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.28it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=33: train_ppl=tensor(3.0780, device='cuda:0') train_epoch_loss=tensor(1.1243, device='cuda:0') eval_ppl=tensor(1071286.5000, device='cuda:0') eval_epoch_loss=tensor(13.8844, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=34: train_ppl=tensor(2.6685, device='cuda:0') train_epoch_loss=tensor(0.9815, device='cuda:0') eval_ppl=tensor(1113092., device='cuda:0') eval_epoch_loss=tensor(13.9227, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=35: train_ppl=tensor(2.6390, device='cuda:0') train_epoch_loss=tensor(0.9704, device='cuda:0') eval_ppl=tensor(1103362.3750, device='cuda:0') eval_epoch_loss=tensor(13.9139, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=36: train_ppl=tensor(2.3081, device='cuda:0') train_epoch_loss=tensor(0.8364, device='cuda:0') eval_ppl=tensor(871761.6250, device='cuda:0') eval_epoch_loss=tensor(13.6783, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=37: train_ppl=tensor(2.0807, device='cuda:0') train_epoch_loss=tensor(0.7327, device='cuda:0') eval_ppl=tensor(1187785.5000, device='cuda:0') eval_epoch_loss=tensor(13.9876, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.43it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=38: train_ppl=tensor(2.0457, device='cuda:0') train_epoch_loss=tensor(0.7157, device='cuda:0') eval_ppl=tensor(885743.2500, device='cuda:0') eval_epoch_loss=tensor(13.6942, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.45it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=39: train_ppl=tensor(1.8992, device='cuda:0') train_epoch_loss=tensor(0.6415, device='cuda:0') eval_ppl=tensor(960676.9375, device='cuda:0') eval_epoch_loss=tensor(13.7754, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=40: train_ppl=tensor(1.8040, device='cuda:0') train_epoch_loss=tensor(0.5900, device='cuda:0') eval_ppl=tensor(1017149.5625, device='cuda:0') eval_epoch_loss=tensor(13.8325, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=41: train_ppl=tensor(1.7467, device='cuda:0') train_epoch_loss=tensor(0.5577, device='cuda:0') eval_ppl=tensor(954078.5000, device='cuda:0') eval_epoch_loss=tensor(13.7685, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=42: train_ppl=tensor(1.6469, device='cuda:0') train_epoch_loss=tensor(0.4989, device='cuda:0') eval_ppl=tensor(1120674.6250, device='cuda:0') eval_epoch_loss=tensor(13.9294, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.35it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=43: train_ppl=tensor(1.5923, device='cuda:0') train_epoch_loss=tensor(0.4652, device='cuda:0') eval_ppl=tensor(1072183.8750, device='cuda:0') eval_epoch_loss=tensor(13.8852, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=44: train_ppl=tensor(1.6286, device='cuda:0') train_epoch_loss=tensor(0.4877, device='cuda:0') eval_ppl=tensor(1113374.3750, device='cuda:0') eval_epoch_loss=tensor(13.9229, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=45: train_ppl=tensor(1.5516, device='cuda:0') train_epoch_loss=tensor(0.4393, device='cuda:0') eval_ppl=tensor(1385079., device='cuda:0') eval_epoch_loss=tensor(14.1413, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=46: train_ppl=tensor(1.5472, device='cuda:0') train_epoch_loss=tensor(0.4364, device='cuda:0') eval_ppl=tensor(1442916., device='cuda:0') eval_epoch_loss=tensor(14.1822, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=47: train_ppl=tensor(1.4895, device='cuda:0') train_epoch_loss=tensor(0.3984, device='cuda:0') eval_ppl=tensor(1405874.2500, device='cuda:0') eval_epoch_loss=tensor(14.1562, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=48: train_ppl=tensor(1.4705, device='cuda:0') train_epoch_loss=tensor(0.3856, device='cuda:0') eval_ppl=tensor(1411344.3750, device='cuda:0') eval_epoch_loss=tensor(14.1601, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.67it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=49: train_ppl=tensor(1.4615, device='cuda:0') train_epoch_loss=tensor(0.3795, device='cuda:0') eval_ppl=tensor(1428084.3750, device='cuda:0') eval_epoch_loss=tensor(14.1718, device='cuda:0')




0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval_epoch_loss,▂▁▁▁▁▂▂▂▃▃▃▃▄▅▄▅▅▅▆▅▄▅▆▇▇▇███▇█▇█▇██████
eval_ppl,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▁▂▃▅▃▅▆▆▆▅▇▅▆▆▆▆████
train_epoch_loss,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,49.0
eval_epoch_loss,14.17184
eval_ppl,1428084.375
train_epoch_loss,0.37949
train_ppl,1.46153


[34m[1mwandb[0m: Currently logged in as: [33mrobert-belanec[0m ([33mrbelanec[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112471519865923, max=1.0…

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.80it/s]


epoch=0: train_ppl=tensor(7.2794e+12, device='cuda:0') train_epoch_loss=tensor(29.6161, device='cuda:0') eval_ppl=tensor(5357.9365, device='cuda:0') eval_epoch_loss=tensor(8.5863, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.77it/s]


epoch=1: train_ppl=tensor(2528.4409, device='cuda:0') train_epoch_loss=tensor(7.8354, device='cuda:0') eval_ppl=tensor(4073.4800, device='cuda:0') eval_epoch_loss=tensor(8.3123, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.42it/s]
100%|██████████| 425/425 [00:25<00:00, 16.75it/s]


epoch=2: train_ppl=tensor(315.3751, device='cuda:0') train_epoch_loss=tensor(5.7538, device='cuda:0') eval_ppl=tensor(4998.1343, device='cuda:0') eval_epoch_loss=tensor(8.5168, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.38it/s]
100%|██████████| 425/425 [00:25<00:00, 16.66it/s]


epoch=3: train_ppl=tensor(172.4394, device='cuda:0') train_epoch_loss=tensor(5.1500, device='cuda:0') eval_ppl=tensor(4854.7778, device='cuda:0') eval_epoch_loss=tensor(8.4877, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=4: train_ppl=tensor(120.1308, device='cuda:0') train_epoch_loss=tensor(4.7886, device='cuda:0') eval_ppl=tensor(5026.9692, device='cuda:0') eval_epoch_loss=tensor(8.5226, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.37it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=5: train_ppl=tensor(84.0298, device='cuda:0') train_epoch_loss=tensor(4.4312, device='cuda:0') eval_ppl=tensor(5310.4644, device='cuda:0') eval_epoch_loss=tensor(8.5774, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.42it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=6: train_ppl=tensor(64.9872, device='cuda:0') train_epoch_loss=tensor(4.1742, device='cuda:0') eval_ppl=tensor(6183.7319, device='cuda:0') eval_epoch_loss=tensor(8.7297, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=7: train_ppl=tensor(46.9676, device='cuda:0') train_epoch_loss=tensor(3.8495, device='cuda:0') eval_ppl=tensor(5348.3647, device='cuda:0') eval_epoch_loss=tensor(8.5845, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=8: train_ppl=tensor(31.4723, device='cuda:0') train_epoch_loss=tensor(3.4491, device='cuda:0') eval_ppl=tensor(5832.0093, device='cuda:0') eval_epoch_loss=tensor(8.6711, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(21.8920, device='cuda:0') train_epoch_loss=tensor(3.0861, device='cuda:0') eval_ppl=tensor(7713.1997, device='cuda:0') eval_epoch_loss=tensor(8.9507, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=10: train_ppl=tensor(16.9697, device='cuda:0') train_epoch_loss=tensor(2.8314, device='cuda:0') eval_ppl=tensor(13619.3398, device='cuda:0') eval_epoch_loss=tensor(9.5192, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=11: train_ppl=tensor(14.3628, device='cuda:0') train_epoch_loss=tensor(2.6646, device='cuda:0') eval_ppl=tensor(9310.6045, device='cuda:0') eval_epoch_loss=tensor(9.1389, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=12: train_ppl=tensor(9.3392, device='cuda:0') train_epoch_loss=tensor(2.2342, device='cuda:0') eval_ppl=tensor(21019.7441, device='cuda:0') eval_epoch_loss=tensor(9.9532, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=13: train_ppl=tensor(6.3016, device='cuda:0') train_epoch_loss=tensor(1.8408, device='cuda:0') eval_ppl=tensor(24469.5449, device='cuda:0') eval_epoch_loss=tensor(10.1052, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=14: train_ppl=tensor(4.5054, device='cuda:0') train_epoch_loss=tensor(1.5053, device='cuda:0') eval_ppl=tensor(39493.9727, device='cuda:0') eval_epoch_loss=tensor(10.5839, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=15: train_ppl=tensor(3.1072, device='cuda:0') train_epoch_loss=tensor(1.1337, device='cuda:0') eval_ppl=tensor(21519.1367, device='cuda:0') eval_epoch_loss=tensor(9.9767, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=16: train_ppl=tensor(2.3373, device='cuda:0') train_epoch_loss=tensor(0.8490, device='cuda:0') eval_ppl=tensor(20653.5039, device='cuda:0') eval_epoch_loss=tensor(9.9356, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.27it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=17: train_ppl=tensor(1.9580, device='cuda:0') train_epoch_loss=tensor(0.6719, device='cuda:0') eval_ppl=tensor(18780.7324, device='cuda:0') eval_epoch_loss=tensor(9.8406, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.63it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=18: train_ppl=tensor(1.6851, device='cuda:0') train_epoch_loss=tensor(0.5218, device='cuda:0') eval_ppl=tensor(40511.1680, device='cuda:0') eval_epoch_loss=tensor(10.6093, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.63it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=19: train_ppl=tensor(1.8563, device='cuda:0') train_epoch_loss=tensor(0.6186, device='cuda:0') eval_ppl=tensor(23113.3867, device='cuda:0') eval_epoch_loss=tensor(10.0482, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=20: train_ppl=tensor(1.5473, device='cuda:0') train_epoch_loss=tensor(0.4365, device='cuda:0') eval_ppl=tensor(18548.0898, device='cuda:0') eval_epoch_loss=tensor(9.8281, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=21: train_ppl=tensor(1.6197, device='cuda:0') train_epoch_loss=tensor(0.4822, device='cuda:0') eval_ppl=tensor(12760.2627, device='cuda:0') eval_epoch_loss=tensor(9.4541, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=22: train_ppl=tensor(1.4938, device='cuda:0') train_epoch_loss=tensor(0.4014, device='cuda:0') eval_ppl=tensor(16094.8936, device='cuda:0') eval_epoch_loss=tensor(9.6863, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=23: train_ppl=tensor(1.3317, device='cuda:0') train_epoch_loss=tensor(0.2864, device='cuda:0') eval_ppl=tensor(22076.9805, device='cuda:0') eval_epoch_loss=tensor(10.0023, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.36it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=24: train_ppl=tensor(1.2616, device='cuda:0') train_epoch_loss=tensor(0.2324, device='cuda:0') eval_ppl=tensor(47196.5234, device='cuda:0') eval_epoch_loss=tensor(10.7621, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=25: train_ppl=tensor(1.2490, device='cuda:0') train_epoch_loss=tensor(0.2224, device='cuda:0') eval_ppl=tensor(54790.7617, device='cuda:0') eval_epoch_loss=tensor(10.9113, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=26: train_ppl=tensor(1.2555, device='cuda:0') train_epoch_loss=tensor(0.2275, device='cuda:0') eval_ppl=tensor(47388.5625, device='cuda:0') eval_epoch_loss=tensor(10.7661, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=27: train_ppl=tensor(1.2181, device='cuda:0') train_epoch_loss=tensor(0.1973, device='cuda:0') eval_ppl=tensor(54185.1797, device='cuda:0') eval_epoch_loss=tensor(10.9002, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=28: train_ppl=tensor(1.3149, device='cuda:0') train_epoch_loss=tensor(0.2738, device='cuda:0') eval_ppl=tensor(35922.7852, device='cuda:0') eval_epoch_loss=tensor(10.4891, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=29: train_ppl=tensor(1.2037, device='cuda:0') train_epoch_loss=tensor(0.1854, device='cuda:0') eval_ppl=tensor(52663.0273, device='cuda:0') eval_epoch_loss=tensor(10.8717, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=30: train_ppl=tensor(1.1753, device='cuda:0') train_epoch_loss=tensor(0.1615, device='cuda:0') eval_ppl=tensor(64595.6914, device='cuda:0') eval_epoch_loss=tensor(11.0759, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.36it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=31: train_ppl=tensor(1.3131, device='cuda:0') train_epoch_loss=tensor(0.2724, device='cuda:0') eval_ppl=tensor(30423.0566, device='cuda:0') eval_epoch_loss=tensor(10.3230, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=32: train_ppl=tensor(1.2615, device='cuda:0') train_epoch_loss=tensor(0.2323, device='cuda:0') eval_ppl=tensor(40267.1992, device='cuda:0') eval_epoch_loss=tensor(10.6033, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=33: train_ppl=tensor(1.2114, device='cuda:0') train_epoch_loss=tensor(0.1918, device='cuda:0') eval_ppl=tensor(60221.0625, device='cuda:0') eval_epoch_loss=tensor(11.0058, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.44it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=34: train_ppl=tensor(1.1612, device='cuda:0') train_epoch_loss=tensor(0.1495, device='cuda:0') eval_ppl=tensor(50349.6016, device='cuda:0') eval_epoch_loss=tensor(10.8267, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.35it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=35: train_ppl=tensor(1.1786, device='cuda:0') train_epoch_loss=tensor(0.1643, device='cuda:0') eval_ppl=tensor(65326.4023, device='cuda:0') eval_epoch_loss=tensor(11.0872, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=36: train_ppl=tensor(1.1537, device='cuda:0') train_epoch_loss=tensor(0.1430, device='cuda:0') eval_ppl=tensor(58730.6914, device='cuda:0') eval_epoch_loss=tensor(10.9807, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=37: train_ppl=tensor(1.1450, device='cuda:0') train_epoch_loss=tensor(0.1354, device='cuda:0') eval_ppl=tensor(70118.4844, device='cuda:0') eval_epoch_loss=tensor(11.1579, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=38: train_ppl=tensor(1.1242, device='cuda:0') train_epoch_loss=tensor(0.1171, device='cuda:0') eval_ppl=tensor(73181.1406, device='cuda:0') eval_epoch_loss=tensor(11.2007, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=39: train_ppl=tensor(1.1663, device='cuda:0') train_epoch_loss=tensor(0.1539, device='cuda:0') eval_ppl=tensor(68697.5859, device='cuda:0') eval_epoch_loss=tensor(11.1375, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=40: train_ppl=tensor(1.1734, device='cuda:0') train_epoch_loss=tensor(0.1599, device='cuda:0') eval_ppl=tensor(49961.3516, device='cuda:0') eval_epoch_loss=tensor(10.8190, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=41: train_ppl=tensor(1.0958, device='cuda:0') train_epoch_loss=tensor(0.0915, device='cuda:0') eval_ppl=tensor(55654.0156, device='cuda:0') eval_epoch_loss=tensor(10.9269, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=42: train_ppl=tensor(1.0932, device='cuda:0') train_epoch_loss=tensor(0.0891, device='cuda:0') eval_ppl=tensor(67724.8203, device='cuda:0') eval_epoch_loss=tensor(11.1232, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=43: train_ppl=tensor(1.0979, device='cuda:0') train_epoch_loss=tensor(0.0934, device='cuda:0') eval_ppl=tensor(80168.6875, device='cuda:0') eval_epoch_loss=tensor(11.2919, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=48: train_ppl=tensor(1.0650, device='cuda:0') train_epoch_loss=tensor(0.0630, device='cuda:0') eval_ppl=tensor(85026.9297, device='cuda:0') eval_epoch_loss=tensor(11.3507, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]

epoch=49: train_ppl=tensor(1.0647, device='cuda:0') train_epoch_loss=tensor(0.0627, device='cuda:0') eval_ppl=tensor(84989.6328, device='cuda:0') eval_epoch_loss=tensor(11.3503, device='cuda:0')







VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval_epoch_loss,▂▁▁▁▂▂▂▂▄▃▅▅▅▅▅▆▄▄▄▅▇▇▇▆▇▆▆▇▇▇██▇▇▇█████
eval_ppl,▁▁▁▁▁▁▁▁▂▁▂▃▃▂▂▄▂▂▂▃▅▅▅▄▆▃▄▆▆▆▇▇▅▅▇█▇███
train_epoch_loss,█▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,49.0
eval_epoch_loss,11.35028
eval_ppl,84989.63281
train_epoch_loss,0.06272
train_ppl,1.06473


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112396111194458, max=1.0…

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.82it/s]


epoch=0: train_ppl=tensor(4.7644e+15, device='cuda:0') train_epoch_loss=tensor(36.0999, device='cuda:0') eval_ppl=tensor(14897.1309, device='cuda:0') eval_epoch_loss=tensor(9.6089, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.31it/s]
100%|██████████| 425/425 [00:25<00:00, 16.76it/s]


epoch=1: train_ppl=tensor(91517.4453, device='cuda:0') train_epoch_loss=tensor(11.4243, device='cuda:0') eval_ppl=tensor(8017.8584, device='cuda:0') eval_epoch_loss=tensor(8.9894, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.40it/s]
100%|██████████| 425/425 [00:25<00:00, 16.72it/s]


epoch=2: train_ppl=tensor(4147.0581, device='cuda:0') train_epoch_loss=tensor(8.3302, device='cuda:0') eval_ppl=tensor(11366.3467, device='cuda:0') eval_epoch_loss=tensor(9.3384, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.24it/s]
100%|██████████| 425/425 [00:25<00:00, 16.66it/s]


epoch=3: train_ppl=tensor(655.2675, device='cuda:0') train_epoch_loss=tensor(6.4850, device='cuda:0') eval_ppl=tensor(10403.4844, device='cuda:0') eval_epoch_loss=tensor(9.2499, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=4: train_ppl=tensor(303.9400, device='cuda:0') train_epoch_loss=tensor(5.7168, device='cuda:0') eval_ppl=tensor(14210.7393, device='cuda:0') eval_epoch_loss=tensor(9.5618, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.35it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=5: train_ppl=tensor(215.4874, device='cuda:0') train_epoch_loss=tensor(5.3729, device='cuda:0') eval_ppl=tensor(14155.8242, device='cuda:0') eval_epoch_loss=tensor(9.5579, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=6: train_ppl=tensor(170.3973, device='cuda:0') train_epoch_loss=tensor(5.1381, device='cuda:0') eval_ppl=tensor(12769.9043, device='cuda:0') eval_epoch_loss=tensor(9.4548, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=7: train_ppl=tensor(130.5339, device='cuda:0') train_epoch_loss=tensor(4.8716, device='cuda:0') eval_ppl=tensor(19752.5137, device='cuda:0') eval_epoch_loss=tensor(9.8910, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.30it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=8: train_ppl=tensor(96.5464, device='cuda:0') train_epoch_loss=tensor(4.5700, device='cuda:0') eval_ppl=tensor(28497.2441, device='cuda:0') eval_epoch_loss=tensor(10.2576, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(68.4862, device='cuda:0') train_epoch_loss=tensor(4.2266, device='cuda:0') eval_ppl=tensor(38949.0703, device='cuda:0') eval_epoch_loss=tensor(10.5700, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=10: train_ppl=tensor(52.7933, device='cuda:0') train_epoch_loss=tensor(3.9664, device='cuda:0') eval_ppl=tensor(42054.0625, device='cuda:0') eval_epoch_loss=tensor(10.6467, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
 38%|███▊      | 162/425 [00:09<00:15, 16.69it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=18: train_ppl=tensor(38.2894, device='cuda:0') train_epoch_loss=tensor(3.6452, device='cuda:0') eval_ppl=tensor(185658.5156, device='cuda:0') eval_epoch_loss=tensor(12.1317, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.38it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=19: train_ppl=tensor(35.0785, device='cuda:0') train_epoch_loss=tensor(3.5576, device='cuda:0') eval_ppl=tensor(145508.3906, device='cuda:0') eval_epoch_loss=tensor(11.8880, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.46it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=20: train_ppl=tensor(26.5294, device='cuda:0') train_epoch_loss=tensor(3.2783, device='cuda:0') eval_ppl=tensor(208657.4844, device='cuda:0') eval_epoch_loss=tensor(12.2484, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=21: train_ppl=tensor(22.6208, device='cuda:0') train_epoch_loss=tensor(3.1189, device='cuda:0') eval_ppl=tensor(303169.8125, device='cuda:0') eval_epoch_loss=tensor(12.6220, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=22: train_ppl=tensor(19.2612, device='cuda:0') train_epoch_loss=tensor(2.9581, device='cuda:0') eval_ppl=tensor(278526.5625, device='cuda:0') eval_epoch_loss=tensor(12.5373, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.37it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=23: train_ppl=tensor(16.4537, device='cuda:0') train_epoch_loss=tensor(2.8006, device='cuda:0') eval_ppl=tensor(173137.1094, device='cuda:0') eval_epoch_loss=tensor(12.0618, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=24: train_ppl=tensor(14.2578, device='cuda:0') train_epoch_loss=tensor(2.6573, device='cuda:0') eval_ppl=tensor(338701., device='cuda:0') eval_epoch_loss=tensor(12.7329, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=25: train_ppl=tensor(11.9651, device='cuda:0') train_epoch_loss=tensor(2.4820, device='cuda:0') eval_ppl=tensor(364981.1562, device='cuda:0') eval_epoch_loss=tensor(12.8076, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=26: train_ppl=tensor(9.7725, device='cuda:0') train_epoch_loss=tensor(2.2796, device='cuda:0') eval_ppl=tensor(494010.2500, device='cuda:0') eval_epoch_loss=tensor(13.1103, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=27: train_ppl=tensor(8.4536, device='cuda:0') train_epoch_loss=tensor(2.1346, device='cuda:0') eval_ppl=tensor(785310.4375, device='cuda:0') eval_epoch_loss=tensor(13.5738, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=28: train_ppl=tensor(7.3200, device='cuda:0') train_epoch_loss=tensor(1.9906, device='cuda:0') eval_ppl=tensor(1224031., device='cuda:0') eval_epoch_loss=tensor(14.0177, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=29: train_ppl=tensor(7.5225, device='cuda:0') train_epoch_loss=tensor(2.0179, device='cuda:0') eval_ppl=tensor(1120122.2500, device='cuda:0') eval_epoch_loss=tensor(13.9289, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.28it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=30: train_ppl=tensor(7.7406, device='cuda:0') train_epoch_loss=tensor(2.0465, device='cuda:0') eval_ppl=tensor(346346.7188, device='cuda:0') eval_epoch_loss=tensor(12.7552, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=31: train_ppl=tensor(6.0760, device='cuda:0') train_epoch_loss=tensor(1.8043, device='cuda:0') eval_ppl=tensor(832451.8125, device='cuda:0') eval_epoch_loss=tensor(13.6321, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=32: train_ppl=tensor(5.1913, device='cuda:0') train_epoch_loss=tensor(1.6470, device='cuda:0') eval_ppl=tensor(594128.1250, device='cuda:0') eval_epoch_loss=tensor(13.2949, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=33: train_ppl=tensor(4.5196, device='cuda:0') train_epoch_loss=tensor(1.5084, device='cuda:0') eval_ppl=tensor(535598.0625, device='cuda:0') eval_epoch_loss=tensor(13.1911, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.44it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=34: train_ppl=tensor(3.8999, device='cuda:0') train_epoch_loss=tensor(1.3610, device='cuda:0') eval_ppl=tensor(684639.9375, device='cuda:0') eval_epoch_loss=tensor(13.4366, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=35: train_ppl=tensor(3.7010, device='cuda:0') train_epoch_loss=tensor(1.3086, device='cuda:0') eval_ppl=tensor(639247.3125, device='cuda:0') eval_epoch_loss=tensor(13.3680, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=36: train_ppl=tensor(3.2720, device='cuda:0') train_epoch_loss=tensor(1.1854, device='cuda:0') eval_ppl=tensor(693875.2500, device='cuda:0') eval_epoch_loss=tensor(13.4500, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=37: train_ppl=tensor(3.1154, device='cuda:0') train_epoch_loss=tensor(1.1364, device='cuda:0') eval_ppl=tensor(682786.1875, device='cuda:0') eval_epoch_loss=tensor(13.4339, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=38: train_ppl=tensor(2.7908, device='cuda:0') train_epoch_loss=tensor(1.0263, device='cuda:0') eval_ppl=tensor(836578.3125, device='cuda:0') eval_epoch_loss=tensor(13.6371, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.81it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=39: train_ppl=tensor(2.7524, device='cuda:0') train_epoch_loss=tensor(1.0125, device='cuda:0') eval_ppl=tensor(887552.7500, device='cuda:0') eval_epoch_loss=tensor(13.6962, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.67it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=40: train_ppl=tensor(2.5650, device='cuda:0') train_epoch_loss=tensor(0.9420, device='cuda:0') eval_ppl=tensor(853334.6875, device='cuda:0') eval_epoch_loss=tensor(13.6569, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=41: train_ppl=tensor(2.4381, device='cuda:0') train_epoch_loss=tensor(0.8912, device='cuda:0') eval_ppl=tensor(701939.3125, device='cuda:0') eval_epoch_loss=tensor(13.4616, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=42: train_ppl=tensor(2.4551, device='cuda:0') train_epoch_loss=tensor(0.8982, device='cuda:0') eval_ppl=tensor(621940.1250, device='cuda:0') eval_epoch_loss=tensor(13.3406, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=43: train_ppl=tensor(2.4294, device='cuda:0') train_epoch_loss=tensor(0.8877, device='cuda:0') eval_ppl=tensor(672859.9375, device='cuda:0') eval_epoch_loss=tensor(13.4193, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=44: train_ppl=tensor(2.2586, device='cuda:0') train_epoch_loss=tensor(0.8147, device='cuda:0') eval_ppl=tensor(601433.1875, device='cuda:0') eval_epoch_loss=tensor(13.3071, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=45: train_ppl=tensor(2.3241, device='cuda:0') train_epoch_loss=tensor(0.8433, device='cuda:0') eval_ppl=tensor(539859., device='cuda:0') eval_epoch_loss=tensor(13.1991, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=46: train_ppl=tensor(2.2091, device='cuda:0') train_epoch_loss=tensor(0.7926, device='cuda:0') eval_ppl=tensor(677158.8125, device='cuda:0') eval_epoch_loss=tensor(13.4257, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=47: train_ppl=tensor(2.1983, device='cuda:0') train_epoch_loss=tensor(0.7877, device='cuda:0') eval_ppl=tensor(600535.6250, device='cuda:0') eval_epoch_loss=tensor(13.3056, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=48: train_ppl=tensor(2.0748, device='cuda:0') train_epoch_loss=tensor(0.7299, device='cuda:0') eval_ppl=tensor(611419., device='cuda:0') eval_epoch_loss=tensor(13.3235, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]

epoch=49: train_ppl=tensor(2.0661, device='cuda:0') train_epoch_loss=tensor(0.7257, device='cuda:0') eval_ppl=tensor(643179.3750, device='cuda:0') eval_epoch_loss=tensor(13.3742, device='cuda:0')







VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval_epoch_loss,▃▁▁▁▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇█▇█▇▇▇▇▇██▇▇▇▇▇▇▇
eval_ppl,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂▂▂▃▃▂▃▄▅█▃▆▄▄▅▅▅▆▆▅▅▅▄▅▄▅
train_epoch_loss,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,49.0
eval_epoch_loss,13.37418
eval_ppl,643179.375
train_epoch_loss,0.72566
train_ppl,2.06609


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112357823488612, max=1.0…

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.74it/s]


epoch=0: train_ppl=tensor(9.7592e+11, device='cuda:0') train_epoch_loss=tensor(27.6066, device='cuda:0') eval_ppl=tensor(3808.4226, device='cuda:0') eval_epoch_loss=tensor(8.2450, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.68it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=1: train_ppl=tensor(1877.3406, device='cuda:0') train_epoch_loss=tensor(7.5376, device='cuda:0') eval_ppl=tensor(4270.9048, device='cuda:0') eval_epoch_loss=tensor(8.3596, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=2: train_ppl=tensor(418.1140, device='cuda:0') train_epoch_loss=tensor(6.0358, device='cuda:0') eval_ppl=tensor(5108.7871, device='cuda:0') eval_epoch_loss=tensor(8.5387, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.66it/s]


epoch=3: train_ppl=tensor(238.1051, device='cuda:0') train_epoch_loss=tensor(5.4727, device='cuda:0') eval_ppl=tensor(7422.0562, device='cuda:0') eval_epoch_loss=tensor(8.9122, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.35it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=4: train_ppl=tensor(159.7598, device='cuda:0') train_epoch_loss=tensor(5.0737, device='cuda:0') eval_ppl=tensor(5416.7275, device='cuda:0') eval_epoch_loss=tensor(8.5972, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=5: train_ppl=tensor(116.8667, device='cuda:0') train_epoch_loss=tensor(4.7610, device='cuda:0') eval_ppl=tensor(7315.7388, device='cuda:0') eval_epoch_loss=tensor(8.8978, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.35it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=6: train_ppl=tensor(83.2340, device='cuda:0') train_epoch_loss=tensor(4.4217, device='cuda:0') eval_ppl=tensor(7337.4478, device='cuda:0') eval_epoch_loss=tensor(8.9007, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.36it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=7: train_ppl=tensor(59.6598, device='cuda:0') train_epoch_loss=tensor(4.0887, device='cuda:0') eval_ppl=tensor(28444.3789, device='cuda:0') eval_epoch_loss=tensor(10.2557, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.48it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=8: train_ppl=tensor(41.5877, device='cuda:0') train_epoch_loss=tensor(3.7278, device='cuda:0') eval_ppl=tensor(34023.6641, device='cuda:0') eval_epoch_loss=tensor(10.4348, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(30.0099, device='cuda:0') train_epoch_loss=tensor(3.4015, device='cuda:0') eval_ppl=tensor(285137.5938, device='cuda:0') eval_epoch_loss=tensor(12.5607, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.34it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=10: train_ppl=tensor(22.6456, device='cuda:0') train_epoch_loss=tensor(3.1200, device='cuda:0') eval_ppl=tensor(1.4031e+09, device='cuda:0') eval_epoch_loss=tensor(21.0620, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.39it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=11: train_ppl=tensor(16.3238, device='cuda:0') train_epoch_loss=tensor(2.7926, device='cuda:0') eval_ppl=tensor(1.0916e+09, device='cuda:0') eval_epoch_loss=tensor(20.8109, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=12: train_ppl=tensor(16.7979, device='cuda:0') train_epoch_loss=tensor(2.8213, device='cuda:0') eval_ppl=tensor(22090.2480, device='cuda:0') eval_epoch_loss=tensor(10.0029, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=13: train_ppl=tensor(13.0748, device='cuda:0') train_epoch_loss=tensor(2.5707, device='cuda:0') eval_ppl=tensor(35155.5547, device='cuda:0') eval_epoch_loss=tensor(10.4675, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.34it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=14: train_ppl=tensor(9.1235, device='cuda:0') train_epoch_loss=tensor(2.2109, device='cuda:0') eval_ppl=tensor(63348.4102, device='cuda:0') eval_epoch_loss=tensor(11.0564, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.24it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=15: train_ppl=tensor(6.4077, device='cuda:0') train_epoch_loss=tensor(1.8575, device='cuda:0') eval_ppl=tensor(69794.5234, device='cuda:0') eval_epoch_loss=tensor(11.1533, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=16: train_ppl=tensor(4.6780, device='cuda:0') train_epoch_loss=tensor(1.5429, device='cuda:0') eval_ppl=tensor(120622.7578, device='cuda:0') eval_epoch_loss=tensor(11.7004, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=17: train_ppl=tensor(3.9067, device='cuda:0') train_epoch_loss=tensor(1.3627, device='cuda:0') eval_ppl=tensor(77845.7500, device='cuda:0') eval_epoch_loss=tensor(11.2625, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=18: train_ppl=tensor(2.9489, device='cuda:0') train_epoch_loss=tensor(1.0814, device='cuda:0') eval_ppl=tensor(139450.0625, device='cuda:0') eval_epoch_loss=tensor(11.8455, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=19: train_ppl=tensor(2.2711, device='cuda:0') train_epoch_loss=tensor(0.8203, device='cuda:0') eval_ppl=tensor(187529.2812, device='cuda:0') eval_epoch_loss=tensor(12.1417, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.71it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=20: train_ppl=tensor(1.9107, device='cuda:0') train_epoch_loss=tensor(0.6475, device='cuda:0') eval_ppl=tensor(174830.5156, device='cuda:0') eval_epoch_loss=tensor(12.0716, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.37it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=21: train_ppl=tensor(1.8769, device='cuda:0') train_epoch_loss=tensor(0.6296, device='cuda:0') eval_ppl=tensor(143222.0312, device='cuda:0') eval_epoch_loss=tensor(11.8722, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=22: train_ppl=tensor(1.5187, device='cuda:0') train_epoch_loss=tensor(0.4178, device='cuda:0') eval_ppl=tensor(242109.9375, device='cuda:0') eval_epoch_loss=tensor(12.3971, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=23: train_ppl=tensor(1.4759, device='cuda:0') train_epoch_loss=tensor(0.3893, device='cuda:0') eval_ppl=tensor(294534.6875, device='cuda:0') eval_epoch_loss=tensor(12.5932, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=24: train_ppl=tensor(1.4041, device='cuda:0') train_epoch_loss=tensor(0.3394, device='cuda:0') eval_ppl=tensor(192540.0312, device='cuda:0') eval_epoch_loss=tensor(12.1681, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=25: train_ppl=tensor(1.4311, device='cuda:0') train_epoch_loss=tensor(0.3584, device='cuda:0') eval_ppl=tensor(203920.1875, device='cuda:0') eval_epoch_loss=tensor(12.2255, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=26: train_ppl=tensor(1.5717, device='cuda:0') train_epoch_loss=tensor(0.4522, device='cuda:0') eval_ppl=tensor(277300.4688, device='cuda:0') eval_epoch_loss=tensor(12.5329, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=27: train_ppl=tensor(1.4317, device='cuda:0') train_epoch_loss=tensor(0.3589, device='cuda:0') eval_ppl=tensor(164153.8281, device='cuda:0') eval_epoch_loss=tensor(12.0086, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.69it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=28: train_ppl=tensor(1.4700, device='cuda:0') train_epoch_loss=tensor(0.3852, device='cuda:0') eval_ppl=tensor(112701.1562, device='cuda:0') eval_epoch_loss=tensor(11.6325, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=29: train_ppl=tensor(1.3476, device='cuda:0') train_epoch_loss=tensor(0.2983, device='cuda:0') eval_ppl=tensor(179334.7812, device='cuda:0') eval_epoch_loss=tensor(12.0970, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=30: train_ppl=tensor(1.3449, device='cuda:0') train_epoch_loss=tensor(0.2963, device='cuda:0') eval_ppl=tensor(229386.7031, device='cuda:0') eval_epoch_loss=tensor(12.3432, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.37it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=31: train_ppl=tensor(1.2348, device='cuda:0') train_epoch_loss=tensor(0.2109, device='cuda:0') eval_ppl=tensor(253246.0312, device='cuda:0') eval_epoch_loss=tensor(12.4421, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=32: train_ppl=tensor(1.2419, device='cuda:0') train_epoch_loss=tensor(0.2166, device='cuda:0') eval_ppl=tensor(366870.8438, device='cuda:0') eval_epoch_loss=tensor(12.8128, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=33: train_ppl=tensor(1.2718, device='cuda:0') train_epoch_loss=tensor(0.2404, device='cuda:0') eval_ppl=tensor(402079.7188, device='cuda:0') eval_epoch_loss=tensor(12.9044, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.43it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=34: train_ppl=tensor(1.2807, device='cuda:0') train_epoch_loss=tensor(0.2474, device='cuda:0') eval_ppl=tensor(275142.2188, device='cuda:0') eval_epoch_loss=tensor(12.5250, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=35: train_ppl=tensor(1.2713, device='cuda:0') train_epoch_loss=tensor(0.2401, device='cuda:0') eval_ppl=tensor(392261.6875, device='cuda:0') eval_epoch_loss=tensor(12.8797, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=36: train_ppl=tensor(1.2221, device='cuda:0') train_epoch_loss=tensor(0.2005, device='cuda:0') eval_ppl=tensor(443642.0625, device='cuda:0') eval_epoch_loss=tensor(13.0028, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=37: train_ppl=tensor(1.2165, device='cuda:0') train_epoch_loss=tensor(0.1960, device='cuda:0') eval_ppl=tensor(499801.5000, device='cuda:0') eval_epoch_loss=tensor(13.1220, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=38: train_ppl=tensor(1.1929, device='cuda:0') train_epoch_loss=tensor(0.1764, device='cuda:0') eval_ppl=tensor(703960.5000, device='cuda:0') eval_epoch_loss=tensor(13.4645, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=39: train_ppl=tensor(1.2159, device='cuda:0') train_epoch_loss=tensor(0.1955, device='cuda:0') eval_ppl=tensor(933623.1250, device='cuda:0') eval_epoch_loss=tensor(13.7468, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=40: train_ppl=tensor(1.1975, device='cuda:0') train_epoch_loss=tensor(0.1802, device='cuda:0') eval_ppl=tensor(868784.5625, device='cuda:0') eval_epoch_loss=tensor(13.6749, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.41it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=41: train_ppl=tensor(1.2035, device='cuda:0') train_epoch_loss=tensor(0.1853, device='cuda:0') eval_ppl=tensor(816294.7500, device='cuda:0') eval_epoch_loss=tensor(13.6125, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=42: train_ppl=tensor(1.1748, device='cuda:0') train_epoch_loss=tensor(0.1611, device='cuda:0') eval_ppl=tensor(932279.5625, device='cuda:0') eval_epoch_loss=tensor(13.7454, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.79it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=43: train_ppl=tensor(1.1749, device='cuda:0') train_epoch_loss=tensor(0.1612, device='cuda:0') eval_ppl=tensor(1037073.5625, device='cuda:0') eval_epoch_loss=tensor(13.8519, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=44: train_ppl=tensor(1.1760, device='cuda:0') train_epoch_loss=tensor(0.1622, device='cuda:0') eval_ppl=tensor(1091108., device='cuda:0') eval_epoch_loss=tensor(13.9027, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=45: train_ppl=tensor(1.1551, device='cuda:0') train_epoch_loss=tensor(0.1442, device='cuda:0') eval_ppl=tensor(1154852.5000, device='cuda:0') eval_epoch_loss=tensor(13.9595, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.31it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=46: train_ppl=tensor(1.1625, device='cuda:0') train_epoch_loss=tensor(0.1506, device='cuda:0') eval_ppl=tensor(1273828., device='cuda:0') eval_epoch_loss=tensor(14.0575, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=47: train_ppl=tensor(1.1646, device='cuda:0') train_epoch_loss=tensor(0.1524, device='cuda:0') eval_ppl=tensor(1374541.3750, device='cuda:0') eval_epoch_loss=tensor(14.1336, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=48: train_ppl=tensor(1.1486, device='cuda:0') train_epoch_loss=tensor(0.1386, device='cuda:0') eval_ppl=tensor(1354205.8750, device='cuda:0') eval_epoch_loss=tensor(14.1187, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]

epoch=49: train_ppl=tensor(1.1539, device='cuda:0') train_epoch_loss=tensor(0.1432, device='cuda:0') eval_ppl=tensor(1375652.2500, device='cuda:0') eval_epoch_loss=tensor(14.1344, device='cuda:0')







VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval_epoch_loss,▁▁▁▁▁▁▂▂██▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄
eval_ppl,▁▁▁▁▁▁▁▁█▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_epoch_loss,█▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,49.0
eval_epoch_loss,14.13444
eval_ppl,1375652.25
train_epoch_loss,0.14315
train_ppl,1.15391


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112443675907949, max=1.0…

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.79it/s]


epoch=0: train_ppl=tensor(5.7300e+11, device='cuda:0') train_epoch_loss=tensor(27.0742, device='cuda:0') eval_ppl=tensor(3204.5491, device='cuda:0') eval_epoch_loss=tensor(8.0723, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=1: train_ppl=tensor(1394.1077, device='cuda:0') train_epoch_loss=tensor(7.2400, device='cuda:0') eval_ppl=tensor(2450.0789, device='cuda:0') eval_epoch_loss=tensor(7.8039, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=2: train_ppl=tensor(345.7880, device='cuda:0') train_epoch_loss=tensor(5.8458, device='cuda:0') eval_ppl=tensor(3716.4062, device='cuda:0') eval_epoch_loss=tensor(8.2205, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.41it/s]
100%|██████████| 425/425 [00:25<00:00, 16.65it/s]


epoch=3: train_ppl=tensor(225.4353, device='cuda:0') train_epoch_loss=tensor(5.4180, device='cuda:0') eval_ppl=tensor(4673.5830, device='cuda:0') eval_epoch_loss=tensor(8.4497, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=4: train_ppl=tensor(154.1380, device='cuda:0') train_epoch_loss=tensor(5.0378, device='cuda:0') eval_ppl=tensor(4852.7085, device='cuda:0') eval_epoch_loss=tensor(8.4873, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.27it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=5: train_ppl=tensor(138.1260, device='cuda:0') train_epoch_loss=tensor(4.9282, device='cuda:0') eval_ppl=tensor(3876.4622, device='cuda:0') eval_epoch_loss=tensor(8.2627, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.72it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=6: train_ppl=tensor(92.0394, device='cuda:0') train_epoch_loss=tensor(4.5222, device='cuda:0') eval_ppl=tensor(4626.1392, device='cuda:0') eval_epoch_loss=tensor(8.4395, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.42it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=7: train_ppl=tensor(65.2345, device='cuda:0') train_epoch_loss=tensor(4.1780, device='cuda:0') eval_ppl=tensor(7032.9551, device='cuda:0') eval_epoch_loss=tensor(8.8584, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=8: train_ppl=tensor(49.5112, device='cuda:0') train_epoch_loss=tensor(3.9022, device='cuda:0') eval_ppl=tensor(11628.8701, device='cuda:0') eval_epoch_loss=tensor(9.3612, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(37.9270, device='cuda:0') train_epoch_loss=tensor(3.6357, device='cuda:0') eval_ppl=tensor(19085.5098, device='cuda:0') eval_epoch_loss=tensor(9.8567, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.63it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=10: train_ppl=tensor(28.9936, device='cuda:0') train_epoch_loss=tensor(3.3671, device='cuda:0') eval_ppl=tensor(23025.7344, device='cuda:0') eval_epoch_loss=tensor(10.0444, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=15: train_ppl=tensor(7.8520, device='cuda:0') train_epoch_loss=tensor(2.0608, device='cuda:0') eval_ppl=tensor(237179.7344, device='cuda:0') eval_epoch_loss=tensor(12.3766, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.30it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=16: train_ppl=tensor(9.9369, device='cuda:0') train_epoch_loss=tensor(2.2963, device='cuda:0') eval_ppl=tensor(155709.0938, device='cuda:0') eval_epoch_loss=tensor(11.9557, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.45it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=17: train_ppl=tensor(6.1648, device='cuda:0') train_epoch_loss=tensor(1.8189, device='cuda:0') eval_ppl=tensor(193082.4844, device='cuda:0') eval_epoch_loss=tensor(12.1709, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=18: train_ppl=tensor(3.7944, device='cuda:0') train_epoch_loss=tensor(1.3335, device='cuda:0') eval_ppl=tensor(161418.7812, device='cuda:0') eval_epoch_loss=tensor(11.9918, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=19: train_ppl=tensor(2.7619, device='cuda:0') train_epoch_loss=tensor(1.0159, device='cuda:0') eval_ppl=tensor(231952.7656, device='cuda:0') eval_epoch_loss=tensor(12.3543, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=20: train_ppl=tensor(2.6627, device='cuda:0') train_epoch_loss=tensor(0.9793, device='cuda:0') eval_ppl=tensor(321025.4062, device='cuda:0') eval_epoch_loss=tensor(12.6793, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=21: train_ppl=tensor(2.2777, device='cuda:0') train_epoch_loss=tensor(0.8232, device='cuda:0') eval_ppl=tensor(198114.4062, device='cuda:0') eval_epoch_loss=tensor(12.1966, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=22: train_ppl=tensor(2.5316, device='cuda:0') train_epoch_loss=tensor(0.9288, device='cuda:0') eval_ppl=tensor(175749.4531, device='cuda:0') eval_epoch_loss=tensor(12.0768, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=23: train_ppl=tensor(2.3245, device='cuda:0') train_epoch_loss=tensor(0.8435, device='cuda:0') eval_ppl=tensor(241949.2969, device='cuda:0') eval_epoch_loss=tensor(12.3965, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=24: train_ppl=tensor(1.8042, device='cuda:0') train_epoch_loss=tensor(0.5901, device='cuda:0') eval_ppl=tensor(411535.8125, device='cuda:0') eval_epoch_loss=tensor(12.9277, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=25: train_ppl=tensor(1.6528, device='cuda:0') train_epoch_loss=tensor(0.5024, device='cuda:0') eval_ppl=tensor(368994.7500, device='cuda:0') eval_epoch_loss=tensor(12.8185, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=26: train_ppl=tensor(1.5800, device='cuda:0') train_epoch_loss=tensor(0.4574, device='cuda:0') eval_ppl=tensor(423990.0312, device='cuda:0') eval_epoch_loss=tensor(12.9575, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=27: train_ppl=tensor(2.0621, device='cuda:0') train_epoch_loss=tensor(0.7237, device='cuda:0') eval_ppl=tensor(241429.0625, device='cuda:0') eval_epoch_loss=tensor(12.3943, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
 89%|████████▉ | 378/425 [00:22<00:02, 16.68it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=33: train_ppl=tensor(1.3190, device='cuda:0') train_epoch_loss=tensor(0.2769, device='cuda:0') eval_ppl=tensor(701364.5000, device='cuda:0') eval_epoch_loss=tensor(13.4608, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=34: train_ppl=tensor(1.2836, device='cuda:0') train_epoch_loss=tensor(0.2496, device='cuda:0') eval_ppl=tensor(1047648.4375, device='cuda:0') eval_epoch_loss=tensor(13.8621, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.46it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=35: train_ppl=tensor(1.3235, device='cuda:0') train_epoch_loss=tensor(0.2803, device='cuda:0') eval_ppl=tensor(1390879.3750, device='cuda:0') eval_epoch_loss=tensor(14.1454, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=36: train_ppl=tensor(1.2597, device='cuda:0') train_epoch_loss=tensor(0.2309, device='cuda:0') eval_ppl=tensor(1500060.5000, device='cuda:0') eval_epoch_loss=tensor(14.2210, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.29it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=37: train_ppl=tensor(1.2843, device='cuda:0') train_epoch_loss=tensor(0.2502, device='cuda:0') eval_ppl=tensor(1984312.2500, device='cuda:0') eval_epoch_loss=tensor(14.5008, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.44it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=38: train_ppl=tensor(1.2676, device='cuda:0') train_epoch_loss=tensor(0.2372, device='cuda:0') eval_ppl=tensor(1999764.7500, device='cuda:0') eval_epoch_loss=tensor(14.5085, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=39: train_ppl=tensor(1.2475, device='cuda:0') train_epoch_loss=tensor(0.2211, device='cuda:0') eval_ppl=tensor(1903472.3750, device='cuda:0') eval_epoch_loss=tensor(14.4592, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=40: train_ppl=tensor(1.2587, device='cuda:0') train_epoch_loss=tensor(0.2301, device='cuda:0') eval_ppl=tensor(3032289.7500, device='cuda:0') eval_epoch_loss=tensor(14.9248, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.38it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=41: train_ppl=tensor(1.2367, device='cuda:0') train_epoch_loss=tensor(0.2125, device='cuda:0') eval_ppl=tensor(2745238.2500, device='cuda:0') eval_epoch_loss=tensor(14.8254, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=42: train_ppl=tensor(1.3137, device='cuda:0') train_epoch_loss=tensor(0.2729, device='cuda:0') eval_ppl=tensor(1876383.2500, device='cuda:0') eval_epoch_loss=tensor(14.4449, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=43: train_ppl=tensor(1.2588, device='cuda:0') train_epoch_loss=tensor(0.2302, device='cuda:0') eval_ppl=tensor(2206812., device='cuda:0') eval_epoch_loss=tensor(14.6071, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.33it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=44: train_ppl=tensor(1.2654, device='cuda:0') train_epoch_loss=tensor(0.2354, device='cuda:0') eval_ppl=tensor(2624215.2500, device='cuda:0') eval_epoch_loss=tensor(14.7803, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=45: train_ppl=tensor(1.2168, device='cuda:0') train_epoch_loss=tensor(0.1962, device='cuda:0') eval_ppl=tensor(2636512., device='cuda:0') eval_epoch_loss=tensor(14.7850, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.43it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=46: train_ppl=tensor(1.2137, device='cuda:0') train_epoch_loss=tensor(0.1937, device='cuda:0') eval_ppl=tensor(2916505.5000, device='cuda:0') eval_epoch_loss=tensor(14.8859, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=47: train_ppl=tensor(1.1854, device='cuda:0') train_epoch_loss=tensor(0.1701, device='cuda:0') eval_ppl=tensor(2979771.5000, device='cuda:0') eval_epoch_loss=tensor(14.9074, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=48: train_ppl=tensor(1.1641, device='cuda:0') train_epoch_loss=tensor(0.1519, device='cuda:0') eval_ppl=tensor(2974561.7500, device='cuda:0') eval_epoch_loss=tensor(14.9056, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.43it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]

epoch=49: train_ppl=tensor(1.1746, device='cuda:0') train_epoch_loss=tensor(0.1609, device='cuda:0') eval_ppl=tensor(3079750.5000, device='cuda:0') eval_epoch_loss=tensor(14.9404, device='cuda:0')







VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval_epoch_loss,▁▁▁▂▁▂▂▃▃▃▄▄▅▅▅▅▆▅▅▆▆▆▆▇▆▆▇▇▇▇██████████
eval_ppl,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▂▁▁▂▂▂▂▃▂▂▃▃▄▄▆▆█▇▅▆▇███
train_epoch_loss,█▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,49.0
eval_epoch_loss,14.94036
eval_ppl,3079750.5
train_epoch_loss,0.16094
train_ppl,1.17461


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112640811026924, max=1.0…

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:00<00:00,  8.71it/s]
100%|██████████| 425/425 [00:25<00:00, 16.82it/s]


epoch=0: train_ppl=tensor(1.2133e+12, device='cuda:0') train_epoch_loss=tensor(27.8243, device='cuda:0') eval_ppl=tensor(3101.4595, device='cuda:0') eval_epoch_loss=tensor(8.0396, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.73it/s]


epoch=1: train_ppl=tensor(1670.7041, device='cuda:0') train_epoch_loss=tensor(7.4210, device='cuda:0') eval_ppl=tensor(3005.9746, device='cuda:0') eval_epoch_loss=tensor(8.0084, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.70it/s]


epoch=2: train_ppl=tensor(351.0168, device='cuda:0') train_epoch_loss=tensor(5.8608, device='cuda:0') eval_ppl=tensor(4541.1851, device='cuda:0') eval_epoch_loss=tensor(8.4209, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.65it/s]


epoch=3: train_ppl=tensor(203.6784, device='cuda:0') train_epoch_loss=tensor(5.3165, device='cuda:0') eval_ppl=tensor(5508.4390, device='cuda:0') eval_epoch_loss=tensor(8.6140, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=4: train_ppl=tensor(139.7689, device='cuda:0') train_epoch_loss=tensor(4.9400, device='cuda:0') eval_ppl=tensor(6481.3604, device='cuda:0') eval_epoch_loss=tensor(8.7767, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=5: train_ppl=tensor(105.1820, device='cuda:0') train_epoch_loss=tensor(4.6557, device='cuda:0') eval_ppl=tensor(12631.3398, device='cuda:0') eval_epoch_loss=tensor(9.4439, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=6: train_ppl=tensor(94.8654, device='cuda:0') train_epoch_loss=tensor(4.5525, device='cuda:0') eval_ppl=tensor(10247.9482, device='cuda:0') eval_epoch_loss=tensor(9.2348, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.28it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=7: train_ppl=tensor(73.4079, device='cuda:0') train_epoch_loss=tensor(4.2960, device='cuda:0') eval_ppl=tensor(14890.3701, device='cuda:0') eval_epoch_loss=tensor(9.6085, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=8: train_ppl=tensor(50.8556, device='cuda:0') train_epoch_loss=tensor(3.9290, device='cuda:0') eval_ppl=tensor(22023.0840, device='cuda:0') eval_epoch_loss=tensor(9.9998, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(38.0501, device='cuda:0') train_epoch_loss=tensor(3.6389, device='cuda:0') eval_ppl=tensor(38057.9219, device='cuda:0') eval_epoch_loss=tensor(10.5469, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=10: train_ppl=tensor(28.9344, device='cuda:0') train_epoch_loss=tensor(3.3650, device='cuda:0') eval_ppl=tensor(66947.5547, device='cuda:0') eval_epoch_loss=tensor(11.1117, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.29it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=11: train_ppl=tensor(23.5179, device='cuda:0') train_epoch_loss=tensor(3.1578, device='cuda:0') eval_ppl=tensor(76206.8828, device='cuda:0') eval_epoch_loss=tensor(11.2412, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=12: train_ppl=tensor(17.0262, device='cuda:0') train_epoch_loss=tensor(2.8348, device='cuda:0') eval_ppl=tensor(155908.5000, device='cuda:0') eval_epoch_loss=tensor(11.9570, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.75it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=13: train_ppl=tensor(12.3415, device='cuda:0') train_epoch_loss=tensor(2.5130, device='cuda:0') eval_ppl=tensor(446886.6875, device='cuda:0') eval_epoch_loss=tensor(13.0101, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=14: train_ppl=tensor(11.5646, device='cuda:0') train_epoch_loss=tensor(2.4479, device='cuda:0') eval_ppl=tensor(159156.3125, device='cuda:0') eval_epoch_loss=tensor(11.9776, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=15: train_ppl=tensor(6.6249, device='cuda:0') train_epoch_loss=tensor(1.8908, device='cuda:0') eval_ppl=tensor(610315.6250, device='cuda:0') eval_epoch_loss=tensor(13.3217, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=16: train_ppl=tensor(4.6232, device='cuda:0') train_epoch_loss=tensor(1.5311, device='cuda:0') eval_ppl=tensor(1221624.1250, device='cuda:0') eval_epoch_loss=tensor(14.0157, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.41it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=17: train_ppl=tensor(3.7623, device='cuda:0') train_epoch_loss=tensor(1.3250, device='cuda:0') eval_ppl=tensor(545910.7500, device='cuda:0') eval_epoch_loss=tensor(13.2102, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=18: train_ppl=tensor(3.2881, device='cuda:0') train_epoch_loss=tensor(1.1903, device='cuda:0') eval_ppl=tensor(586450.8750, device='cuda:0') eval_epoch_loss=tensor(13.2818, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.48it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=19: train_ppl=tensor(2.5198, device='cuda:0') train_epoch_loss=tensor(0.9242, device='cuda:0') eval_ppl=tensor(356228.6562, device='cuda:0') eval_epoch_loss=tensor(12.7833, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=20: train_ppl=tensor(2.3711, device='cuda:0') train_epoch_loss=tensor(0.8634, device='cuda:0') eval_ppl=tensor(383125.6250, device='cuda:0') eval_epoch_loss=tensor(12.8561, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=21: train_ppl=tensor(1.8740, device='cuda:0') train_epoch_loss=tensor(0.6281, device='cuda:0') eval_ppl=tensor(256787., device='cuda:0') eval_epoch_loss=tensor(12.4560, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.32it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=22: train_ppl=tensor(1.6888, device='cuda:0') train_epoch_loss=tensor(0.5240, device='cuda:0') eval_ppl=tensor(509441.5312, device='cuda:0') eval_epoch_loss=tensor(13.1411, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=23: train_ppl=tensor(1.6445, device='cuda:0') train_epoch_loss=tensor(0.4974, device='cuda:0') eval_ppl=tensor(320932.0625, device='cuda:0') eval_epoch_loss=tensor(12.6790, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=24: train_ppl=tensor(1.5085, device='cuda:0') train_epoch_loss=tensor(0.4111, device='cuda:0') eval_ppl=tensor(465890.6875, device='cuda:0') eval_epoch_loss=tensor(13.0517, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=25: train_ppl=tensor(1.5587, device='cuda:0') train_epoch_loss=tensor(0.4438, device='cuda:0') eval_ppl=tensor(271548.2500, device='cuda:0') eval_epoch_loss=tensor(12.5119, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.29it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=26: train_ppl=tensor(1.4340, device='cuda:0') train_epoch_loss=tensor(0.3605, device='cuda:0') eval_ppl=tensor(474163.9062, device='cuda:0') eval_epoch_loss=tensor(13.0693, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=27: train_ppl=tensor(1.4597, device='cuda:0') train_epoch_loss=tensor(0.3782, device='cuda:0') eval_ppl=tensor(365374.6875, device='cuda:0') eval_epoch_loss=tensor(12.8087, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.50it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=28: train_ppl=tensor(1.4856, device='cuda:0') train_epoch_loss=tensor(0.3958, device='cuda:0') eval_ppl=tensor(228620.7812, device='cuda:0') eval_epoch_loss=tensor(12.3398, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.31it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=29: train_ppl=tensor(1.6015, device='cuda:0') train_epoch_loss=tensor(0.4709, device='cuda:0') eval_ppl=tensor(163442.7500, device='cuda:0') eval_epoch_loss=tensor(12.0042, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.32it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=30: train_ppl=tensor(1.5057, device='cuda:0') train_epoch_loss=tensor(0.4093, device='cuda:0') eval_ppl=tensor(170840.8438, device='cuda:0') eval_epoch_loss=tensor(12.0485, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=31: train_ppl=tensor(1.4107, device='cuda:0') train_epoch_loss=tensor(0.3441, device='cuda:0') eval_ppl=tensor(184855.5156, device='cuda:0') eval_epoch_loss=tensor(12.1273, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=32: train_ppl=tensor(1.3588, device='cuda:0') train_epoch_loss=tensor(0.3066, device='cuda:0') eval_ppl=tensor(157703.2812, device='cuda:0') eval_epoch_loss=tensor(11.9685, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=33: train_ppl=tensor(1.4153, device='cuda:0') train_epoch_loss=tensor(0.3473, device='cuda:0') eval_ppl=tensor(212664.3438, device='cuda:0') eval_epoch_loss=tensor(12.2675, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=34: train_ppl=tensor(1.4584, device='cuda:0') train_epoch_loss=tensor(0.3774, device='cuda:0') eval_ppl=tensor(174915.5625, device='cuda:0') eval_epoch_loss=tensor(12.0721, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=35: train_ppl=tensor(1.3149, device='cuda:0') train_epoch_loss=tensor(0.2737, device='cuda:0') eval_ppl=tensor(261538.2031, device='cuda:0') eval_epoch_loss=tensor(12.4743, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=36: train_ppl=tensor(1.3075, device='cuda:0') train_epoch_loss=tensor(0.2681, device='cuda:0') eval_ppl=tensor(394264.4062, device='cuda:0') eval_epoch_loss=tensor(12.8848, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.44it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=37: train_ppl=tensor(1.2731, device='cuda:0') train_epoch_loss=tensor(0.2415, device='cuda:0') eval_ppl=tensor(444863.9062, device='cuda:0') eval_epoch_loss=tensor(13.0055, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.28it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=38: train_ppl=tensor(1.2786, device='cuda:0') train_epoch_loss=tensor(0.2457, device='cuda:0') eval_ppl=tensor(678022.8125, device='cuda:0') eval_epoch_loss=tensor(13.4269, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.69it/s]


epoch=39: train_ppl=tensor(1.4081, device='cuda:0') train_epoch_loss=tensor(0.3423, device='cuda:0') eval_ppl=tensor(575591.8125, device='cuda:0') eval_epoch_loss=tensor(13.2632, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=40: train_ppl=tensor(1.2947, device='cuda:0') train_epoch_loss=tensor(0.2583, device='cuda:0') eval_ppl=tensor(396395.2812, device='cuda:0') eval_epoch_loss=tensor(12.8902, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.29it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=41: train_ppl=tensor(1.2494, device='cuda:0') train_epoch_loss=tensor(0.2227, device='cuda:0') eval_ppl=tensor(551263.8750, device='cuda:0') eval_epoch_loss=tensor(13.2200, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.47it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=42: train_ppl=tensor(1.2997, device='cuda:0') train_epoch_loss=tensor(0.2621, device='cuda:0') eval_ppl=tensor(641263.5625, device='cuda:0') eval_epoch_loss=tensor(13.3712, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.29it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=43: train_ppl=tensor(1.2299, device='cuda:0') train_epoch_loss=tensor(0.2069, device='cuda:0') eval_ppl=tensor(625384.3750, device='cuda:0') eval_epoch_loss=tensor(13.3461, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=44: train_ppl=tensor(1.2292, device='cuda:0') train_epoch_loss=tensor(0.2064, device='cuda:0') eval_ppl=tensor(809723.6250, device='cuda:0') eval_epoch_loss=tensor(13.6044, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=45: train_ppl=tensor(1.2289, device='cuda:0') train_epoch_loss=tensor(0.2061, device='cuda:0') eval_ppl=tensor(1173598.1250, device='cuda:0') eval_epoch_loss=tensor(13.9756, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=46: train_ppl=tensor(1.2146, device='cuda:0') train_epoch_loss=tensor(0.1944, device='cuda:0') eval_ppl=tensor(1447344.1250, device='cuda:0') eval_epoch_loss=tensor(14.1852, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=47: train_ppl=tensor(1.2166, device='cuda:0') train_epoch_loss=tensor(0.1960, device='cuda:0') eval_ppl=tensor(1487237.6250, device='cuda:0') eval_epoch_loss=tensor(14.2124, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.39it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=48: train_ppl=tensor(1.2003, device='cuda:0') train_epoch_loss=tensor(0.1825, device='cuda:0') eval_ppl=tensor(1486150., device='cuda:0') eval_epoch_loss=tensor(14.2117, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]

epoch=49: train_ppl=tensor(1.2172, device='cuda:0') train_epoch_loss=tensor(0.1965, device='cuda:0') eval_ppl=tensor(1490206.2500, device='cuda:0') eval_epoch_loss=tensor(14.2144, device='cuda:0')







VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval_epoch_loss,▁▁▁▂▃▂▃▃▅▅▅▇▇█▇▇▆▆▇▆▆▇▆▆▆▆▅▆▆▇▇▇▇▇▇▇████
eval_ppl,▁▁▁▁▁▁▁▁▁▁▂▃▄▇▄▄▃▂▃▂▂▃▃▂▂▂▂▂▂▃▃▄▃▄▄▄▇███
train_epoch_loss,█▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_ppl,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,49.0
eval_epoch_loss,14.21443
eval_ppl,1490206.25
train_epoch_loss,0.19653
train_ppl,1.21718


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112396776055296, max=1.0…

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


100%|██████████| 7/7 [00:00<00:00,  8.71it/s]
100%|██████████| 425/425 [00:25<00:00, 16.73it/s]


epoch=0: train_ppl=tensor(6.4972e+11, device='cuda:0') train_epoch_loss=tensor(27.1998, device='cuda:0') eval_ppl=tensor(4846.6177, device='cuda:0') eval_epoch_loss=tensor(8.4860, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=1: train_ppl=tensor(1959.3911, device='cuda:0') train_epoch_loss=tensor(7.5804, device='cuda:0') eval_ppl=tensor(4876.7217, device='cuda:0') eval_epoch_loss=tensor(8.4922, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=2: train_ppl=tensor(495.7654, device='cuda:0') train_epoch_loss=tensor(6.2061, device='cuda:0') eval_ppl=tensor(5753.5693, device='cuda:0') eval_epoch_loss=tensor(8.6576, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.65it/s]


epoch=3: train_ppl=tensor(275.5797, device='cuda:0') train_epoch_loss=tensor(5.6189, device='cuda:0') eval_ppl=tensor(6750.7480, device='cuda:0') eval_epoch_loss=tensor(8.8174, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.39it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=4: train_ppl=tensor(209.9704, device='cuda:0') train_epoch_loss=tensor(5.3470, device='cuda:0') eval_ppl=tensor(6617.2227, device='cuda:0') eval_epoch_loss=tensor(8.7974, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.38it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=5: train_ppl=tensor(152.6708, device='cuda:0') train_epoch_loss=tensor(5.0283, device='cuda:0') eval_ppl=tensor(4448.0791, device='cuda:0') eval_epoch_loss=tensor(8.4002, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.72it/s]


epoch=6: train_ppl=tensor(124.3295, device='cuda:0') train_epoch_loss=tensor(4.8229, device='cuda:0') eval_ppl=tensor(4253.3335, device='cuda:0') eval_epoch_loss=tensor(8.3555, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=7: train_ppl=tensor(94.7837, device='cuda:0') train_epoch_loss=tensor(4.5516, device='cuda:0') eval_ppl=tensor(5751.4785, device='cuda:0') eval_epoch_loss=tensor(8.6572, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=8: train_ppl=tensor(75.3613, device='cuda:0') train_epoch_loss=tensor(4.3223, device='cuda:0') eval_ppl=tensor(7763.6709, device='cuda:0') eval_epoch_loss=tensor(8.9572, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=9: train_ppl=tensor(58.3230, device='cuda:0') train_epoch_loss=tensor(4.0660, device='cuda:0') eval_ppl=tensor(11982.8584, device='cuda:0') eval_epoch_loss=tensor(9.3912, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.54it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=10: train_ppl=tensor(48.3928, device='cuda:0') train_epoch_loss=tensor(3.8794, device='cuda:0') eval_ppl=tensor(24348.1719, device='cuda:0') eval_epoch_loss=tensor(10.1002, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.63it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=11: train_ppl=tensor(37.7157, device='cuda:0') train_epoch_loss=tensor(3.6301, device='cuda:0') eval_ppl=tensor(38234.9766, device='cuda:0') eval_epoch_loss=tensor(10.5515, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=12: train_ppl=tensor(31.4416, device='cuda:0') train_epoch_loss=tensor(3.4481, device='cuda:0') eval_ppl=tensor(57756.6641, device='cuda:0') eval_epoch_loss=tensor(10.9640, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=13: train_ppl=tensor(23.0579, device='cuda:0') train_epoch_loss=tensor(3.1380, device='cuda:0') eval_ppl=tensor(114308.4766, device='cuda:0') eval_epoch_loss=tensor(11.6467, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.59it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=14: train_ppl=tensor(15.2535, device='cuda:0') train_epoch_loss=tensor(2.7248, device='cuda:0') eval_ppl=tensor(286311.4688, device='cuda:0') eval_epoch_loss=tensor(12.5648, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=15: train_ppl=tensor(13.0037, device='cuda:0') train_epoch_loss=tensor(2.5652, device='cuda:0') eval_ppl=tensor(450981.3438, device='cuda:0') eval_epoch_loss=tensor(13.0192, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=16: train_ppl=tensor(12.8170, device='cuda:0') train_epoch_loss=tensor(2.5508, device='cuda:0') eval_ppl=tensor(443877.7500, device='cuda:0') eval_epoch_loss=tensor(13.0033, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=17: train_ppl=tensor(9.5003, device='cuda:0') train_epoch_loss=tensor(2.2513, device='cuda:0') eval_ppl=tensor(1201940.3750, device='cuda:0') eval_epoch_loss=tensor(13.9994, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.63it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=18: train_ppl=tensor(6.7162, device='cuda:0') train_epoch_loss=tensor(1.9045, device='cuda:0') eval_ppl=tensor(1573095.1250, device='cuda:0') eval_epoch_loss=tensor(14.2686, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.48it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=19: train_ppl=tensor(5.5593, device='cuda:0') train_epoch_loss=tensor(1.7155, device='cuda:0') eval_ppl=tensor(827025.1250, device='cuda:0') eval_epoch_loss=tensor(13.6256, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=24: train_ppl=tensor(2.3243, device='cuda:0') train_epoch_loss=tensor(0.8434, device='cuda:0') eval_ppl=tensor(652864.5000, device='cuda:0') eval_epoch_loss=tensor(13.3891, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=25: train_ppl=tensor(1.9719, device='cuda:0') train_epoch_loss=tensor(0.6790, device='cuda:0') eval_ppl=tensor(953069.1250, device='cuda:0') eval_epoch_loss=tensor(13.7674, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.78it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=26: train_ppl=tensor(1.8584, device='cuda:0') train_epoch_loss=tensor(0.6197, device='cuda:0') eval_ppl=tensor(716234.5000, device='cuda:0') eval_epoch_loss=tensor(13.4818, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=27: train_ppl=tensor(1.7383, device='cuda:0') train_epoch_loss=tensor(0.5529, device='cuda:0') eval_ppl=tensor(358863.7500, device='cuda:0') eval_epoch_loss=tensor(12.7907, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.55it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=28: train_ppl=tensor(1.6243, device='cuda:0') train_epoch_loss=tensor(0.4851, device='cuda:0') eval_ppl=tensor(658479.6250, device='cuda:0') eval_epoch_loss=tensor(13.3977, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=29: train_ppl=tensor(1.6842, device='cuda:0') train_epoch_loss=tensor(0.5213, device='cuda:0') eval_ppl=tensor(466451.3125, device='cuda:0') eval_epoch_loss=tensor(13.0529, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=30: train_ppl=tensor(1.5242, device='cuda:0') train_epoch_loss=tensor(0.4214, device='cuda:0') eval_ppl=tensor(389777.4062, device='cuda:0') eval_epoch_loss=tensor(12.8733, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.60it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=31: train_ppl=tensor(1.4163, device='cuda:0') train_epoch_loss=tensor(0.3480, device='cuda:0') eval_ppl=tensor(693243.6250, device='cuda:0') eval_epoch_loss=tensor(13.4491, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=32: train_ppl=tensor(1.4958, device='cuda:0') train_epoch_loss=tensor(0.4026, device='cuda:0') eval_ppl=tensor(375730., device='cuda:0') eval_epoch_loss=tensor(12.8366, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.57it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=33: train_ppl=tensor(1.4621, device='cuda:0') train_epoch_loss=tensor(0.3799, device='cuda:0') eval_ppl=tensor(402524., device='cuda:0') eval_epoch_loss=tensor(12.9055, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
100%|██████████| 425/425 [00:25<00:00, 16.67it/s]


epoch=34: train_ppl=tensor(1.4084, device='cuda:0') train_epoch_loss=tensor(0.3424, device='cuda:0') eval_ppl=tensor(343188.6562, device='cuda:0') eval_epoch_loss=tensor(12.7460, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=35: train_ppl=tensor(1.3619, device='cuda:0') train_epoch_loss=tensor(0.3089, device='cuda:0') eval_ppl=tensor(467434.5625, device='cuda:0') eval_epoch_loss=tensor(13.0550, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
100%|██████████| 425/425 [00:25<00:00, 16.68it/s]


epoch=36: train_ppl=tensor(1.3472, device='cuda:0') train_epoch_loss=tensor(0.2980, device='cuda:0') eval_ppl=tensor(407284.7500, device='cuda:0') eval_epoch_loss=tensor(12.9173, device='cuda:0')


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]
 71%|███████   | 302/425 [00:18<00:07, 16.68it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [12]:
# Close the Weights & Biases run opened by wandb.init() in the setup cell.
wandb.finish()



VBox(children=(Label(value='0.021 MB of 0.023 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.945264…

In [15]:
# NOTE(review): import kept next to its only use; consider moving it to the
# import cell at the top of the notebook.
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget (rendered as the cell output).
# Run this before the push_to_hub cell that follows.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Hub repository ("<user>/<repo>") that receives the prompt-tuning adapter.
peft_model_id = "rbelanec/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"

# Pass the variable rather than repeating the literal, so the repository id is
# defined in exactly one place. Kept as the last expression of the cell so the
# resulting CommitInfo is still displayed.
model.push_to_hub(peft_model_id, use_auth_token=True)



adapter_model.bin:   0%|          | 0.00/34.0k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rbelanec/bloomz-560m_PROMPT_TUNING_CAUSAL_LM/commit/d0a1880f4368b94c0fa878348a6563cda7b61a8e', commit_message='Upload model', commit_description='', oid='d0a1880f4368b94c0fa878348a6563cda7b61a8e', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
from peft import PeftModel, PeftConfig

# Hub repository holding the prompt-tuning adapter to load.
peft_model_id = "rbelanec/bloomz-560m_PROMPT_TUNING_CAUSAL_LM"

# NOTE(review): this rebinds `config`, which the setup cell used for the
# W&B run configuration dict; from here on `config` is the adapter's PeftConfig.
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base checkpoint named in the adapter config and attach the adapter
# weights from the Hub to it.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path),
    peft_model_id,
)

Downloading (…)/adapter_config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/34.0k [00:00<?, ?B/s]

In [17]:
# Build the prompt for one example tweet, ending with "Label : " so the model
# is asked to continue with the label, then tokenize it into PyTorch tensors.
prompt_text = f'{text_column} : {"@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"} Label : '
inputs = tokenizer(prompt_text, return_tensors="pt")

In [21]:
# Generate a label for the tokenized prompt with the current `model`.
model.to("cuda")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Move every tokenizer output tensor to the device the model lives on.
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        # Was the bare literal 3. The tokenizer check cell near the end of the
        # notebook shows tokenizer.pad_token_id == 3, so behaviour is unchanged
        # while the intent (stop at the pad token) is now explicit.
        # NOTE(review): presumably the pad token terminates the labels during
        # preprocessing - confirm against the preprocessing cell.
        eos_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label : complaint']


In [22]:
# Baseline for comparison: a fresh base model wrapped with a newly initialised
# prompt-tuning adapter built from `peft_config`, i.e. no trained adapter
# weights are loaded. NOTE: this rebinds `model`.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)

# Same example prompt as in the trained-adapter cells above.
inputs = tokenizer(
    f'{text_column} : {"@nationalgridus I have no water and the bill is current and paid. Can you do something about this?"} Label : ',
    return_tensors="pt",
)

model.to("cuda")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Move every tokenizer output tensor to the device the model lives on.
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=10,
        # Was the bare literal 3. The tokenizer check cell near the end of the
        # notebook shows tokenizer.pad_token_id == 3, so behaviour is unchanged
        # while the intent (stop at the pad token) is now explicit.
        eos_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['Tweet text : @nationalgridus I have no water and the bill is current and paid. Can you do something about this? Label :  NoThe present invention relates to a method of']


In [32]:
# Reload the plain base checkpoint (no adapter attached) and display its
# module tree as the cell output. NOTE: this rebinds `model`.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

In [34]:
# Wrap the base model with the prompt-tuning configuration and display the
# wrapper; in the repr the original model appears under `base_model`.
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForCausalLM(
  (base_model): BloomForCausalLM(
    (transformer): BloomModel(
      (word_embeddings): Embedding(250880, 1024)
      (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (h): ModuleList(
        (0-23): 24 x BloomBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (self_attention): BloomAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BloomMLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu_impl): BloomGelu()
            (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
          )
        )
      

In [35]:
# Print trainable vs. total parameter counts. The 8,192 trainable parameters
# reported are consistent with 8 virtual tokens x 1024-dimensional embeddings.
peft_model.print_trainable_parameters()

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358


In [49]:
# Sanity check: token ids of a sample sentence with the tokenizer's pad token
# id appended, which shows the id used for padding (last element of the output).
tokenizer('hello my friend, how are you')['input_ids'] + [tokenizer.pad_token_id]

[101579, 2670, 24015, 15, 4143, 1306, 1152, 3]

In [52]:
# Number of token ids in the first processed training example.
# NOTE(review): presumably equals the configured max_length (64) because the
# preprocessing pads/truncates to it - confirm against the preprocessing cell.
len(processed_datasets['train'][0]['input_ids'])

64