In [None]:
! pip install transformers peft datasets

In [2]:
from huggingface_hub import notebook_login,login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType, PeftType
import torch
import os
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm import tqdm

model_name='bigscience/bloomz-560m'
tokenizer_name='bigscience/bloomz-560m'
peft_config=PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=30)

dataset_name='twitter_complaints'
checkpoing_name=f"{dataset_name}_{model_name}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace('/','_')
text_column='Tweet text'
label_column='text_label'
max_length=64
lr=3e-2
num_epochs=50
batch_size=8

In [6]:
dataset=load_dataset('ought/raft',dataset_name)
# Extract and format class names
classes=[k.replace('_',' ')for k in dataset['train'].features['Label'].names]
# Map label IDs to text labels
dataset=dataset.map(lambda x:{'text_label':[classes[label]for label in x['Label']]},batched=True,num_proc=1   )

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [8]:
# Data Preprocessing
tokenizer=AutoTokenizer.from_pretrained(model_name)
# Ensures padding tokens exists
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id=tokenizer.eos_token_id
# Computes max token length for class labels
target_max_length=max([len(tokenizer(class_label)['input_ids'])for class_label in classes])

def preprocess_function(examples):
  # Get batch size from input
  batch_size=len(examples[text_column])
  # Format input prompt
  inputs= [f"{text_column}: {x} Label :" for x in examples[text_column]]
  # Target labels
  targets=[str(x) for x in examples[label_column]]
  # TOkenizer input prompt
  model_inputs=tokenizer(inputs)
  labels=tokenizer(targets, add_special_tokens=False)
  for i in range(batch_size):
    sample_input_ids=model_inputs['input_ids'][i]
    label_input_ids=labels['input_ids'][i] + [tokenizer.eos_token_id]

    model_inputs['input_ids'][i] = sample_input_ids + label_input_ids
    labels['input_ids'][i] =[-100] * len(sample_input_ids) + label_input_ids  #  ignore the input tokens, Focus only generated label part
    model_inputs['attention_mask'][i] = [i] * len(model_inputs['input_ids'][i])
  # Ensures all sequences are padded to the same length
  for i in range(batch_size):
    sample_input_ids=model_inputs['input_ids'][i]
    label_input_ids =labels['input_ids'][i]
    # Left pad input IDs with pad_token_id
    model_inputs['input_ids'][i] = [tokenizer.pad_token_id] * (max_length -len(sample_input_ids)) + sample_input_ids
    # Adjust attention mask to match padding
    model_inputs['attention_mask'][i] =[0] * (max_length - len(sample_input_ids)) + model_inputs['attention_mask'][i]
    # Ignore the padded tokens
    labels['input_ids'][i] =[-100] * (max_length - len(sample_input_ids)) + label_input_ids
    model_inputs['input_ids'][i] = torch.tensor(model_inputs['input_ids'][i][:max_length])
    model_inputs['attention_mask'][i]=torch.tensor(model_inputs['attention_mask'][i][:max_length])
    labels['input_ids'][i] = torch.tensor(labels['input_ids'][i][:max_length])
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

processed_datasets=dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset['train'].column_names,
    load_from_cache_file=False,
    desc='Running tokenizer on dataset'
)
train_dataset=processed_datasets['train']
eval_dataset=processed_datasets['train']

train_dataloader=DataLoader(train_dataset,shuffle=True, collate_fn=default_data_collator, batch_size=batch_size,pin_memory=True)
eval_dataloader=DataLoader(eval_dataset,collate_fn=default_data_collator, batch_size=batch_size,pin_memory=True)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [10]:
def test_preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    model_inputs = tokenizer(inputs)
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
    return model_inputs


test_dataset = dataset["test"].map(
    test_preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)


Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

## Actually we are training the Trainable parameters on our datast. Which will help to LLM behaviour of the data.

In [11]:
# Creating Model
model=AutoModelForCausalLM.from_pretrained(model_name)
model=get_peft_model(model,peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

trainable params: 1,474,560 || all params: 560,689,152 || trainable%: 0.2630


In [12]:
model.peft_config

{'default': PrefixTuningConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, base_model_name_or_path='bigscience/bloomz-560m', revision=None, inference_mode=False, num_virtual_tokens=30, token_dim=1024, num_transformer_submodules=1, num_attention_heads=16, num_layers=24, encoder_hidden_size=1024, prefix_projection=False)}

## Now we are doing Training and BackPropagation of the Trainable parameter. So that it adjust its value according to accuracy.

In [14]:
# model
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [None]:
# Training and Evaluation
for epoch in range(num_epochs):
  model.train()
  total_loss=0
  for step,batch in enumerate(tqdm(train_dataloader)):
    batch ={k: v for k,v in batch.items()}
    outputs=model(**batch)
    loss=outputs.loss
    total_loss += loss.detach().float()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

  model.eval()
  eval_loss=0
  eval_preds=[]
  for step, batch in enumerate(tqdm(eval_dataloader)):
    batch ={k:v for k, v in batch.items()}
    with torch.no_grad():
      outputs=model(**batch)
    loss=outputs.loss
    eval_loss += loss.detach().float()
    eval_preds.extend(tokenizer.batch_decode(torch.argmax(outputs.logits,-1).detach().cpu().numpy(),skip_special_tokens=True))


    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 7/7 [03:45<00:00, 32.25s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.80s/it]

epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(1.3376) eval_epoch_loss=tensor(0.2909)


 29%|██▊       | 2/7 [00:17<00:44,  8.80s/it]

epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(2.3518) eval_epoch_loss=tensor(0.8552)


 43%|████▎     | 3/7 [00:25<00:33,  8.34s/it]

epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(7.2299) eval_epoch_loss=tensor(1.9782)


 57%|█████▋    | 4/7 [00:34<00:25,  8.49s/it]

epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(25.5156) eval_epoch_loss=tensor(3.2393)


 71%|███████▏  | 5/7 [00:42<00:17,  8.63s/it]

epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(92.4858) eval_epoch_loss=tensor(4.5271)


 86%|████████▌ | 6/7 [00:51<00:08,  8.62s/it]

epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(372.4107) eval_epoch_loss=tensor(5.9200)


100%|██████████| 7/7 [00:55<00:00,  7.97s/it]


epoch=0: train_ppl=tensor(1073.8441) train_epoch_loss=tensor(6.9790) eval_ppl=tensor(1547.6750) eval_epoch_loss=tensor(7.3445)


100%|██████████| 7/7 [03:43<00:00, 31.94s/it]
 14%|█▍        | 1/7 [00:08<00:51,  8.60s/it]

epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(1.2997) eval_epoch_loss=tensor(0.2621)


 29%|██▊       | 2/7 [00:17<00:43,  8.78s/it]

epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(2.0570) eval_epoch_loss=tensor(0.7213)


 43%|████▎     | 3/7 [00:26<00:34,  8.70s/it]

epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(5.9585) eval_epoch_loss=tensor(1.7848)


 57%|█████▋    | 4/7 [00:34<00:26,  8.70s/it]

epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(20.1729) eval_epoch_loss=tensor(3.0043)


 71%|███████▏  | 5/7 [00:43<00:17,  8.66s/it]

epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(70.8426) eval_epoch_loss=tensor(4.2605)


 86%|████████▌ | 6/7 [00:52<00:08,  8.67s/it]

epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(280.7458) eval_epoch_loss=tensor(5.6374)


100%|██████████| 7/7 [00:56<00:00,  8.01s/it]


epoch=1: train_ppl=tensor(1114.0072) train_epoch_loss=tensor(7.0157) eval_ppl=tensor(1166.7317) eval_epoch_loss=tensor(7.0620)


100%|██████████| 7/7 [03:36<00:00, 30.87s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.80s/it]

epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(1.1639) eval_epoch_loss=tensor(0.1518)


 29%|██▊       | 2/7 [00:17<00:43,  8.63s/it]

epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(1.6143) eval_epoch_loss=tensor(0.4789)


 43%|████▎     | 3/7 [00:25<00:33,  8.48s/it]

epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(4.4238) eval_epoch_loss=tensor(1.4870)


 57%|█████▋    | 4/7 [00:34<00:26,  8.68s/it]

epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(14.4133) eval_epoch_loss=tensor(2.6682)


 71%|███████▏  | 5/7 [00:43<00:17,  8.69s/it]

epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(49.1765) eval_epoch_loss=tensor(3.8954)


 86%|████████▌ | 6/7 [00:52<00:08,  8.69s/it]

epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(191.6109) eval_epoch_loss=tensor(5.2555)


100%|██████████| 7/7 [00:56<00:00,  8.03s/it]


epoch=2: train_ppl=tensor(390.8069) train_epoch_loss=tensor(5.9682) eval_ppl=tensor(796.3022) eval_epoch_loss=tensor(6.6800)


100%|██████████| 7/7 [03:40<00:00, 31.56s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.20s/it]

epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(1.1646) eval_epoch_loss=tensor(0.1524)


 29%|██▊       | 2/7 [00:16<00:42,  8.49s/it]

epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(1.5203) eval_epoch_loss=tensor(0.4189)


 43%|████▎     | 3/7 [00:25<00:34,  8.59s/it]

epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(3.9757) eval_epoch_loss=tensor(1.3802)


 57%|█████▋    | 4/7 [00:33<00:25,  8.40s/it]

epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(12.5281) eval_epoch_loss=tensor(2.5280)


 71%|███████▏  | 5/7 [00:41<00:16,  8.33s/it]

epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(41.6999) eval_epoch_loss=tensor(3.7305)


 86%|████████▌ | 6/7 [00:50<00:08,  8.42s/it]

epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(160.0213) eval_epoch_loss=tensor(5.0753)


100%|██████████| 7/7 [00:54<00:00,  7.77s/it]


epoch=3: train_ppl=tensor(439.4962) train_epoch_loss=tensor(6.0856) eval_ppl=tensor(665.0214) eval_epoch_loss=tensor(6.4998)


100%|██████████| 7/7 [03:34<00:00, 30.69s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.80s/it]

epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(1.0876) eval_epoch_loss=tensor(0.0840)


 29%|██▊       | 2/7 [00:17<00:43,  8.68s/it]

epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(1.3045) eval_epoch_loss=tensor(0.2658)


 43%|████▎     | 3/7 [00:26<00:35,  8.78s/it]

epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(3.2704) eval_epoch_loss=tensor(1.1849)


 57%|█████▋    | 4/7 [00:34<00:26,  8.67s/it]

epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(10.0003) eval_epoch_loss=tensor(2.3026)


 71%|███████▏  | 5/7 [00:42<00:17,  8.50s/it]

epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(32.5802) eval_epoch_loss=tensor(3.4837)


 86%|████████▌ | 6/7 [00:51<00:08,  8.47s/it]

epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(123.2802) eval_epoch_loss=tensor(4.8145)


100%|██████████| 7/7 [00:55<00:00,  7.93s/it]


epoch=4: train_ppl=tensor(215.8248) train_epoch_loss=tensor(5.3745) eval_ppl=tensor(512.3314) eval_epoch_loss=tensor(6.2390)


100%|██████████| 7/7 [03:35<00:00, 30.79s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.70s/it]

epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(1.0601) eval_epoch_loss=tensor(0.0584)


 29%|██▊       | 2/7 [00:16<00:41,  8.35s/it]

epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(1.1977) eval_epoch_loss=tensor(0.1804)


 43%|████▎     | 3/7 [00:25<00:34,  8.56s/it]

epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(2.8910) eval_epoch_loss=tensor(1.0616)


 57%|█████▋    | 4/7 [00:34<00:25,  8.65s/it]

epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(8.5908) eval_epoch_loss=tensor(2.1507)


 71%|███████▏  | 5/7 [00:43<00:17,  8.81s/it]

epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(27.4456) eval_epoch_loss=tensor(3.3122)


 86%|████████▌ | 6/7 [00:52<00:08,  8.71s/it]

epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(102.5585) eval_epoch_loss=tensor(4.6304)


100%|██████████| 7/7 [00:56<00:00,  8.03s/it]


epoch=5: train_ppl=tensor(269.1813) train_epoch_loss=tensor(5.5954) eval_ppl=tensor(426.2159) eval_epoch_loss=tensor(6.0549)


100%|██████████| 7/7 [03:37<00:00, 31.06s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.70s/it]

epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(1.0531) eval_epoch_loss=tensor(0.0517)


 29%|██▊       | 2/7 [00:17<00:43,  8.70s/it]

epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(1.1463) eval_epoch_loss=tensor(0.1366)


 43%|████▎     | 3/7 [00:26<00:34,  8.66s/it]

epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(2.6692) eval_epoch_loss=tensor(0.9818)


 57%|█████▋    | 4/7 [00:33<00:24,  8.28s/it]

epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(7.7357) eval_epoch_loss=tensor(2.0459)


 71%|███████▏  | 5/7 [00:42<00:16,  8.43s/it]

epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(24.2541) eval_epoch_loss=tensor(3.1886)


 86%|████████▌ | 6/7 [00:50<00:08,  8.39s/it]

epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(89.4853) eval_epoch_loss=tensor(4.4941)


100%|██████████| 7/7 [00:54<00:00,  7.84s/it]


epoch=6: train_ppl=tensor(215.5365) train_epoch_loss=tensor(5.3731) eval_ppl=tensor(371.8858) eval_epoch_loss=tensor(5.9186)


100%|██████████| 7/7 [03:39<00:00, 31.29s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.30s/it]

epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(1.0392) eval_epoch_loss=tensor(0.0385)


 29%|██▊       | 2/7 [00:16<00:42,  8.48s/it]

epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(1.1050) eval_epoch_loss=tensor(0.0998)


 43%|████▎     | 3/7 [00:25<00:34,  8.58s/it]

epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(2.4649) eval_epoch_loss=tensor(0.9021)


 57%|█████▋    | 4/7 [00:34<00:25,  8.67s/it]

epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(6.9487) eval_epoch_loss=tensor(1.9386)


 71%|███████▏  | 5/7 [00:43<00:17,  8.75s/it]

epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(21.4258) eval_epoch_loss=tensor(3.0646)


 86%|████████▌ | 6/7 [00:51<00:08,  8.63s/it]

epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(78.1372) eval_epoch_loss=tensor(4.3585)


100%|██████████| 7/7 [00:55<00:00,  7.99s/it]


epoch=7: train_ppl=tensor(266.1092) train_epoch_loss=tensor(5.5839) eval_ppl=tensor(324.7249) eval_epoch_loss=tensor(5.7830)


100%|██████████| 7/7 [03:38<00:00, 31.24s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.70s/it]

epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(1.0377) eval_epoch_loss=tensor(0.0370)


 29%|██▊       | 2/7 [00:17<00:44,  8.94s/it]

epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(1.0854) eval_epoch_loss=tensor(0.0819)


 43%|████▎     | 3/7 [00:26<00:35,  8.92s/it]

epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(2.3444) eval_epoch_loss=tensor(0.8520)


 57%|█████▋    | 4/7 [00:34<00:25,  8.40s/it]

epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(6.4427) eval_epoch_loss=tensor(1.8630)


 71%|███████▏  | 5/7 [00:42<00:16,  8.47s/it]

epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(19.5578) eval_epoch_loss=tensor(2.9734)


 86%|████████▌ | 6/7 [00:51<00:08,  8.48s/it]

epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(70.5598) eval_epoch_loss=tensor(4.2565)


100%|██████████| 7/7 [00:55<00:00,  7.89s/it]


epoch=8: train_ppl=tensor(237.4848) train_epoch_loss=tensor(5.4701) eval_ppl=tensor(293.2345) eval_epoch_loss=tensor(5.6810)


100%|██████████| 7/7 [03:35<00:00, 30.83s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.70s/it]

epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(1.0351) eval_epoch_loss=tensor(0.0345)


 29%|██▊       | 2/7 [00:17<00:44,  8.88s/it]

epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(1.0688) eval_epoch_loss=tensor(0.0665)


 43%|████▎     | 3/7 [00:25<00:33,  8.43s/it]

epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(2.2341) eval_epoch_loss=tensor(0.8038)


 57%|█████▋    | 4/7 [00:33<00:25,  8.38s/it]

epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(6.0171) eval_epoch_loss=tensor(1.7946)


 71%|███████▏  | 5/7 [00:42<00:16,  8.39s/it]

epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(17.9919) eval_epoch_loss=tensor(2.8899)


 86%|████████▌ | 6/7 [00:50<00:08,  8.29s/it]

epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(64.2337) eval_epoch_loss=tensor(4.1625)


100%|██████████| 7/7 [00:54<00:00,  7.80s/it]


epoch=9: train_ppl=tensor(229.6255) train_epoch_loss=tensor(5.4364) eval_ppl=tensor(266.9443) eval_epoch_loss=tensor(5.5870)


100%|██████████| 7/7 [03:39<00:00, 31.34s/it]
 14%|█▍        | 1/7 [00:08<00:51,  8.60s/it]

epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(1.0280) eval_epoch_loss=tensor(0.0276)


 29%|██▊       | 2/7 [00:16<00:41,  8.30s/it]

epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(1.0541) eval_epoch_loss=tensor(0.0527)


 43%|████▎     | 3/7 [00:25<00:33,  8.39s/it]

epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(2.1353) eval_epoch_loss=tensor(0.7586)


 57%|█████▋    | 4/7 [00:33<00:25,  8.36s/it]

epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(5.6124) eval_epoch_loss=tensor(1.7250)


 71%|███████▏  | 5/7 [00:41<00:16,  8.19s/it]

epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(16.5605) eval_epoch_loss=tensor(2.8070)


 86%|████████▌ | 6/7 [00:49<00:08,  8.20s/it]

epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(58.6311) eval_epoch_loss=tensor(4.0713)


100%|██████████| 7/7 [00:53<00:00,  7.67s/it]


epoch=10: train_ppl=tensor(123.0719) train_epoch_loss=tensor(4.8128) eval_ppl=tensor(243.6608) eval_epoch_loss=tensor(5.4958)


100%|██████████| 7/7 [03:33<00:00, 30.43s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.30s/it]

epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(1.0261) eval_epoch_loss=tensor(0.0258)


 29%|██▊       | 2/7 [00:16<00:41,  8.36s/it]

epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(1.0437) eval_epoch_loss=tensor(0.0428)


 43%|████▎     | 3/7 [00:25<00:34,  8.51s/it]

epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(2.0497) eval_epoch_loss=tensor(0.7177)


 57%|█████▋    | 4/7 [00:33<00:25,  8.47s/it]

epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(5.3123) eval_epoch_loss=tensor(1.6700)


 71%|███████▏  | 5/7 [00:41<00:16,  8.23s/it]

epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(15.4839) eval_epoch_loss=tensor(2.7398)


 86%|████████▌ | 6/7 [00:50<00:08,  8.29s/it]

epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(54.4173) eval_epoch_loss=tensor(3.9967)


100%|██████████| 7/7 [00:53<00:00,  7.70s/it]


epoch=11: train_ppl=tensor(174.8485) train_epoch_loss=tensor(5.1639) eval_ppl=tensor(226.1488) eval_epoch_loss=tensor(5.4212)


100%|██████████| 7/7 [03:36<00:00, 30.99s/it]
 14%|█▍        | 1/7 [00:08<00:51,  8.61s/it]

epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(1.0246) eval_epoch_loss=tensor(0.0243)


 29%|██▊       | 2/7 [00:17<00:43,  8.66s/it]

epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(1.0394) eval_epoch_loss=tensor(0.0386)


 43%|████▎     | 3/7 [00:25<00:34,  8.55s/it]

epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(1.9819) eval_epoch_loss=tensor(0.6841)


 57%|█████▋    | 4/7 [00:34<00:25,  8.48s/it]

epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(5.0244) eval_epoch_loss=tensor(1.6143)


 71%|███████▏  | 5/7 [00:41<00:16,  8.24s/it]

epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(14.4696) eval_epoch_loss=tensor(2.6721)


 86%|████████▌ | 6/7 [00:50<00:08,  8.39s/it]

epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(50.4797) eval_epoch_loss=tensor(3.9216)


100%|██████████| 7/7 [00:54<00:00,  7.80s/it]


epoch=12: train_ppl=tensor(141.3080) train_epoch_loss=tensor(4.9509) eval_ppl=tensor(209.7851) eval_epoch_loss=tensor(5.3461)


100%|██████████| 7/7 [03:37<00:00, 31.06s/it]
 14%|█▍        | 1/7 [00:08<00:51,  8.60s/it]

epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(1.0253) eval_epoch_loss=tensor(0.0250)


 29%|██▊       | 2/7 [00:16<00:42,  8.42s/it]

epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(1.0371) eval_epoch_loss=tensor(0.0364)


 43%|████▎     | 3/7 [00:25<00:33,  8.41s/it]

epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(1.9241) eval_epoch_loss=tensor(0.6545)


 57%|█████▋    | 4/7 [00:34<00:25,  8.53s/it]

epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(4.7867) eval_epoch_loss=tensor(1.5658)


 71%|███████▏  | 5/7 [00:42<00:17,  8.55s/it]

epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(13.6350) eval_epoch_loss=tensor(2.6126)


 86%|████████▌ | 6/7 [00:50<00:08,  8.33s/it]

epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(47.2524) eval_epoch_loss=tensor(3.8555)


100%|██████████| 7/7 [00:54<00:00,  7.80s/it]


epoch=13: train_ppl=tensor(97.6251) train_epoch_loss=tensor(4.5811) eval_ppl=tensor(196.3727) eval_epoch_loss=tensor(5.2800)


100%|██████████| 7/7 [03:38<00:00, 31.17s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.30s/it]

epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(1.0193) eval_epoch_loss=tensor(0.0192)


 29%|██▊       | 2/7 [00:16<00:42,  8.48s/it]

epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(1.0291) eval_epoch_loss=tensor(0.0287)


 43%|████▎     | 3/7 [00:25<00:34,  8.53s/it]

epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(1.8603) eval_epoch_loss=tensor(0.6208)


 57%|█████▋    | 4/7 [00:33<00:24,  8.17s/it]

epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(4.5605) eval_epoch_loss=tensor(1.5174)


 71%|███████▏  | 5/7 [00:41<00:16,  8.18s/it]

epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(12.8605) eval_epoch_loss=tensor(2.5542)


 86%|████████▌ | 6/7 [00:49<00:08,  8.29s/it]

epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(44.2779) eval_epoch_loss=tensor(3.7905)


100%|██████████| 7/7 [00:53<00:00,  7.70s/it]


epoch=14: train_ppl=tensor(67.0832) train_epoch_loss=tensor(4.2059) eval_ppl=tensor(184.0112) eval_epoch_loss=tensor(5.2150)


100%|██████████| 7/7 [03:40<00:00, 31.46s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.20s/it]

epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(1.0202) eval_epoch_loss=tensor(0.0200)


 29%|██▊       | 2/7 [00:17<00:42,  8.55s/it]

epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(1.0286) eval_epoch_loss=tensor(0.0282)


 43%|████▎     | 3/7 [00:25<00:34,  8.62s/it]

epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(1.8308) eval_epoch_loss=tensor(0.6047)


 57%|█████▋    | 4/7 [00:34<00:26,  8.73s/it]

epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(4.4140) eval_epoch_loss=tensor(1.4848)


 71%|███████▏  | 5/7 [00:42<00:16,  8.47s/it]

epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(12.3390) eval_epoch_loss=tensor(2.5128)


 86%|████████▌ | 6/7 [00:51<00:08,  8.45s/it]

epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(42.2267) eval_epoch_loss=tensor(3.7431)


100%|██████████| 7/7 [00:55<00:00,  7.89s/it]


epoch=15: train_ppl=tensor(122.7414) train_epoch_loss=tensor(4.8101) eval_ppl=tensor(175.4870) eval_epoch_loss=tensor(5.1676)


100%|██████████| 7/7 [03:36<00:00, 30.90s/it]
 14%|█▍        | 1/7 [00:08<00:51,  8.60s/it]

epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(1.0131) eval_epoch_loss=tensor(0.0130)


 29%|██▊       | 2/7 [00:16<00:42,  8.48s/it]

epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(1.0201) eval_epoch_loss=tensor(0.0199)


 43%|████▎     | 3/7 [00:25<00:34,  8.58s/it]

epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(1.7737) eval_epoch_loss=tensor(0.5731)


 57%|█████▋    | 4/7 [00:34<00:25,  8.63s/it]

epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(4.2223) eval_epoch_loss=tensor(1.4404)


 71%|███████▏  | 5/7 [00:43<00:17,  8.65s/it]

epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(11.6986) eval_epoch_loss=tensor(2.4595)


 86%|████████▌ | 6/7 [00:51<00:08,  8.43s/it]

epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(39.7945) eval_epoch_loss=tensor(3.6837)


100%|██████████| 7/7 [00:55<00:00,  7.89s/it]


epoch=16: train_ppl=tensor(85.7990) train_epoch_loss=tensor(4.4520) eval_ppl=tensor(165.3791) eval_epoch_loss=tensor(5.1082)


100%|██████████| 7/7 [03:40<00:00, 31.54s/it]
 14%|█▍        | 1/7 [00:08<00:50,  8.40s/it]

epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(1.0218) eval_epoch_loss=tensor(0.0216)


 29%|██▊       | 2/7 [00:17<00:42,  8.52s/it]

epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(1.0281) eval_epoch_loss=tensor(0.0277)


 43%|████▎     | 3/7 [00:25<00:34,  8.51s/it]

epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(1.7532) eval_epoch_loss=tensor(0.5615)


 57%|█████▋    | 4/7 [00:33<00:25,  8.51s/it]

epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(4.1363) eval_epoch_loss=tensor(1.4198)


 71%|███████▏  | 5/7 [00:42<00:16,  8.32s/it]

epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(11.3768) eval_epoch_loss=tensor(2.4316)


 86%|████████▌ | 6/7 [00:50<00:08,  8.38s/it]

epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(38.4704) eval_epoch_loss=tensor(3.6499)


100%|██████████| 7/7 [00:54<00:00,  7.79s/it]


epoch=17: train_ppl=tensor(56.1938) train_epoch_loss=tensor(4.0288) eval_ppl=tensor(159.8765) eval_epoch_loss=tensor(5.0744)


100%|██████████| 7/7 [03:34<00:00, 30.67s/it]
 14%|█▍        | 1/7 [00:08<00:50,  8.40s/it]

epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(1.0166) eval_epoch_loss=tensor(0.0165)


 29%|██▊       | 2/7 [00:16<00:42,  8.46s/it]

epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(1.0232) eval_epoch_loss=tensor(0.0229)


 43%|████▎     | 3/7 [00:25<00:33,  8.43s/it]

epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(1.7289) eval_epoch_loss=tensor(0.5475)


 57%|█████▋    | 4/7 [00:33<00:25,  8.50s/it]

epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(4.0012) eval_epoch_loss=tensor(1.3866)


 71%|███████▏  | 5/7 [00:41<00:16,  8.25s/it]

epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(10.9157) eval_epoch_loss=tensor(2.3902)


 86%|████████▌ | 6/7 [00:50<00:08,  8.37s/it]

epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(36.7170) eval_epoch_loss=tensor(3.6032)


100%|██████████| 7/7 [00:54<00:00,  7.77s/it]


epoch=18: train_ppl=tensor(79.2218) train_epoch_loss=tensor(4.3723) eval_ppl=tensor(152.5897) eval_epoch_loss=tensor(5.0278)


100%|██████████| 7/7 [03:34<00:00, 30.64s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.31s/it]

epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(1.0136) eval_epoch_loss=tensor(0.0135)


 29%|██▊       | 2/7 [00:16<00:42,  8.48s/it]

epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(1.0189) eval_epoch_loss=tensor(0.0188)


 43%|████▎     | 3/7 [00:25<00:34,  8.62s/it]

epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(1.6862) eval_epoch_loss=tensor(0.5225)


 57%|█████▋    | 4/7 [00:34<00:26,  8.73s/it]

epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(3.8801) eval_epoch_loss=tensor(1.3559)


 71%|███████▏  | 5/7 [00:43<00:17,  8.76s/it]

epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(10.5206) eval_epoch_loss=tensor(2.3533)


 86%|████████▌ | 6/7 [00:51<00:08,  8.54s/it]

epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(35.2034) eval_epoch_loss=tensor(3.5611)


100%|██████████| 7/7 [00:55<00:00,  7.96s/it]


epoch=19: train_ppl=tensor(74.6241) train_epoch_loss=tensor(4.3125) eval_ppl=tensor(146.2991) eval_epoch_loss=tensor(4.9857)


100%|██████████| 7/7 [03:38<00:00, 31.17s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.80s/it]

epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(1.0101) eval_epoch_loss=tensor(0.0100)


 29%|██▊       | 2/7 [00:17<00:44,  8.93s/it]

epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(1.0151) eval_epoch_loss=tensor(0.0150)


 43%|████▎     | 3/7 [00:26<00:35,  8.86s/it]

epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(1.6648) eval_epoch_loss=tensor(0.5097)


 57%|█████▋    | 4/7 [00:35<00:26,  8.76s/it]

epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(3.7683) eval_epoch_loss=tensor(1.3266)


 71%|███████▏  | 5/7 [00:43<00:17,  8.63s/it]

epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(10.1434) eval_epoch_loss=tensor(2.3168)


 86%|████████▌ | 6/7 [00:52<00:08,  8.55s/it]

epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(33.7531) eval_epoch_loss=tensor(3.5191)


100%|██████████| 7/7 [00:56<00:00,  8.03s/it]


epoch=20: train_ppl=tensor(49.6519) train_epoch_loss=tensor(3.9050) eval_ppl=tensor(140.2721) eval_epoch_loss=tensor(4.9436)


100%|██████████| 7/7 [03:36<00:00, 30.93s/it]
 14%|█▍        | 1/7 [00:08<00:50,  8.49s/it]

epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(1.0110) eval_epoch_loss=tensor(0.0109)


 29%|██▊       | 2/7 [00:17<00:42,  8.56s/it]

epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(1.0156) eval_epoch_loss=tensor(0.0155)


 43%|████▎     | 3/7 [00:25<00:34,  8.71s/it]

epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(1.6387) eval_epoch_loss=tensor(0.4939)


 57%|█████▋    | 4/7 [00:34<00:25,  8.51s/it]

epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(3.6814) eval_epoch_loss=tensor(1.3033)


 71%|███████▏  | 5/7 [00:42<00:16,  8.33s/it]

epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(9.8534) eval_epoch_loss=tensor(2.2878)


 86%|████████▌ | 6/7 [00:50<00:08,  8.42s/it]

epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(32.6214) eval_epoch_loss=tensor(3.4850)


100%|██████████| 7/7 [00:54<00:00,  7.84s/it]


epoch=21: train_ppl=tensor(47.7640) train_epoch_loss=tensor(3.8663) eval_ppl=tensor(135.5688) eval_epoch_loss=tensor(4.9095)


100%|██████████| 7/7 [03:31<00:00, 30.26s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.30s/it]

epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(1.0078) eval_epoch_loss=tensor(0.0078)


 29%|██▊       | 2/7 [00:16<00:40,  8.13s/it]

epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(1.0122) eval_epoch_loss=tensor(0.0121)


 43%|████▎     | 3/7 [00:24<00:33,  8.30s/it]

epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(1.6156) eval_epoch_loss=tensor(0.4797)


 57%|█████▋    | 4/7 [00:33<00:25,  8.46s/it]

epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(3.5794) eval_epoch_loss=tensor(1.2752)


 71%|███████▏  | 5/7 [00:41<00:16,  8.36s/it]

epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(9.5184) eval_epoch_loss=tensor(2.2532)


 86%|████████▌ | 6/7 [00:50<00:08,  8.44s/it]

epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(31.3597) eval_epoch_loss=tensor(3.4455)


100%|██████████| 7/7 [00:54<00:00,  7.76s/it]


epoch=22: train_ppl=tensor(49.1866) train_epoch_loss=tensor(3.8956) eval_ppl=tensor(130.3253) eval_epoch_loss=tensor(4.8700)


100%|██████████| 7/7 [03:36<00:00, 30.87s/it]
 14%|█▍        | 1/7 [00:08<00:52,  8.70s/it]

epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(1.0286) eval_epoch_loss=tensor(0.0282)


 29%|██▊       | 2/7 [00:17<00:42,  8.58s/it]

epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(1.0342) eval_epoch_loss=tensor(0.0336)


 43%|████▎     | 3/7 [00:26<00:34,  8.68s/it]

epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(1.6356) eval_epoch_loss=tensor(0.4920)


 57%|█████▋    | 4/7 [00:34<00:25,  8.61s/it]

epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(3.5851) eval_epoch_loss=tensor(1.2768)


 71%|███████▏  | 5/7 [00:42<00:16,  8.39s/it]

epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(9.4809) eval_epoch_loss=tensor(2.2493)


 86%|████████▌ | 6/7 [00:51<00:08,  8.46s/it]

epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(31.0937) eval_epoch_loss=tensor(3.4370)


100%|██████████| 7/7 [00:55<00:00,  7.89s/it]


epoch=23: train_ppl=tensor(85.5512) train_epoch_loss=tensor(4.4491) eval_ppl=tensor(129.2201) eval_epoch_loss=tensor(4.8615)


100%|██████████| 7/7 [03:35<00:00, 30.73s/it]
 14%|█▍        | 1/7 [00:08<00:49,  8.20s/it]

epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(1.0149) eval_epoch_loss=tensor(0.0148)


 29%|██▊       | 2/7 [00:16<00:41,  8.38s/it]

epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(1.0193) eval_epoch_loss=tensor(0.0191)


 43%|████▎     | 3/7 [00:25<00:34,  8.52s/it]

epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(1.5900) eval_epoch_loss=tensor(0.4637)


 57%|█████▋    | 4/7 [00:33<00:24,  8.29s/it]

epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(3.4692) eval_epoch_loss=tensor(1.2439)


 71%|███████▏  | 5/7 [00:41<00:16,  8.07s/it]

epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(9.1346) eval_epoch_loss=tensor(2.2121)


 86%|████████▌ | 6/7 [00:49<00:08,  8.28s/it]

epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(29.8337) eval_epoch_loss=tensor(3.3956)


100%|██████████| 7/7 [00:53<00:00,  7.66s/it]


epoch=24: train_ppl=tensor(61.7617) train_epoch_loss=tensor(4.1233) eval_ppl=tensor(123.9838) eval_epoch_loss=tensor(4.8202)


100%|██████████| 7/7 [03:38<00:00, 31.14s/it]
 14%|█▍        | 1/7 [00:08<00:50,  8.50s/it]

epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(1.0117) eval_epoch_loss=tensor(0.0117)


 29%|██▊       | 2/7 [00:16<00:42,  8.50s/it]

epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(1.0149) eval_epoch_loss=tensor(0.0148)


 43%|████▎     | 3/7 [00:25<00:34,  8.59s/it]

epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(1.5797) eval_epoch_loss=tensor(0.4572)


 57%|█████▋    | 4/7 [00:34<00:26,  8.71s/it]

epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(3.3971) eval_epoch_loss=tensor(1.2229)


 71%|███████▏  | 5/7 [00:43<00:17,  8.75s/it]

epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(8.8894) eval_epoch_loss=tensor(2.1849)


 86%|████████▌ | 6/7 [00:52<00:08,  8.76s/it]

epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(28.8926) eval_epoch_loss=tensor(3.3636)


100%|██████████| 7/7 [00:56<00:00,  8.04s/it]


epoch=25: train_ppl=tensor(60.4822) train_epoch_loss=tensor(4.1023) eval_ppl=tensor(120.0728) eval_epoch_loss=tensor(4.7881)


100%|██████████| 7/7 [03:37<00:00, 31.10s/it]
 14%|█▍        | 1/7 [00:08<00:50,  8.40s/it]

epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(1.0085) eval_epoch_loss=tensor(0.0085)


 29%|██▊       | 2/7 [00:17<00:42,  8.58s/it]

epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(1.0114) eval_epoch_loss=tensor(0.0114)


 43%|████▎     | 3/7 [00:25<00:34,  8.68s/it]

epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(1.5594) eval_epoch_loss=tensor(0.4443)


 57%|█████▋    | 4/7 [00:33<00:25,  8.41s/it]

epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(3.3295) eval_epoch_loss=tensor(1.2028)


 71%|███████▏  | 5/7 [00:42<00:16,  8.33s/it]

epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(8.6719) eval_epoch_loss=tensor(2.1601)


 86%|████████▌ | 6/7 [00:50<00:08,  8.49s/it]

epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(28.0719) eval_epoch_loss=tensor(3.3348)


100%|██████████| 7/7 [00:55<00:00,  7.87s/it]


epoch=26: train_ppl=tensor(51.5410) train_epoch_loss=tensor(3.9424) eval_ppl=tensor(116.6622) eval_epoch_loss=tensor(4.7593)


100%|██████████| 7/7 [03:39<00:00, 31.31s/it]
 14%|█▍        | 1/7 [00:08<00:51,  8.50s/it]

epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(1.0052) eval_epoch_loss=tensor(0.0052)


 29%|██▊       | 2/7 [00:17<00:43,  8.74s/it]

epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(1.0078) eval_epoch_loss=tensor(0.0078)


 43%|████▎     | 3/7 [00:25<00:33,  8.31s/it]

epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(1.5333) eval_epoch_loss=tensor(0.4274)


 57%|█████▋    | 4/7 [00:34<00:25,  8.55s/it]

epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(3.2905) eval_epoch_loss=tensor(1.1910)


 71%|███████▏  | 5/7 [00:42<00:16,  8.45s/it]

epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(8.5555) eval_epoch_loss=tensor(2.1466)


 86%|████████▌ | 6/7 [00:50<00:08,  8.16s/it]

epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(27.6148) eval_epoch_loss=tensor(3.3184)


100%|██████████| 7/7 [00:54<00:00,  7.73s/it]


epoch=27: train_ppl=tensor(98.3378) train_epoch_loss=tensor(4.5884) eval_ppl=tensor(114.7623) eval_epoch_loss=tensor(4.7429)


 43%|████▎     | 3/7 [01:45<02:21, 35.27s/it]

In [None]:
model.eval()
i=16
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]} Label : ', return_tensors="pt")

with torch.no_grad():
  inputs={k: v for k, v in inputs.items()}
  outputs= model.generate(inputs_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],max_new_tokens=10, eos_token_id=3)


In [None]:
model.push_to_hub(
  f"{dataset_name}_{model_name}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"))

In [None]:
from peft import PeftModel, PeftConfig

peft_model_id = f"{dataset_name}_{model_name}_{peft_config.peft_type}_{peft_config.task_type}".replace(
    "/", "_"
)

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:

model.eval()
i = 4
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
print(dataset["test"][i]["Tweet text"])
print(inputs)

with torch.no_grad():
    inputs = {k: v for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))