In [1]:
!export HF_HOME=/vol/bitbucket/rm1623/.cache/

In [2]:
import os 
os.environ["HF_HOME"] = "/vol/bitbucket/rm1623/.cache/"


In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

import wandb
import torch 
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [3]:
model_name = 'facebook/opt-350m'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [54]:
# model.config

In [5]:
dataset = load_dataset("tatsu-lab/alpaca")
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})

## OPTForCausalLM

In [177]:
import torch
torch.__version__

'2.2.2+cu121'

In [180]:
from dataclasses import dataclass
from typing import Optional, Tuple, Union, List
from dataclasses import fields, is_dataclass
from functools import partial
from typing import Any, List, Tuple, Iterable
from transformers.modeling_outputs import CausalLMOutputWithPast


In [181]:
from transformers import OPTPreTrainedModel, OPTModel

### Test OPT to understand logic

In [81]:
optmodel = OPTModel.from_pretrained("facebook/opt-350m")

In [83]:
sample_output = optmodel(**tokenizer("Hello, my dog is cute", return_tensors="pt"))


In [87]:
len(tokenizer("Hello, my dog is cute", return_tensors="pt").input_ids[0])

7

In [95]:
labels = tokenizer("Hello, my dog is cute", return_tensors="pt").input_ids
labels

tensor([[    2, 31414,     6,   127,  2335,    16, 11962]])

In [84]:
sample_output[0].shape

torch.Size([1, 7, 512])

In [106]:
lm_head = nn.Linear(512, 16, bias=False)

logits = lm_head(sample_output[0])

logits = nn.Sigmoid()(logits)

In [107]:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
shift_logits.shape, shift_labels.shape

(torch.Size([1, 6, 16]), torch.Size([1, 6]))

In [108]:
shift_logits[0][0]

tensor([0.8788, 0.8846, 0.9053, 0.6172, 0.9827, 0.8940, 0.5551, 0.7627, 0.6673,
        0.9324, 0.2848, 0.9559, 0.3093, 0.0927, 0.5010, 0.9839],
       grad_fn=<SelectBackward0>)

In [109]:
import torch
def int_to_bin_tensor(val, length=16):
    bin_str = format(val, '0' + str(length) + 'b')
    bin_tensor = torch.tensor([int(bit) for bit in bin_str])
    return bin_tensor
# int_to_bin_tensor(2)

binary_tensors = [int_to_bin_tensor(val.item()) for val in shift_labels.flatten()]


binary_tensors = torch.stack(binary_tensors).view(*shift_labels.shape, -1)

binary_tensors

tensor([[[0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0]]])

In [110]:
loss_fct = nn.L1Loss()
loss = loss_fct(shift_logits, binary_tensors)

In [111]:
loss

tensor(0.4442, grad_fn=<MeanBackward0>)

### Modify OPT Class

In [182]:

class OPTForCausalLM(OPTPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = OPTModel(config)

        # the lm_head weight is automatically tied to the embed tokens weight
        self.bit_size = torch.log2(torch.tensor(config.vocab_size)).ceil().int().item()
        self.lm_head = nn.Sequential(
            nn.Linear(config.word_embed_proj_dim, self.bit_size, bias=False),
            nn.Sigmoid(),
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder
    
    def int_to_bin_tensor(self, val):
        length = self.bit_size
        bin_str = format(val, '0' + str(length) + 'b')
        bin_tensor = torch.tensor([int(bit) for bit in bin_str])
        return bin_tensor

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0]).contiguous()  # (bs, seq_length, bit_size)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            
            # convert the labels to binary - currently they are indexes of the tokenizer
            binary_tensors = [self.int_to_bin_tensor(val.item()) for val in shift_labels.flatten()]
            # get the binary tokens in the same shape as the original tensor
            binary_tensors = torch.stack(binary_tensors).view(*shift_labels.shape, -1)
            binary_tensors = binary_tensors.to(logits.device)
            # add L1 loss
            loss_fct = nn.L1Loss()
            loss = loss_fct(shift_logits, binary_tensors)
        
        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


In [183]:
model = OPTForCausalLM.from_pretrained("facebook/opt-350m")

loading configuration file config.json from cache at /homes/rm1623/.cache/huggingface/hub/models--facebook--opt-350m/snapshots/08ab08cc4b72ff5593870b5d527cf4230323703c/config.json
Model config OPTConfig {
  "_name_or_path": "opt-350m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": false,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 4096,
  "hidden_size": 1024,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "prefix": "</s>",
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 50272,
  "word_embed_proj_dim": 512
}

loading weights file pytorch_model.bin from cache at /ho

In [184]:
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

### Data Related

In [113]:
# import torch

# def hamming_loss(predicted, target):
#     """Compute the Hamming loss between predicted and target bit strings"""
#     return (predicted != target).float().mean()



In [117]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
import numpy as np

In [118]:
sample_dataset = dataset['train'].select(range(5))

In [119]:
texts = dataset["train"]["text"]

In [120]:
def tokenize_function(examples):
        return tokenizer(examples["text"])

In [121]:
sample_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 5
})

In [122]:
texts = dataset["train"]["text"]
block_size = 10
sample_tokenized_datasets = sample_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=['instruction', 'input', 'output', 'text'],
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )


Running tokenizer on dataset: 100%|██████████| 5/5 [00:00<00:00, 305.35 examples/s]


In [123]:
sample_tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5
})

In [124]:
from itertools import chain 

def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, and if the total_length < block_size  we exclude this batch and return an empty dict.
        # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
        total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        print(result)
        return result

In [125]:
sample_lm_datasets = sample_tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=1,
            load_from_cache_file=False,
            desc=f"Grouping texts in chunks of {block_size}",
        )

Grouping texts in chunks of 10: 100%|██████████| 5/5 [00:00<00:00, 297.76 examples/s]

{'input_ids': [[2, 45943, 16, 41, 15741, 14, 7448, 10, 3685, 4], [21062, 10, 1263, 14, 16574, 25830, 5, 2069, 4, 50118], [50118, 48134, 41241, 35, 50118, 31033, 130, 4965, 13, 4959], [2245, 4, 50118, 50118, 48134, 19121, 35, 50118, 134, 4], [43800, 10, 9320, 5626, 8, 146, 686, 7, 680, 2710], [9, 12849, 8, 8942, 4, 1437, 50118, 176, 4, 30450], [4595, 7, 489, 110, 809, 2171, 8, 670, 4, 1437], [50118, 246, 4, 2315, 615, 3581, 8, 3014, 10, 4292], [3581, 3078, 4, 2, 45943, 16, 41, 15741, 14, 7448], [10, 3685, 4, 21062, 10, 1263, 14, 16574, 25830, 5], [2069, 4, 50118, 50118, 48134, 41241, 35, 50118, 2264, 32], [5, 130, 2270, 8089, 116, 50118, 50118, 48134, 19121, 35], [50118, 133, 130, 2270, 8089, 32, 1275, 6, 2440, 6], [8, 5718, 4, 2, 45943, 16, 41, 15741, 14, 7448], [10, 3685, 4, 21062, 10, 1263, 14, 16574, 25830, 5], [2069, 4, 50118, 50118, 48134, 41241, 35, 50118, 47066, 21700], [5, 3184, 9, 41, 37113, 4, 50118, 50118, 48134, 19121], [35, 50118, 4688, 37113, 16, 156, 62, 9, 10, 38531], [




In [126]:
sample_lm_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 55
})

## Define Custom Training Loop

[Sample Code](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py#L52)

In [127]:
from transformers import default_data_collator, get_scheduler

In [128]:
train_dataloader = DataLoader(
        sample_lm_datasets, shuffle=True, collate_fn=default_data_collator, batch_size=1
    )

In [129]:
no_decay = ["bias", "layer_norm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.001,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-4)


In [148]:
num_train_epochs = 2
gradient_accumulation_steps = 2
num_warmup_steps = 5
num_processes = 1

In [149]:
import math
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
overrode_max_train_steps = True


In [150]:
lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps * num_processes,
        num_training_steps=max_train_steps
)

In [151]:
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed

In [152]:
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)


In [153]:
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, None, lr_scheduler
    )

In [154]:
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)

In [155]:
checkpointing_steps = 100
per_device_train_batch_size = 1

total_batch_size = per_device_train_batch_size * num_processes * gradient_accumulation_steps

In [156]:
from tqdm import tqdm 
with_tracking = True

resume_from_checkpoint = False
starting_epoch=0
completed_steps = 0

resume_step = None

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
per_device_eval_batch_size = 1

  0%|          | 0/56 [00:59<?, ?it/s]


In [157]:
for epoch in range(starting_epoch, num_train_epochs):
        model.train()
        if with_tracking:
            total_loss = 0
        if resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
            # We skip the first `n` batches in the dataloader when resuming from a checkpoint
            active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
        else:
            active_dataloader = train_dataloader
        for step, batch in enumerate(active_dataloader):
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                # We keep track of the loss at each epoch
                if with_tracking:
                    total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps}"
                    accelerator.save_state(output_dir)
            if completed_steps >= max_train_steps:
                break

        model.eval()
        losses = []
        if eval_dataloader is not None:
            for step, batch in enumerate(eval_dataloader):
                with torch.no_grad():
                    outputs = model(**batch)

                loss = outputs.loss
                losses.append(accelerator.gather_for_metrics(loss.repeat(per_device_eval_batch_size)))

            losses = torch.cat(losses)
            try:
                eval_loss = torch.mean(losses)
                perplexity = math.exp(eval_loss)
            except OverflowError:
                perplexity = float("inf")

            print(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")

            if with_tracking:
                accelerator.log(
                    {
                        "perplexity": perplexity,
                        "eval_loss": eval_loss,
                        "train_loss": total_loss.item() / len(train_dataloader),
                        "epoch": epoch,
                        "step": completed_steps,
                    },
                    step=completed_steps,
                )


if with_tracking:
    accelerator.end_training()

# if args.output_dir is not None:
#     accelerator.wait_for_everyone()
#     unwrapped_model = accelerator.unwrap_model(model)
#     unwrapped_model.save_pretrained(
#         args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
#     )
#     if accelerator.is_main_process:
#         tokenizer.save_pretrained(args.output_dir)
#         if args.push_to_hub:
#             api.upload_folder(
#                 commit_message="End of training",
#                 folder_path=args.output_dir,
#                 repo_id=repo_id,
#                 repo_type="model",
#                 token=args.hub_token,
#             )
#         with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
#             json.dump({"perplexity": perplexity}, f)



 98%|█████████▊| 55/56 [00:18<00:00, 10.96it/s]

100%|██████████| 56/56 [00:33<00:00, 10.96it/s]

## Use Trainer

In [185]:
import evaluate 
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)

metric = evaluate.load("accuracy", cache_dir="/vol/bitbucket/rm1623/.cache")

In [186]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics but we need to shift the labels
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)
    return metric.compute(predictions=preds, references=labels)

In [187]:
from transformers import Trainer, TrainingArguments
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    push_to_hub=False,
)

trainer = Trainer(
        model=model,
        args=training_params,
        train_dataset=sample_lm_datasets,
        eval_dataset=None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        # compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None,
        # preprocess_logits_for_metrics=preprocess_logits_for_metrics
        # if training_args.do_eval and not is_torch_xla_available()
        # else None,
    )


PyTorch: setting up devices
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [188]:
trainer.train()

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
***** Running training *****
  Num examples = 55
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 55
  Number of trainable parameters = 331,204,608


Step,Training Loss
25,0.3026
50,0.2911


Saving model checkpoint to ./results/tmp-checkpoint-25
Configuration saved in ./results/tmp-checkpoint-25/config.json
Configuration saved in ./results/tmp-checkpoint-25/generation_config.json
Model weights saved in ./results/tmp-checkpoint-25/model.safetensors
tokenizer config file saved in ./results/tmp-checkpoint-25/tokenizer_config.json
Special tokens file saved in ./results/tmp-checkpoint-25/special_tokens_map.json
Saving model checkpoint to ./results/tmp-checkpoint-50
Configuration saved in ./results/tmp-checkpoint-50/config.json
Configuration saved in ./results/tmp-checkpoint-50/generation_config.json
Model weights saved in ./results/tmp-checkpoint-50/model.safetensors
tokenizer config file saved in ./results/tmp-checkpoint-50/tokenizer_config.json
Special tokens file saved in ./results/tmp-checkpoint-50/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=55, training_loss=0.2949854915792292, metrics={'train_runtime': 75.7708, 'train_samples_per_second': 0.726, 'train_steps_per_second': 0.726, 'total_flos': 1001108275200.0, 'train_loss': 0.2949854915792292, 'epoch': 1.0})