In [1]:
import torch
import re
import logging
import numpy as np 
import random

from torch.nn.parallel import DistributedDataParallel as DDP
from functools import partial
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    pipeline
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Section markers used when assembling the training prompts.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
END_KEY = "### End"
RESPONSE_KEY = "### Response:"
# The response marker followed by a newline.
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"

In [3]:
### Name of the pretrained checkpoint.
# The same identifier is passed to get_model_tokenizer below, which uses it to
# load both the tokenizer and the model.
INPUT_MODEL = "EleutherAI/pythia-2.8b"

In [4]:
### Model Loading
def load_tokenizer(pretrained_model_name_or_path):
    """Load a tokenizer and prepare it for the prompt format used in this notebook.

    Args:
        pretrained_model_name_or_path: identifier or path handed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer, with its pad token set to its end-of-sequence token and
        the prompt markers registered as additional special tokens.
    """
    tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

    # Reuse the end-of-sequence token for padding.
    tok.pad_token = tok.eos_token

    # Register the prompt markers as additional special tokens.
    prompt_markers = [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]
    tok.add_special_tokens({"additional_special_tokens": prompt_markers})
    return tok

In [5]:
def load_model(pretrained_model_name_or_path, gradient_checkpointing):
    """Load a causal language model from a pretrained checkpoint.

    Args:
        pretrained_model_name_or_path: identifier or path handed to
            ``AutoModelForCausalLM.from_pretrained``.
        gradient_checkpointing: when truthy the model is loaded with
            ``use_cache=False``; otherwise with ``use_cache=True``.

    Returns:
        The loaded model.
    """
    # use_cache is simply the opposite of the gradient-checkpointing flag.
    enable_cache = not gradient_checkpointing
    return AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True,
        use_cache=enable_cache,
    )

In [6]:
def get_model_tokenizer(pretrained_model_name_or_path, gradient_checkpointing):
    """Load the tokenizer and the model, then size the embeddings to the tokenizer.

    Args:
        pretrained_model_name_or_path: checkpoint identifier or path used for
            both the tokenizer and the model.
        gradient_checkpointing: forwarded to ``load_model``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = load_tokenizer(pretrained_model_name_or_path)
    net = load_model(pretrained_model_name_or_path, gradient_checkpointing)

    # load_tokenizer registers extra special tokens, so the embedding matrix
    # is resized to the tokenizer's current length.
    vocab_size = len(tok)
    net.resize_token_embeddings(vocab_size)
    return net, tok

# Instantiate the model/tokenizer pair for INPUT_MODEL with gradient checkpointing on.
model, tokenizer = get_model_tokenizer(INPUT_MODEL, gradient_checkpointing=True)

In [7]:
# Find the maximum sequence length in the model configuration.
# The attribute name is probed in order ("max_position_embeddings" first, so
# any config that already worked yields the same value); if none is present
# or set, fall back to 1024 instead of leaving max_length as None, which would
# disable truncation and break the later `len(...) < max_length` filter.
conf = model.config
max_length = None
for length_setting in ("max_position_embeddings", "n_positions", "seq_length"):
    max_length = getattr(conf, length_setting, None)
    if max_length:
        break
if not max_length:
    max_length = 1024

In [8]:
# Opening sentence shared by both prompt templates.
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# Training prompt without an input/context string. The "{instruction}" and
# "{response}" placeholders are left in place to be filled per record.
PROMPT_NO_INPUT_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "{response}",
        END_KEY,
    ]
)

# Training prompt with an additional "{input}" placeholder that carries the
# context for the instruction.
PROMPT_WITH_INPUT_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        INPUT_KEY,
        "{input}",
        RESPONSE_KEY,
        "{response}",
        END_KEY,
    ]
)

In [21]:
# Load the train split and keep only its first 1000 examples
dataset = load_dataset("databricks/databricks-dolly-15k", split="train").select(range(1000))

Found cached dataset json (C:/Users/vmpletsos/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
100%|██████████| 1/1 [00:00<00:00, 500.04it/s]


In [23]:
def format_for_LLM(rec):
    """Render one record into its training prompt and store it under ``rec["text"]``.

    Args:
        rec: mapping with ``"instruction"`` and ``"response"`` entries and an
            optional ``"context"`` entry.

    Returns:
        The same ``rec`` object, with a ``"text"`` entry added. The template
        with an input section is used when the context is truthy; otherwise
        the template without one is used.
    """
    fields = {"instruction": rec["instruction"], "response": rec["response"]}
    context = rec.get("context")

    if not context:
        template = PROMPT_NO_INPUT_FORMAT
    else:
        template = PROMPT_WITH_INPUT_FORMAT
        fields["input"] = context

    rec["text"] = template.format(**fields)
    return rec

In [24]:
# Apply format_for_LLM to every record, which adds the formatted prompt under "text".
dataset = dataset.map(format_for_LLM)

                                                    

In [25]:
def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``"text"`` entry of a batch with truncation enabled.

    Args:
        batch: mapping whose ``"text"`` entry holds the prompts to tokenize.
        tokenizer: callable invoked as
            ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: maximum length forwarded to the tokenizer.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded

In [26]:
# Bind the tokenizer and length limit so the mapper only needs the batch argument.
preprocessing_function = partial(
    preprocess_batch,
    tokenizer=tokenizer,
    max_length=max_length,
)

In [27]:
# Tokenize in batches and drop the listed source columns from the result.
dataset = dataset.map(
    preprocessing_function,
    remove_columns=["instruction", "context", "response", "text", "category"],
    batched=True,
)

                                                                 

In [28]:
# Make sure we don't have any truncated records, as this would mean the end
# keyword is missing: keep only records strictly shorter than max_length.
dataset = dataset.filter(lambda rec: len(rec["input_ids"]) < max_length)
# Shuffle with a fixed seed so a fresh "Restart & Run All" reproduces the same order.
dataset = dataset.shuffle(seed=42)

                                                                    

In [29]:
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that masks the prompt part of every example.

    After the parent collator has built the batch, all label positions up to
    and including the first occurrence of the response-key token are set to
    -100, so the loss function ignores everything through the end of the
    response key.
    """

    def torch_call(self, examples):
        """Collate ``examples`` with the parent class, then mask the prompt labels.

        Args:
            examples: the examples handed to the parent ``torch_call``.

        Returns:
            The batch produced by the parent, with ``batch["labels"]`` replaced
            by a masked copy.

        Raises:
            RuntimeError: if the first token id of the encoded response key
                does not occur in the labels of an example.
        """
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline; only the first
        # id of its encoding is searched for.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)
        marker_id = response_token_ids[0]
        masked_labels = batch["labels"].clone()

        for i, _ in enumerate(examples):
            marker_positions = np.where(batch["labels"][i] == marker_id)[0]

            if len(marker_positions) == 0:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
                )

            # Mask everything up to and including the first marker occurrence.
            prompt_end = marker_positions[0] + 1
            masked_labels[i, :prompt_end] = -100

        batch["labels"] = masked_labels
        return batch

In [30]:
# Collator instance handed to the Trainer below.
data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [31]:
# Directory used for training output, the logging sub-directory and the saved model.
local_output_dir = "./dolly_train"

In [32]:
training_args = TrainingArguments(
    # Output location
    output_dir=local_output_dir,
    # Batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Numeric precision
    bf16=True,
    fp16=False,
    # Optimization schedule
    learning_rate=1e-5,
    warmup_steps=0,
    num_train_epochs=5,
    gradient_checkpointing=True,
    deepspeed=None,
    # Logging
    report_to="tensorboard",
    logging_dir=f"{local_output_dir}/runs",
    logging_strategy="steps",
    logging_steps=10,
    disable_tqdm=True,
    # Evaluation
    evaluation_strategy="steps",
    eval_steps=500,
    # Checkpointing
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=10,
    load_best_model_at_end=False,
    # Dataset columns are passed through unchanged
    remove_unused_columns=False,
)

In [33]:
# Hold out 10% of the records for evaluation. The fixed seed keeps the
# train/test membership identical across fresh runs of the notebook.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [34]:
# Wire the model, data splits, collator and training arguments into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [35]:
# Start fine-tuning.
# NOTE(review): the output saved with this cell shows a CUDA out-of-memory
# error on a 12 GiB GPU, so the recorded run did not complete.
trainer.train()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB (GPU 0; 12.00 GiB total capacity; 11.28 GiB already allocated; 0 bytes free; 11.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the trainer's model into local_output_dir.
trainer.save_model(output_dir=local_output_dir)