In [5]:
import re
import logging
from functools import partial
import numpy as np 
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments
)
from torch.optim import AdamW, Adagrad

In [2]:
# Notebook-wide logger, registered under the literal name "logger".
logger = logging.getLogger("logger")

In [3]:
# Section markers used when building prompts. END_KEY, INSTRUCTION_KEY and
# RESPONSE_KEY_NL are the ones registered with the tokenizer as additional
# special tokens; INPUT_KEY only appears as plain text inside the prompt.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
# The response marker together with the newline that follows it in the prompt.
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"

In [4]:
# Checkpoint identifier used to load both the model and its tokenizer.
INPUT_MODEL = "EleutherAI/pythia-2.8b"

### Model Loading
def load_tokenizer(pretrained_model_name_or_path):
    """Load the tokenizer and register the prompt markers as special tokens.

    The EOS token doubles as the padding token. ``END_KEY``,
    ``INSTRUCTION_KEY`` and ``RESPONSE_KEY_NL`` are added as additional
    special tokens, in that order.

    Args:
        pretrained_model_name_or_path: Identifier forwarded to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The configured tokenizer.
    """
    extra_special_tokens = [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]
    tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    tok.pad_token = tok.eos_token
    tok.add_special_tokens({"additional_special_tokens": extra_special_tokens})
    return tok

def load_model(pretrained_model_name_or_path, gradient_checkpointing):
    """Load the causal language model.

    Args:
        pretrained_model_name_or_path: Identifier forwarded to
            ``AutoModelForCausalLM.from_pretrained``.
        gradient_checkpointing: When truthy, the model is loaded with
            ``use_cache=False``; otherwise with ``use_cache=True``.

    Returns:
        The loaded model.
    """
    # The cache is switched off whenever gradient checkpointing is requested.
    cache_enabled = not gradient_checkpointing
    return AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True,
        use_cache=cache_enabled,
    )

def get_model_tokenizer(
    pretrained_model_name_or_path, gradient_checkpointing):
    """Load a matching tokenizer/model pair.

    The tokenizer is loaded first (which adds the special prompt tokens), then
    the model, whose token embeddings are resized to the tokenizer's length.

    Args:
        pretrained_model_name_or_path: Identifier used for both loaders.
        gradient_checkpointing: Forwarded to ``load_model``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = load_tokenizer(pretrained_model_name_or_path)
    lm = load_model(pretrained_model_name_or_path, gradient_checkpointing)
    # The embedding table has to cover the special tokens added to the tokenizer.
    lm.resize_token_embeddings(len(tok))
    return lm, tok

# Build the model/tokenizer pair used by the rest of the notebook.
model, tokenizer = get_model_tokenizer(INPUT_MODEL, gradient_checkpointing=True)

# Longest sequence declared by the model configuration (None if it declares none).
conf = model.config
max_length = getattr(conf, "max_position_embeddings", None)

In [10]:
# Opening sentence shared by both training prompts.
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# Training prompt for records without an input/context string. The
# "{instruction}" and "{response}" placeholders are left in place so the
# template can be filled per record with str.format.
PROMPT_NO_INPUT_FORMAT = "\n".join(
    (
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "{response}",
        END_KEY,
    )
)

# Training prompt for records that carry an input string serving as context;
# adds an "{input}" placeholder under INPUT_KEY.
PROMPT_WITH_INPUT_FORMAT = "\n".join(
    (
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        INPUT_KEY,
        "{input}",
        RESPONSE_KEY,
        "{response}",
        END_KEY,
    )
)

def load_training_dataset(
    path_or_dataset="databricks/databricks-dolly-15k", num_records=1000
):
    """Load the ``train`` split and attach a formatted ``text`` prompt column.

    Args:
        path_or_dataset: Identifier forwarded to ``load_dataset``.
        num_records: Number of leading records to keep. Clamped to the size of
            the split, so a dataset smaller than the requested subset no
            longer raises. ``None`` keeps every record.

    Returns:
        The (possibly truncated) dataset with an extra ``text`` column holding
        the prompt built from ``instruction``, ``response`` and, when it is
        non-empty, ``context``.
    """
    dataset = load_dataset(path_or_dataset)["train"]

    if num_records is not None:
        # Clamp to [0, len(dataset)] so select() never gets an out-of-range index.
        keep = max(0, min(num_records, len(dataset)))
        dataset = dataset.select(range(keep))

    def _add_text(rec):
        instruction = rec["instruction"]
        response = rec["response"]
        context = rec.get("context")
        if context:
            # A non-empty context is rendered under the input key.
            rec["text"] = PROMPT_WITH_INPUT_FORMAT.format(
                instruction=instruction,
                response=response,
                input=context,
            )
        else:
            rec["text"] = PROMPT_NO_INPUT_FORMAT.format(
                instruction=instruction,
                response=response,
            )
        return rec

    return dataset.map(_add_text)

def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``text`` entries of a batch.

    Args:
        batch: Mapping with a ``"text"`` entry holding the prompts.
        tokenizer: Callable invoked as
            ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Truncation limit handed to the tokenizer.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

def preprocess_dataset(tokenizer, max_length, seed=None):
    """Load, tokenize, filter and shuffle the training dataset.

    Args:
        tokenizer: Tokenizer handed to ``preprocess_batch``.
        max_length: Truncation limit; records whose tokenized length is not
            strictly below it are dropped.
        seed: Optional seed forwarded to ``Dataset.shuffle`` so the record
            order can be made reproducible. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        The tokenized dataset with the raw text columns removed, shuffled.
    """
    dataset = load_training_dataset()

    tokenize = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        tokenize,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # A truncated record has lost its end keyword, so drop anything that
    # reached the limit.
    dataset = dataset.filter(lambda rec: len(rec["input_ids"]) < max_length)
    return dataset.shuffle(seed=seed)

In [11]:
# Fine-tune a single transformer block and freeze everything else.
#
# The block is selected by a whole dotted name component equal to "30"
# (e.g. "...layers.30.attention...") instead of the bare substring "30", so a
# parameter whose name merely contains those two digits is not picked up.
TRAINABLE_BLOCK_PATTERN = re.compile(r"(?:^|\.)30(?:\.|$)")

# Walk the whole model rather than only model.base_model: a parameter living
# outside the base model would otherwise keep requires_grad=True and accumulate
# gradients on every step while never being updated, because the optimizer
# below is built only from the parameters selected here.
for name, param in model.named_parameters():
    param.requires_grad = TRAINABLE_BLOCK_PATTERN.search(name) is not None

trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = Adagrad(trainable_params, lr=1e-4)

In [12]:
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Collator that restricts the loss to the response part of each prompt.

    After the parent collator has built the batch, every label up to and
    including the response key (``RESPONSE_KEY_NL``) is replaced by -100.
    """

    def torch_call(self, examples):
        """Collate ``examples`` and mask the prompt portion of the labels.

        Args:
            examples: Tokenized records, as accepted by the parent collator.

        Returns:
            The batch produced by the parent collator with ``labels`` masked
            up through the end of the response key.

        Raises:
            RuntimeError: If the response key cannot be found in an example.
        """
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)
        key_len = len(response_token_ids)
        labels = batch["labels"].clone()

        for i in range(len(examples)):
            row = batch["labels"][i]
            response_token_ids_start_idx = None

            # Candidate positions are those matching the first key token; a
            # candidate only counts when the complete key follows, so a stray
            # occurrence of the first token is not mistaken for the key.
            for idx in np.where(row == response_token_ids[0])[0]:
                idx = int(idx)
                if row[idx : idx + key_len].tolist() == response_token_ids:
                    response_token_ids_start_idx = idx
                    break

            if response_token_ids_start_idx is None:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
                )

            # Mask through the last token of the key, however many tokens the
            # key encodes to (one token when it is a registered special token).
            response_token_ids_end_idx = response_token_ids_start_idx + key_len

            # loss function ignore all tokens up through the end of the response key
            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch

# Collator instance handed to the Trainer: causal-LM mode (mlm=False), PyTorch
# tensors, padding requested to a multiple of 8.
data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
    pad_to_multiple_of=8,
)

In [13]:
# Tokenize the training set with a 256-token limit; preprocess_dataset drops
# every record whose tokenized length is not strictly below that limit.
dataset = preprocess_dataset(tokenizer, 256)

Found cached dataset json (C:/Users/User/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████| 1/1 [00:00<00:00, 166.60it/s]
Loading cached processed dataset at C:\Users\User\.cache\huggingface\datasets\databricks___json\databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-6a83a972f2182ec4.arrow
Loading cached processed dataset at C:\Users\User\.cache\huggingface\datasets\databricks___json\databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-6f4fad31e2d6165d.arrow
Loading cached processed dataset at C:\Users\User\.cache\huggingface\datasets\databricks___json\databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-43cf00d245bd3145.arrow


In [14]:
# Hold out 10% of the records as the "test" split, using a fixed split seed.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [16]:
# Training configuration: batch size 1 with 16 accumulation steps, fp16 mixed
# precision, 5 epochs.
training_args = TrainingArguments(
    output_dir="./train",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True,
    bf16=False,
    num_train_epochs=5,
    gradient_accumulation_steps=16,
    eval_accumulation_steps=16,
    deepspeed=None,
    # NOTE(review): the model was loaded with gradient_checkpointing=True (which
    # only disables use_cache), but checkpointing itself stays off here, and the
    # recorded run of this cell ended in a CUDA out-of-memory error. Confirm
    # whether checkpointing was meant to be enabled.
    gradient_checkpointing=False,
    load_best_model_at_end=False,
    # The documented way to disable every reporting integration is the string
    # "none"; passing None does not express that.
    report_to="none",
)

# The optimizer built earlier is supplied explicitly; the scheduler slot is left
# as None.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

trainer.train()
trainer.save_model(output_dir="./train/model")

  0%|          | 0/210 [00:00<?, ?it/s]You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB (GPU 0; 12.00 GiB total capacity; 11.14 GiB already allocated; 0 bytes free; 11.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF