In [1]:
import os
import sys
from typing import List

import torch
import transformers
from datasets import load_dataset

"""
Unused imports:
import torch.nn as nn
import bitsandbytes as bnb
"""

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

from typing import Any, Dict, List, Tuple, Union

from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/sush/miniconda3/envs/llm/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so
CUDA exception! Error code: no CUDA-capable device is detected
CUDA exception! Error code: initialization error
CUDA SETUP: CUDA runtime path found: /home/sush/miniconda3/envs/llm/lib/libcudart.so.11.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/sush/miniconda3/envs/llm/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn(msg)


In [2]:
data_path: str = "yahma/alpaca-cleaned"
output_dir: str = "./output/v1"
# training hyperparams
batch_size: int = 128
micro_batch_size: int = 1
num_epochs: int = 3
learning_rate: float = 3e-4
# cutoff_len: int = 256
val_set_size: int = 2000
# lora hyperparams
lora_r: int = 8
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_target_modules = "query_key_value",
# llm hyperparams
train_on_inputs: bool = True # if False, masks out inputs in loss
add_eos_token: bool = True
group_by_length: bool = False  # faster, but produces an odd training loss curve
# wandb params
wandb_project: str = ""
wandb_run_name: str = ""
wandb_watch: str = ""  # options: false | gradients | all
wandb_log_model: str = ""  # options: false | true
resume_from_checkpoint: str = None  # either training checkpoint or final adapter
# prompt_template_name: str = "alpaca"
test_size = 0.1
seed = 42
gradient_accumulation_steps = batch_size // micro_batch_size

In [3]:
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    gradient_accumulation_steps = gradient_accumulation_steps // world_size

In [4]:
# def load_tokenizer(pretrained_model_name_or_path) -> PreTrainedTokenizer:
#     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
#     tokenizer.pad_token = tokenizer.eos_token
#     #tokenizer.add_special_tokens({"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]})
#     return tokenizer


# def load_model(
#     pretrained_model_name_or_path: str, *, gradient_checkpointing: bool = False
# ) -> AutoModelForCausalLM:
#     model = AutoModelForCausalLM.from_pretrained(
#         pretrained_model_name_or_path, 
#         trust_remote_code=True, 
#         load_in_8bit=True,
#         torch_dtype=torch.bfloat16,
#         device_map='auto',
#         use_cache=False if gradient_checkpointing else True
#     )
#     return model


# def get_model_tokenizer(
#     pretrained_model_name_or_path: str, *, gradient_checkpointing: bool = False
# ) -> Tuple[AutoModelForCausalLM, PreTrainedTokenizer]:
#     tokenizer = load_tokenizer(pretrained_model_name_or_path)
#     model = load_model(pretrained_model_name_or_path, gradient_checkpointing=gradient_checkpointing)
#     model.resize_token_embeddings(len(tokenizer))

#     return model, tokenizer
# model, tokenizer = get_model_tokenizer("databricks/dolly-v2-3b")

In [5]:
from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast

base_model = "databricks/dolly-v2-3b"
model = GPTNeoXForCausalLM.from_pretrained(
        base_model,
        # load_in_8bit=True,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
    )

tokenizer = GPTNeoXTokenizerFast.from_pretrained(base_model)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"  # Allow batched inference


In [6]:
from datasets import load_from_disk
from torch.utils.data import DataLoader
import numpy as np

dataset = load_from_disk("data/polyglot_processed_2000/")

In [7]:
RESPONSE_KEY_NL = "ENTITIES: "

class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline.  We encode this and then try to find it in the
        # sequence of tokens.  This should just be a single token.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)

        labels = batch["labels"].clone()

        for i in range(len(examples)):

            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                response_token_ids_start_idx = idx
                break

            if response_token_ids_start_idx is None:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
                )

            response_token_ids_end_idx = response_token_ids_start_idx + 1

            # Make pytorch loss function ignore all tokens up through the end of the response key
            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch

In [8]:
dataCollator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False,
    pad_to_multiple_of = 8,
    return_tensors = 'pt')

In [9]:
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)



In [10]:

dataset = dataset.train_test_split(test_size=test_size, seed=seed)
dataset

Loading cached split indices for dataset at /mnt/e/Projects/generic_ner/data/polyglot_processed_2000/cache-258eb9386a72366b.arrow and /mnt/e/Projects/generic_ner/data/polyglot_processed_2000/cache-805704d532b2f14d.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [11]:
trainer = transformers.Trainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=1,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            bf16=True,
            logging_steps=1,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps",
            eval_steps=200 if val_set_size > 0 else None,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=True if val_set_size > 0 else False,
            ddp_find_unused_parameters=False if ddp else None,
            group_by_length=group_by_length,
            report_to=None,
            # optim="adamw_bnb_8bit",
            # run_name=wandb_run_name if use_wandb else None,
        ),
        data_collator = DataCollatorForCompletionOnlyLM(
            tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
        )
    )

Using cuda_amp half precision backend


In [12]:
trainer.train()

***** Running training *****
  Num examples = 1800
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 5400
  Number of trainable parameters = 2621440
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,0.0008,0.469492
400,0.3078,0.409857
600,0.3122,0.26222
800,0.2428,0.25254
1000,0.1506,0.298132
1200,0.5327,0.248937
1400,0.0001,0.223334
1600,0.1627,0.248102
1800,0.0999,0.202908
2000,0.0,0.229824


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./lora-alpaca/checkpoint-200
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./lora-alpaca/checkpoint-400
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./lora-alpaca/checkpoint-600
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./lora-alpaca/checkpoint-800
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Deleting older checkpoint [lora-alpaca/checkpoint-200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./lora-alpaca/checkpoint-1000
Trainer.mo

TrainOutput(global_step=5400, training_loss=0.10001079444363886, metrics={'train_runtime': 1186.4046, 'train_samples_per_second': 4.552, 'train_steps_per_second': 4.552, 'total_flos': 9707639735992320.0, 'train_loss': 0.10001079444363886, 'epoch': 3.0})

In [89]:
txt = '''Your job is to extract the entities provided the definition of each entity which is provided below.

ORG: Represents a formal group or entity such as a company or organization.
PER: Refers to an individual person or a group of individuals.
LOC: Represents a specific place or geographical location.

QUERY: "Why are you working?"
ENTITIES:
'''

In [90]:
input_ids = tokenizer.encode(txt, return_tensors='pt')

In [91]:
# generate text until the output length (which includes the context length) reaches 50
greedy_output = trainer.model.generate(input_ids=input_ids, 
                                       max_length=200, 
                                       # num_beams=5,
                                       do_sample=True,
                                       top_k=2,
                                       # no_repeat_ngram_size=2,
                                       temperature=0.00001,
                                       early_stopping=True)

Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0,
  "transformers_version": "4.26.1"
}

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [92]:
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Your job is to extract the entities provided the definition of each entity which is provided below.

ORG: Represents a formal group or entity such as a company or organization.
PER: Refers to an individual person or a group of individuals.
LOC: Represents a specific place or geographical location.

QUERY: "Why are you working?"
ENTITIES:
PER: "John", "Doe"
LOC: "London"
ORG: "John", "Doe"
PER: "John", "Doe"
LOC: "London"
ORG: "John", "Doe"
PER: "John", "Doe"
LOC: "London"
ORG: "John", "Doe"
PER: "John", "Doe"
LOC: "London"
ORG: "John", "Doe"
PER: "John", "Doe"
LOC


In [32]:
trainer.model.save_pretrained('models/v1')