In [24]:
import os
import bitsandbytes as bnb
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)

In [2]:
model_id = "databricks/dolly-v2-3b"

# 4-bit NF4 quantization with nested ("double") quantization enabled;
# bfloat16 is used as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # Note the `bnb_4bit_` prefix: a misspelled keyword such as
    # `load_4bit_use_double_quant` is not a BitsAndBytesConfig field and
    # would leave double quantization switched off.
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base causal LM with the quantization config applied at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token so that batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Run PEFT's k-bit training preparation on the quantized model; the result
# replaces `model` for all following cells.
model = prepare_model_for_kbit_training(model)

In [4]:
# LoRA adapter settings: rank-16 adapters (alpha 32, dropout 0.05) on the
# "query_key_value" modules, with no bias parameters trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters described by `config`.
model = get_peft_model(model, config)

In [5]:
# Section markers used when building the prompt templates.
# NOTE(review): these are plain strings; nothing visible in this notebook
# registers them with the tokenizer as special tokens - confirm whether
# that step is still intended.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
# The response marker followed by a newline.
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"

In [6]:
# Opening sentence shared by both prompt templates.
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# Training prompt for records that have NO input/context string.
# The fixed pieces are interpolated right here; "{instruction}" and
# "{response}" are kept as literal placeholders so the template can be
# completed per record with str.format().
PROMPT_NO_INPUT_FORMAT = (
    f"{INTRO_BLURB}\n"
    "\n"
    f"{INSTRUCTION_KEY}\n"
    "{instruction}\n"
    "\n"
    f"{RESPONSE_KEY}\n"
    "{response}\n"
    "\n"
    f"{END_KEY}\n"
)


# Training prompt for records that DO carry an input/context string. It has
# the same layout as the template above plus an input section, and it also
# contains the response and the end key. Remaining placeholders:
# "{instruction}", "{context}" and "{response}".
PROMPT_WITH_INPUT_FORMAT = (
    f"{INTRO_BLURB}\n"
    "\n"
    f"{INSTRUCTION_KEY}\n"
    "{instruction}\n"
    "\n"
    f"{INPUT_KEY}\n"
    "{context}\n"
    "\n"
    f"{RESPONSE_KEY}\n"
    "{response}\n"
    "\n"
    f"{END_KEY}\n"
)

In [7]:
# Load SQuAD v2 and keep fixed-size slices of it: the first 3000 training
# records and the first 100 validation records.
dataset = load_dataset("squad_v2")
dataset_train, dataset_test = (
    dataset["train"].select(range(3000)),
    dataset["validation"].select(range(100)),
)

In [8]:
def format_for_LLM(rec):
    """Render one question-answering record into a prompt stored in ``rec["text"]``.

    The question is prefixed with a fixed instruction sentence. The first
    entry of ``rec["answers"]["text"]`` is used as the response; when no
    answer is available a fixed "cannot answer" sentence is used instead.

    Args:
        rec: Mapping with a ``"question"`` entry, an ``"answers"`` entry whose
            ``"text"`` value is a (possibly empty) sequence of answers, and
            optionally a ``"context"`` entry.

    Returns:
        The same ``rec`` object, with the ``"text"`` key set to the prompt.
    """
    instruction = (
        "Answer the following question only with the provided input. "
        "If no answer is found tell that you cannot answer based on this context. "
        + rec["question"]
    )

    try:
        response = rec["answers"]["text"][0]
    except (KeyError, IndexError, TypeError):
        # Only "there is no answer" situations are handled here (missing key,
        # empty answer list, or a None value); any other error propagates
        # instead of being silently swallowed.
        response = "I cannot answer this question based on this context."

    context = rec.get("context")
    if context:
        rec["text"] = PROMPT_WITH_INPUT_FORMAT.format(
            instruction=instruction,
            response=response,
            context=context,
        )
    else:
        # A missing or empty context falls back to the template without an input section.
        rec["text"] = PROMPT_NO_INPUT_FORMAT.format(
            instruction=instruction,
            response=response,
        )
    return rec

In [9]:
# Apply format_for_LLM to both splits so that every record carries its prompt under "text".
dataset_train = dataset_train.map(format_for_LLM)
dataset_test = dataset_test.map(format_for_LLM)

In [10]:
def generate_and_tokenize_text(rec):
    """Tokenize the prompt held in ``rec["text"]`` and return the tokenizer output.

    The module-level ``tokenizer`` is called with padding and truncation enabled.
    """
    return tokenizer(rec["text"], padding=True, truncation=True)

In [11]:
# def generate_prompt(data_point):
#   return f"""
# <Human>: {data_point["Context"]}
# <AI>: {data_point["Response"]}
#   """.strip()

# def generate_and_tokenize_prompt(data_point):
#   full_prompt = generate_prompt(data_point)
#   tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
#   return tokenized_full_prompt
    
# dataset_name = 'Amod/mental_health_counseling_conversations'
# dataset = load_dataset(dataset_name, split="train")

# dataset = dataset.shuffle().map(generate_and_tokenize_text)

# Tokenize the prompt text of both splits with generate_and_tokenize_text.
dataset_train = dataset_train.map(generate_and_tokenize_text)
dataset_test = dataset_test.map(generate_and_tokenize_text)

In [13]:
# Length bound compared against len(rec["input_ids"]) when filtering the datasets.
max_length = 1024

In [14]:
def _is_within_max_length(rec):
    """Return True when the record has strictly fewer than ``max_length`` token ids."""
    return len(rec["input_ids"]) < max_length


# Drop over-long records from both splits: a truncated record would be
# missing the end keyword.
dataset_train = dataset_train.filter(_is_within_max_length)
dataset_test = dataset_test.filter(_is_within_max_length)

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
# Display the training split (columns and row count).
dataset_train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'text', 'input_ids', 'attention_mask'],
    num_rows: 3000
})

In [16]:
# Display the evaluation split (columns and row count).
dataset_test

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'text', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [20]:
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that restricts the loss to the response part.

    The parent collator builds the batch. Afterwards every label position up
    to and including the response key (``RESPONSE_KEY_NL``) is set to -100 so
    that the loss ignores the prompt and is computed on the completion only.
    """

    def torch_call(self, examples):
        """Collate ``examples`` and mask the prompt portion of the labels.

        Args:
            examples: Tokenized examples, passed unchanged to the parent collator.

        Returns:
            The batch produced by the parent collator, with ``batch["labels"]``
            replaced by a copy in which all positions through the end of the
            response key are -100.

        Raises:
            RuntimeError: If the response key's token sequence does not occur
                in an example.
        """
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline. Encode it
        # without special tokens so the ids can be matched inside a sequence.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL, add_special_tokens=False)
        key_len = len(response_token_ids)
        labels = batch["labels"].clone()

        for i in range(len(examples)):
            # Search the input ids rather than the labels: label positions may
            # already have been overwritten with -100 by the parent collator.
            token_ids = batch["input_ids"][i].tolist()

            # Match the COMPLETE key, not only its first token. The first
            # token on its own can also occur earlier in the prompt (for
            # example at the start of another "###" header), which would leave
            # most of the prompt unmasked.
            response_token_ids_start_idx = None
            for idx in range(len(token_ids) - key_len + 1):
                if token_ids[idx:idx + key_len] == response_token_ids:
                    response_token_ids_start_idx = idx
                    break

            if response_token_ids_start_idx is None:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["input_ids"][i]}'
                )

            # First position after the whole response key.
            response_token_ids_end_idx = response_token_ids_start_idx + key_len

            # The loss function ignores all tokens up through the end of the response key.
            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch

In [21]:
# Completion-only collator in causal-LM mode (mlm=False), returning PyTorch tensors.
data_collator = DataCollatorForCompletionOnlyLM(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [22]:
training_args = transformers.TrainingArguments(
    # Checkpointing: where checkpoints go, how often, and how many are kept.
    output_dir="./dollytest",
    save_strategy="epoch",
    save_total_limit=4,
    # Optimisation settings.
    num_train_epochs=4,
    learning_rate=2e-4,
    bf16=True,
    auto_find_batch_size=True,
    # Evaluation and logging cadence.
    do_eval=True,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # NOTE(review): logging_steps looks redundant next to
    # logging_strategy="epoch" - confirm which cadence is intended.
    logging_steps=4,
)

In [None]:
# Build the Trainer from the model, both dataset splits, the training
# arguments and the completion-only collator.
trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    args=training_args,
    data_collator=data_collator,
    # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# NOTE(review): presumably disabled because the generation cache is not
# needed while training - confirm.
model.config.use_cache = False
# Start fine-tuning.
trainer.train()

Epoch,Training Loss,Validation Loss
