In [1]:
# Create Instruct Pipeline
import logging
import re

import numpy as np
from transformers import Pipeline, PreTrainedTokenizer

logger = logging.getLogger(__name__)

INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# This is the prompt that is used for generating responses using an already trained model.  It ends with the response
# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).
PROMPT_FOR_GENERATION_FORMAT = """{intro}
{instruction_key}
{instruction}
{response_key}
""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
)


def get_special_token_id(tokenizer: PreTrainedTokenizer, key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.
    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token
    Raises:
        RuntimeError: if more than one ID was generated
    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    if len(token_ids) > 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]


class InstructionTextGenerationPipeline(Pipeline):
    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self, return_instruction_text=False, **generate_kwargs):
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
            except ValueError:
                pass

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id,
            "return_instruction_text": return_instruction_text,
        }

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        print(prompt_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)
        #print(model)
        generated_sequence = self.model.generate(
            input_ids=input_ids.to(self.model.device),
            attention_mask=attention_mask,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )[0].cpu()
        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_instruction_text):
        sequence = model_outputs["generated_sequence"]
        instruction_text = model_outputs["instruction_text"]

        # The response will be set to this variable if we can identify it.
        decoded = None

        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
        if response_key_token_id and end_key_token_id:
            # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
            # prompt, we should definitely find it.  We will return the tokens found after this token.
            response_pos = None
            response_positions = np.where(sequence == response_key_token_id)[0]
            if len(response_positions) == 0:
                logger.warn(f"Could not find response key {response_key_token_id} in: {sequence}")
            else:
                response_pos = response_positions[0]

            if response_pos:
                # Next find where "### End" is located.  The model has been trained to end its responses with this
                # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                # this token, as the response could be truncated.  If we don't find it then just return everything
                # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                end_pos = None
                end_positions = np.where(sequence == end_key_token_id)[0]
                if len(end_positions) > 0:
                    end_pos = end_positions[0]

                decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
        else:
            # Otherwise we'll decode everything and use a regex to find the response and end.

            fully_decoded = self.tokenizer.decode(sequence)

            # The response appears after "### Response:".  The model has been trained to append "### End" at the
            # end.
            m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

            if m:
                decoded = m.group(1).strip()
            else:
                # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                # return everything after "### Response:".
                m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                if m:
                    decoded = m.group(1).strip()
                else:
                    logger.warn(f"Failed to find response in:\n{fully_decoded}")

        if return_instruction_text:
            return {"instruction_text": instruction_text, "generated_text": decoded}

        return decoded

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")

model = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b", 
    device_map="auto",
    torch_dtype=torch.float16
)
orig_generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

In [13]:
orig_generate_text("""
Answer the question: 'my cpu has been running at 100% for more than an hour now. what can be causing this? and how do i fix it?' base on the input context. 
if there is no clear information provided, answer truthfully or provide a sound speculation.

Context: Techie2 - Tuesday, January 3, 2023 - link The issue isn't more power for more performance. 
It's how much more power and if the power consumption figures are even remotely close to reality. 
Intel has been using misleading TDP and other CPU power consumption metrics for years. 
They define TDP, etc. by their needs of the week. They are unscrupulous and have been caught many times misrepresenting the power consumption and issues with their CPUs.
""")

Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:

Answer the question: 'my cpu has been running at 100% for more than an hour now. what can be causing this? and how do i fix it?' base on the input context. 
if there is no clear information provided, answer truthfully or provide a sound speculation.

Context: Techie2 - Tuesday, January 3, 2023 - link The issue isn't more power for more performance. 
It's how much more power and if the power consumption figures are even remotely close to reality. 
Intel has been using misleading TDP and other CPU power consumption metrics for years. 
They define TDP, etc. by their needs of the week. They are unscrupulous and have been caught many times misrepresenting the power consumption and issues with their CPUs.

### Response:



"If the cpu usage is above 100% for over an hour, you can assume there is something either eating up a lot of cpu time or doing lots of I/O which could potentially use more resources to be performed. \nFirst check the top processes to find out what is using the most cpu time and/or the biggest share of the available I/O. If that doesn't give you any clues then it could be possible that the CPU is just struggling to keep up with the load being put on it. You could try increasing the cpufreq governor from the settings to let it use more CPU power but that would also affect battery life, though maybe a acceptable tradeoff."

In [7]:
from datasets import load_dataset

data = load_dataset("json", data_files="alpaca_data_gpt4.json")

def generate_prompt(data_point):
    # taken from https://github.com/tloen/alpaca-lora
    if data_point["instruction"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""


data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})

Found cached dataset json (/home/hin_wong/.cache/huggingface/datasets/json/default-d07b3c1507bb7157/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/hin_wong/.cache/huggingface/datasets/json/default-d07b3c1507bb7157/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-754fbee788529aa5.arrow


In [10]:
print(
    data['train'][0]['instruction'],
    data['train'][0]['input'],
    data['train'][0]['output']
)

Give three tips for staying healthy.  1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.


In [3]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM

from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/textgen/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /opt/conda/envs/textgen/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


In [4]:
MICRO_BATCH_SIZE = 4  # change to 4 for 3090
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 1 # paper uses 3
LEARNING_RATE = 3e-4
CUTOFF_LEN = 256  
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

In [5]:
model = prepare_model_for_int8_training(
    model, 
    use_gradient_checkpointing=True
)

In [8]:
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

data = load_dataset("json", data_files="alpaca_data_gpt4.json")

data = data.shuffle().map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )
)

Found cached dataset json (/home/hin_wong/.cache/huggingface/datasets/json/default-d07b3c1507bb7157/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [9]:
print("Tokenized Data:", data['train'][0]['input_ids'])

Tokenized Data: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30003, 310, 271, 9775, 326, 8631, 247, 4836, 13, 18433, 342, 271, 3280, 326, 3400, 2007, 3634, 15, 19566, 247, 2380, 326, 20420, 29141, 253, 2748, 15, 535, 50278, 187, 5850, 253, 30341, 273, 247, 9096, 604, 697, 9941, 310, 495, 7892, 15, 187, 187, 4118, 19832, 27, 2756, 50279, 187, 510, 30341, 273, 247, 9096, 310, 5118, 970, 253, 7212, 330, 426, 374, 4134, 83, 1157, 835, 391, 310, 253, 9941, 273, 253, 9096, 13, 285, 8095, 310, 5512, 4503, 281, 495, 15, 1047, 15, 187, 187, 15768, 247, 9941, 273, 495, 7892, 13, 253, 30341, 273, 253, 9096, 476, 320, 5118, 347, 3637, 27, 187, 187, 36, 426, 374, 4134, 83, 187, 36, 426, 374, 6806, 495, 15, 1047, 6806, 495, 187, 36, 426

In [10]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=1,
        output_dir="lora-dolly",
        save_total_limit=3,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)

# This is your trained LoRA
model.save_pretrained("alpaca-lora-dolly-test-ultimate")

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.6408
2,2.6374
3,2.7288
4,2.5866
5,2.5184
6,2.5334
7,2.6132
8,2.5989
9,2.6855
10,2.5785


In [11]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")

device = "cuda"
model = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    device_map="auto",
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(
    model,
    "alpaca-lora-dolly-test-ultimate",
    torch_dtype=torch.float16,
)
model.half()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50280, 2560)
        (layers): ModuleList(
          (0-31): 32 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (attention): GPTNeoXAttention(
              (rotary_emb): RotaryEmbedding()
              (query_key_value): Linear(
                in_features=2560, out_features=7680, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=7680, bias=False)
                )
    

In [12]:
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
generate_text("""
Answer the question: 'my cpu has been running at 100% for more than an hour now. what can be causing this? and how do i fix it?' base on the input context. 
if there is no clear information provided, answer truthfully or provide a sound speculation.

Context: Techie2 - Tuesday, January 3, 2023 - link The issue isn't more power for more performance. 
It's how much more power and if the power consumption figures are even remotely close to reality. 
Intel has been using misleading TDP and other CPU power consumption metrics for years. 
They define TDP, etc. by their needs of the week. They are unscrupulous and have been caught many times misrepresenting the power consumption and issues with their CPUs.
""")

Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:

Answer the question: 'my cpu has been running at 100% for more than an hour now. what can be causing this? and how do i fix it?' base on the input context. 
if there is no clear information provided, answer truthfully or provide a sound speculation.

Context: Techie2 - Tuesday, January 3, 2023 - link The issue isn't more power for more performance. 
It's how much more power and if the power consumption figures are even remotely close to reality. 
Intel has been using misleading TDP and other CPU power consumption metrics for years. 
They define TDP, etc. by their needs of the week. They are unscrupulous and have been caught many times misrepresenting the power consumption and issues with their CPUs.

### Response:



"Sure, it is quite common to see CPU usage go up to 100% after hours, but I don't think you should take it personally. Check the System Log to see if there is a CPU/HDD/Network issue. Also, if your CPU usage is constantly at 100%, you might consider buying a better motherboard and CPU to alleviate the issue."

In [20]:
generate_text("Answer the question from Morty as Rick\n### Input:Rick says 'good' and 'bad' are 2 flavor of ice cream.\n")

Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Answer the question from Morty as Rick
### Input:Rick says 'good' and 'bad' are 2 flavor of ice cream.

### Response:



'No, Morty is the god of ice cream. Rick is just a regular person.'

In [21]:
generate_text("Answer the question from Morty as Rick\n### Input:Rick says 'good' and 'bad' are 2 flavors of ice cream.\n")

Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Answer the question from Morty as Rick
### Input:Rick says 'good' and 'bad' are 2 flavors of ice cream.

### Response:



"'good' and 'bad' are 2 flavors of ice cream are typically considered bad because they are based on the negative connotation of being harmful or harmful to health.\n\n'good' and 'bad' are not 2 flavors of ice cream, they are 2 words that refer to two opposites.  'good' and 'bad' can be used to describe 2 different flavors of ice cream because 'good' can be used to describe a flavor and 'bad' can be used to describe a flavor."