In [1]:
# %pip install -q -U trl peft transformers datasets bitsandbytes

In [2]:
# %pip install flash-attn --no-build-isolation

In [3]:
# from huggingface_hub import snapshot_download

# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_path=f"/home/ubuntu/dev/AI-Dojo/LLM/models/{model_id}"
# snapshot_download(model_id, local_dir=model_path, token=token)

# Imports

In [4]:
import json
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

# Load Quantized Model

In [5]:
# Hugging Face repo id of the base model; model_path is the local directory it is loaded from.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_path = "/home/ubuntu/dev/AI-Dojo/LLM/models/" + model_name

In [6]:
# Quantization settings: weights are loaded in 4-bit NF4 with double
# quantization, and bfloat16 is used as the compute dtype.
torch_dtype = torch.bfloat16

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
)

# Tokenizer: EOS doubles as the padding token, and model_max_length is set to 512.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 512

# Load the causal LM from the local path using the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch_dtype,
    quantization_config=quantization_config,
)

# Initial trainable parameters of our model.
def count_trainable_params(model):
    """Return the total number of elements across the parameters of ``model``
    whose ``requires_grad`` flag is set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the count with thousands separators.
num_params = count_trainable_params(model)
formatted_num_params = f"{num_params:,}"
print(f"Number of trainable parameters: {formatted_num_params}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Number of trainable parameters: 1,050,939,392


## Loading Data

In [7]:
# Read the JSON training file and keep its 'train' split.
dataset = load_dataset(
    'json',
    data_files=['/home/ubuntu/dev/AI-Dojo/LLM/data/chris_train.json'],
)['train']
dataset

Dataset({
    features: ['instruction', 'output'],
    num_rows: 128
})

In [8]:
def to_messages(sample):
    """Add chat-formatted fields to one dataset row.

    Builds a three-turn conversation (empty system prompt, the row's
    ``instruction`` as the human turn, its ``output`` as the assistant turn),
    stores it under ``'messages'``, and stores the conversation rendered to a
    string by the tokenizer's chat template under ``'text'``.

    Returns the same ``sample`` with both fields added.
    """
    conversation = [
        {"role": "system", "content": ""},
        {"role": "human", "content": sample['instruction']},
        {"role": "assistant", "content": sample['output']},
    ]
    sample['messages'] = conversation
    # tokenize=False: keep the rendered prompt as text rather than token ids.
    sample['text'] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return sample
# Apply the chat formatting to every row and drop the original columns.
messages_dataset = dataset.map(to_messages, remove_columns=dataset.column_names)
messages_dataset

Dataset({
    features: ['messages', 'text'],
    num_rows: 128
})

## Quick Model Test

In [9]:
def generate_answer(human_message, model=model, max_new_tokens=256, temperature=0.1, top_p=0.9):
    """Sample a reply to a single message and return it as plain text.

    The message is wrapped in the same system/human layout that ``to_messages``
    uses for the training data, rendered with the tokenizer's chat template,
    and passed to ``model.generate``.

    Args:
        human_message: text of the human turn.
        model: model to generate with. The default is the notebook-level
            ``model`` object as it exists when this cell is executed.
        max_new_tokens: upper bound on the number of generated tokens.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped).
    """
    messages = [{"role": "system", "content": ""}, {"role": "human", "content": human_message}]

    # return_dict=True returns the attention mask together with the input ids.
    # The mask is passed explicitly because the pad token is the EOS token here,
    # so generate() cannot infer it from the ids and warns otherwise.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    input_ids = inputs["input_ids"]

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_p=top_p,
    )
    # Slice off the prompt so only the newly generated tokens are decoded.
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [10]:
# Spot check: ask the model the first instruction in the dataset.
human_message = dataset[0]["instruction"]
print(human_message)
model_output = generate_answer(human_message)
model_output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What is your name?


"I don't have a personal name. I'm an AI designed to assist and communicate with users, and I don't have a personal identity. I exist solely to provide information, answer questions, and help with tasks to the best of my abilities. I'm here to help you with any questions or topics you'd like to discuss!"

## Pre tuning evaluation

In [11]:
def add_model_output(sample, answer_fn=None):
    """Attach the model's answer to this row's own instruction.

    Args:
        sample: dataset row containing an ``'instruction'`` field.
        answer_fn: optional callable invoked as ``answer_fn(human_message=...)``
            that returns the answer text. Defaults to ``generate_answer``.

    Returns:
        The same ``sample`` with the answer stored under ``'model_output'``.
    """
    if answer_fn is None:
        answer_fn = generate_answer
    # Query with the row's instruction rather than the notebook-level
    # `human_message`, so every row is evaluated against its own prompt.
    sample['model_output'] = answer_fn(human_message=sample['instruction'])
    return sample

# Collect a model answer for every row, then save the rows as indented JSON.
pre_tuning_eval_data = dataset.map(add_model_output)
with open("./data/llama3-8b/pre-tuning.json", "w") as out_file:
    out_file.write(json.dumps(pre_tuning_eval_data.to_list(), indent=4))




Map:   0%|          | 0/128 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

## Configure LoRA Training

In [12]:
# based on config
# LoRA adapter settings; adapters are attached to the four attention projections.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [13]:
# Run name and the directory that receives trainer output.
run_name = "llama3-8b-instruct-chris"
model_dir = "/home/ubuntu/dev/AI-Dojo/LLM/models/tuned"

# Training hyperparameters (based on config), grouped by concern.
training_args = TrainingArguments(
    # -- run bookkeeping --
    run_name=run_name,
    output_dir=model_dir,
    overwrite_output_dir=True,
    save_total_limit=None,
    seed=42,
    # -- precision: use bf16 on GPUs that support it, otherwise switch to fp16 --
    bf16=True,
    fp16=False,
    # -- schedule --
    num_train_epochs=3,
    max_steps=-1,
    learning_rate=2.0e-05,
    lr_scheduler_type="cosine",
    # -- batching --
    per_device_train_batch_size=1,  # originally set to 8
    per_device_eval_batch_size=1,  # originally set to 8
    gradient_accumulation_steps=4,
    # -- logging --
    log_level="info",
    logging_strategy="steps",
    logging_steps=5,
    # -- evaluation (currently disabled) --
    # do_eval=True,
    # evaluation_strategy="epoch",
)

## Run Training

In [14]:
# Build the trainer from the model, the LoRA config and the chat-formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_args,
    train_dataset=messages_dataset,
    dataset_text_field="text",
    max_seq_length=tokenizer.model_max_length,
)

# Release cached GPU memory left over from an earlier unsuccessful run.
torch.cuda.empty_cache()

train_result = trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend
***** Running training *****
  Num examples = 128
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 96
  Number of trainable parameters = 54,525,952
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
5,6.4318
10,5.6557
15,4.7833
20,4.4882
25,3.8207
30,3.7811
35,3.2042
40,2.9426
45,2.4529
50,2.4259


Saving model checkpoint to /home/ubuntu/dev/AI-Dojo/LLM/models/tuned/checkpoint-96
tokenizer config file saved in /home/ubuntu/dev/AI-Dojo/LLM/models/tuned/checkpoint-96/tokenizer_config.json
Special tokens file saved in /home/ubuntu/dev/AI-Dojo/LLM/models/tuned/checkpoint-96/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [15]:
# Save the trained model into a sub-directory named after the run.
trainer.save_model(f"{model_dir}/{run_name}")

Saving model checkpoint to /home/ubuntu/dev/AI-Dojo/LLM/models/tuned/llama3-8b-instruct-chris
tokenizer config file saved in /home/ubuntu/dev/AI-Dojo/LLM/models/tuned/llama3-8b-instruct-chris/tokenizer_config.json
Special tokens file saved in /home/ubuntu/dev/AI-Dojo/LLM/models/tuned/llama3-8b-instruct-chris/special_tokens_map.json


## Post Tuning Evaluation

In [16]:
# Repeat the spot check after training, this time under CUDA autocast.
human_message = dataset[0]["instruction"]
print(human_message)
with torch.cuda.amp.autocast():
    model_output = generate_answer(human_message)
model_output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is your name?


'My name is Chris.'

In [17]:
def add_model_output(sample, answer_fn=None):
    """Attach the model's answer to this row's own instruction, generated under
    CUDA autocast.

    Args:
        sample: dataset row containing an ``'instruction'`` field.
        answer_fn: optional callable invoked as ``answer_fn(human_message=...)``
            that returns the answer text. Defaults to ``generate_answer``.

    Returns:
        The same ``sample`` with the answer stored under ``'model_output'``.
    """
    if answer_fn is None:
        answer_fn = generate_answer
    with torch.cuda.amp.autocast():
        # Query with the row's instruction rather than the notebook-level
        # `human_message`, so every row is evaluated against its own prompt.
        model_output = answer_fn(human_message=sample['instruction'])
    sample['model_output'] = model_output
    return sample

# Collect a model answer for every row after training, then save the rows as indented JSON.
post_tuning_eval_data = dataset.map(add_model_output)
with open("./data/llama3-8b/post-tuning.json", "w") as out_file:
    out_file.write(json.dumps(post_tuning_eval_data.to_list(), indent=4))

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

In [20]:
# Prompt unrelated to the training data (presumably a check that general
# instruction-following still works after tuning).
with torch.cuda.amp.autocast():
    answer = generate_answer(human_message="Write a recipe for cooking sushi")
print(answer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**Homemade Sushi Recipe**

 Servings: 8-10 pieces

**Ingredients:**

For the sushi rice:

* 1 cup Japanese short-grain rice (such as Koshihikari or Akita Komachi)
* 1 3/4 cups water
* 1/4 cup rice vinegar
* 1/4 cup sugar
* 1/2 teaspoon salt

For the fillings:

* 1/2 avocado, sliced
* 1/2 cucumber, sliced
* 1/4 carrot, peeled and grated
* 1/4 crab stick, sliced
* 1/4 salmon sashimi, sliced (optional)
* 1/4 pickled ginger, sliced
* Sesame seeds and soy sauce for garnish (optional)

**Instructions:**

**Step 1: Prepare the Sushi Rice**

1. Rinse the rice thoroughly and cook it according to the package instructions using 1 3/4 cups of water.
2. In a large saucepan, combine the cooked rice, rice vinegar, sugar, and salt.
3. Heat the mixture over low heat, stirring constantly, until the sugar and salt have dissolved.
4. Remove the saucepan from the heat
