# Fine tuning

<span style="font-size: 20px;"> First let's install all necessary packages </span>

In [1]:
pip install peft trl bitsandbytes

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.11.0-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading trl-0.11.0-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [3

In [None]:
pip install datasets transformers matplotlib seaborn torch 


# 1. Setup


<span style="font-size: 18px;"> Before starting the fine-tuning, let's load our dataset from huggingface. Also we are fine-tuning Meta's llama3 model from huggingface. Therefore, please visit [here](https://huggingface.co/docs/hub/en/security-tokens) to create your huggingface token. </span>



In [2]:
from datasets import load_dataset
dataset = load_dataset("Samurai1/logs")
import os
os.environ["HF_TOKEN"] = "<your_HF_token>"


from transformers import AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Adding a special token [PAD] to the tokenizer's vocabulary for padding sequences to the same length.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

# Configuring the LoRA method for parameter-efficient fine-tuning of large language models.
peft_config = LoraConfig(
    lora_alpha=15,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Configuring quantization for the model to reduce memory and compute requirements.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)



Downloading readme:   0%|          | 0.00/269 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/53.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/568 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]


# 2. Configuration

<span style="font-size: 18px;"> Now configuring the fine-tuning process for the Meta-Llama-3-8B language model.</span>

<span style="font-size: 18px;"> We are adjusting the model to support low-bit (4-bit) training efficiently by modifying internal components and enabling techniques like quantization-aware training. </span>

<span style="font-size: 18px;"> After that we define fine-tuning hyperparameters and logging, then we initialize the trainer with LoRA, enabling efficient training with minimal hardware overhead. </span>

In [3]:
# Loading the Meta-Llama-3-8B model, configured for causal language modeling.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    quantization_config=bnb_config,
    device_map={"": 0}
)

model.config.use_cache = False
model.config.pretraining_tp = 1

model = prepare_model_for_kbit_training(model)

# Configuring the fine-tuning process using the TrainingArguments class.
training_arguments = TrainingArguments(
        output_dir="./results_llama3_8B",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        evaluation_strategy="steps",
        eval_steps=50,
        logging_steps=5,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=10,
        report_to="tensorboard",
        max_steps=-1,
)

# Creating an instance of the SFTTrainer class, which simplifies fine-tuning with parameter-efficient techniques like LoRA.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/568 [00:00<?, ? examples/s]

Map:   0%|          | 0/568 [00:00<?, ? examples/s]

# 3. Launch fine tuning

In [4]:

trainer.train()
trainer.model.save_pretrained("llama3-8B-finetuned")

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
50,1.4058,1.105279
100,0.9415,0.772087
150,0.7084,0.631949
200,0.3175,0.568357
250,0.352,0.522179
300,0.302,0.477242
350,0.4881,0.455531
400,0.378,0.44365


# 4. Inference

In [3]:
import os
os.environ["HF_TOKEN"] = "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"

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

model_name = "meta-llama/Meta-Llama-3-8B"
new_model = "<path_to_your_output_checkpoints>"
device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()


tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

prompt = "Who wrote the book Innovator's Dilemma?"

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>[INST] Who wrote the book Innovator's Dilemma? [/INST]  The book "Innovator's Dilemma" was written by Clayton M. Christensen, an American economist and business consultant. The book was first published in 1997 and has since become a classic in the field of innovation and business strategy. In the book, Christensen argues that established companies often struggle to adapt to disruptive technologies and new market entrants, and he provides case studies to illustrate his theory. The book has had a significant impact on the way businesses think about innovation and has been widely read and studied by entrepreneurs, investors, and business leaders around the world.
