In [1]:
%%capture
!pip install --upgrade torch  torchvision transformers datasets accelerate bitsandbytes peft trl
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from peft import PeftModel
from trl import SFTTrainer
from transformers import TrainingArguments
from textwrap import dedent
from datasets import Dataset, load_dataset
import torch
max_seq_length = 2048 
dtype = None 
load_in_4bit = True 

new_model="fine_tuned_llama3_1"

COLOR_ANSWER = '\033[92m'  # Green text for original answers
COLOR_PREDICTION = '\033[93m'  # Yellow text for generated answers
COLOR_RESET = '\033[0m'  # Reset to default color

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


<span style="color:blue; font-size: 24px;">Llama 🦙🦙🦙.1</span> <span style="font-size: 24px;"></span>

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/1",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
   
)

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth: We successfully patched the tokenizer to add a {% if add_generation_prompt %} to the chat_template.
This is not a bug, but please notify the Unsloth maintainers - thanks!
/kaggle/input/llama-3.1/transformers/8b-instruct/1 does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.


### 📝 Get the Dataset Ready

In [4]:
alpaca_prompt = """Below is a question-answering task based on a document title. Provide the best possible answer to the given question based on the document.
### document_title:
{}

### question:
{}

### answer:
{}"""

EOS_TOKEN = tokenizer.eos_token 
def formatting_prompts_func(examples):
    document_title = examples["document_title"]
    question  = examples["question"]
    answer = examples["answer"]
    texts = []
    for document_title, input, answer in zip(document_title,question , answer):
        text = alpaca_prompt.format(document_title, input, answer) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass


In [5]:
dataset = load_dataset("wiki_qa") 
train_dataset = dataset["train"] 
test_dataset = dataset["test"]
val_dataset= dataset["validation"]

train_dataset = dataset.map(formatting_prompts_func, batched = True,)
val_dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/594k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/264k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2733 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20360 [00:00<?, ? examples/s]

Map:   0%|          | 0/6165 [00:00<?, ? examples/s]

Map:   0%|          | 0/2733 [00:00<?, ? examples/s]

Map:   0%|          | 0/20360 [00:00<?, ? examples/s]

In [6]:
train_sample = train_dataset["train"].shuffle(seed=42).select(range(20000))
validation_sample = val_dataset["validation"].shuffle(seed=42).select(range(1000))

## 📊Test the Original Model

In [7]:
FastLanguageModel.for_inference(model)

def unsloth_pipeline(model, tokenizer, input_text, max_new_tokens=128):
  
    tokens = tokenizer.encode(input_text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
    
    generated_tokens = model.generate(tokens, max_length=max_new_tokens)
    
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    
    return generated_text

example_indices = [0, 10, 20]    # Explicitly select three different examples by index 


for i in example_indices:
    row = dataset["test"][i]
    question = row['question']
    document_title = row.get('document_title', '')  
    original_answer = row['answer']  
    input_text = f"Question: {question}\nDocument title: {document_title}\nAnswer:"
    predicted_answer = unsloth_pipeline(model, tokenizer, input_text)

  
    print(f"Question: {question}")
    print(f"Document title: {document_title}")
    print(f"Original Answer:   {COLOR_ANSWER}{original_answer}{COLOR_RESET}")
    print(f"Generated Answer: {COLOR_PREDICTION}{predicted_answer}{COLOR_RESET}")
    print("-" * 100)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Question: HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US
Document title: African immigration to the United States
Original Answer:   [92mAfrican immigration to the United States refers to immigrants to the United States who are or were nationals of Africa .[0m
Generated Answer: [93mQuestion: HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US
Document title: African immigration to the United States
Answer: African Americans were not immigrants, but rather descendants of enslaved Africans who were forcibly brought to the United States. They were brought to the US as part of the transatlantic slave trade, which began in the 16th century and continued until the 19th century. The transatlantic slave trade was a system in which enslaved Africans were forcibly taken from their homes in Africa, transported across the Atlantic Ocean, and sold into slavery in the Americas. Many African Americans are descended[0m
----------------------------------------------------------------------------------

## 🛠️ **Train** & 🏦 **Save**

In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0.05, 
    bias = "none", 
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
    use_rslora = False, 
    loftq_config = None,    # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.9.post3 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [9]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=train_sample,
    eval_dataset=validation_sample,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,

        evaluation_strategy="steps",
        eval_steps=50,
        save_steps=50,

        warmup_steps =5,
        max_steps = 500,
        learning_rate =1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)



Map (num_proc=2):   0%|          | 0/20000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [10]:
 trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 500
 "-____-"     Number of trainable parameters = 83,886,080
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
50,1.005,0.991813
100,0.9612,0.965432
150,0.9559,0.958763
200,0.9869,0.954815
250,0.9284,0.954386
300,0.9501,0.954671
350,0.9306,0.950544
400,0.9245,0.951042
450,0.8913,0.953859
500,0.8874,0.952591


TrainOutput(global_step=500, training_loss=0.9838851509094239, metrics={'train_runtime': 3693.0973, 'train_samples_per_second': 1.083, 'train_steps_per_second': 0.135, 'total_flos': 1.6703002820395008e+16, 'train_loss': 0.9838851509094239, 'epoch': 0.2})

In [11]:
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('fine_tuned_llama3_1/tokenizer_config.json',
 'fine_tuned_llama3_1/special_tokens_map.json',
 'fine_tuned_llama3_1/tokenizer.json')

##   🔍 Inference

In [12]:
model = PeftModel.from_pretrained(model,new_model)

model.eval() 

def unsloth_pipeline(model, tokenizer, input_text, max_new_tokens=128):
  
    tokens = tokenizer.encode(input_text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
    
    generated_tokens = model.generate(tokens, max_length=max_new_tokens)
    
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    
    return generated_text

example_indices = [0, 10, 20]  


for i in example_indices:
    row = dataset["test"][i]
    question = row['question']
    document_title = row.get('document_title', '')  
    original_answer = row['answer']  
    input_text = f"Question: {question}\nDocument title: {document_title}\nAnswer:"
    predicted_answer = unsloth_pipeline(model, tokenizer, input_text)

  
    print(f"Question: {question}")
    print(f"Document title: {document_title}")
    print(f"Original Answer:   {COLOR_ANSWER}{original_answer}{COLOR_RESET}")
    print(f"Generated Answer: {COLOR_PREDICTION}{predicted_answer}{COLOR_RESET}")
    print("-" * 100)  

Question: HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US
Document title: African immigration to the United States
Original Answer:   [92mAfrican immigration to the United States refers to immigrants to the United States who are or were nationals of Africa .[0m
Generated Answer: [93mQuestion: HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US
Document title: African immigration to the United States
Answer: African Americans were primarily immigrated to the United States through the transatlantic slave trade, which forcibly brought millions of enslaved Africans to the Americas between the 16th and 19th centuries. Additionally, many free African immigrants came to the US as laborers, sailors, and students from the late 18th century to the early 20th century. The 1965 Immigration and Nationality Act also led to an increase in African immigration to the US. Today,[0m
----------------------------------------------------------------------------------------------------
Question: how large we