<a href="https://colab.research.google.com/github/Nobobi-Hasan/FND-Llama/blob/main/FND_Llama_01_Fine_Tuning_1_00_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# --- 1: Mount Google Drive ---

In [1]:
# to save the trained model permanently
from google.colab import drive
drive.mount('/content/drive')

# Create the project directory if it doesn't exist
!mkdir -p /content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00

Mounted at /content/drive


# --- 2: Install Dependencies ---

In [2]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets

In [3]:
!pip install -q accelerate peft bitsandbytes transformers trl datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# --- 3: Hugging Face Login ---

In [4]:
from huggingface_hub import login
login()

# --- 4: Imports ---

In [5]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# --- 5: Model & Dataset Parameters ---

In [6]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"

dataset_name = "liar"

# Fine-tuned model name (for temporary adapter storage)
new_model_adapter = "Llama-3.2-3B-fake-news-adapter"

# Permanent model path in Google Drive
output_model_path = "/content/drive/MyDrive/my_research_project/Llama-3.2-3B-fake-news-classifier-merged"

In [7]:
# QLoRA parameters
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

In [8]:
# bitsandbytes parameters
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

In [9]:
# TrainingArguments parameters
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 1 # Reduced for T4 stability
per_device_eval_batch_size = 2
gradient_accumulation_steps = 4 # Increased to compensate for batch size
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25


In [10]:
# SFT parameters
max_seq_length = 256
packing = False
device_map = {"": 0}

# --- 6: Data Preprocessing ---

In [11]:

# # Format the LIAR dataset into an instruction format for SFT
# def format_instruction(sample):
#     # Simplify to Real (True, Mostly-true) vs. Fake (all others)
#     label = "Real" if sample['label'] in [2, 3] else "Fake"

#     # Use the Llama 3.2 Instruct chat template
#     prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nClassify the truthfulness of the following statement: \"{sample['statement']}\"\n\nRespond with only one word: Real or Fake.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{label}<|eot_id|>"
#     return {"text": prompt}

# # Load dataset
# dataset = load_dataset(dataset_name, split="train", trust_remote_code=True)

# # Apply formatting
# dataset = dataset.map(format_instruction, remove_columns=list(dataset.features))

In [12]:
from datasets import load_dataset

# Define the formatting function (same as before)
def format_instruction(sample):
    # Simplify to Real (true, mostly-true) vs. Fake (all others)
    # This assumes 'label' column is numeric (0-5)
    # label = "Real" if sample['label'] in [2, 3] else "Fake"
    label = "Real" if sample['label'] in ['true', 'mostly-true'] else "Fake"

    # Use the Llama 3.2 Instruct chat template
    prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nClassify the truthfulness of the following statement: \"{sample['statement']}\"\n\nRespond with only one word: Real or Fake.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{label}<|eot_id|>"
    return {"text": prompt}

data_path = "/content/drive/MyDrive/ML-Datasets/liar_dataset/train.tsv"

# column_names = [
#     'id',
#     'label',
#     'statement',
#     'subject',
#     'speaker',
#     'job_title',
#     'state_info',
#     'party_affiliation',
#     'barely_true_counts',
#     'false_counts',
#     'half_true_counts',
#     'mostly_true_counts',
#     'pants_on_fire_counts',
#     'context'
# ]

# 3. Load the dataset
raw_dataset = load_dataset(
    "csv",
    data_files={"train": data_path},
    delimiter="\t",
    # column_names=column_names,
    split="train"  # We select the 'train' split
)

# 4. Apply formatting (same as before)
dataset = raw_dataset.map(format_instruction, remove_columns=list(raw_dataset.features))

print("\nSuccessfully loaded and formatted dataset from Google Drive.")
# print(f"First sample:\n{raw_dataset[0]}")
# print(f"First sample:\n{dataset[0]['text']}")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


Successfully loaded and formatted dataset from Google Drive.


In [13]:
raw_dataset[10]

{'id': '7115.json',
 'label': 'mostly-true',
 'statement': 'For the first time in history, the share of the national popular vote margin is smaller than the Latino vote margin.',
 'subject': 'elections',
 'speaker': 'robert-menendez',
 "speaker's_job_title": 'U.S. Senator',
 'state_info': 'New Jersey',
 'party_affiliation': 'democrat',
 'barely_true_counts': 1.0,
 'false_counts': 3.0,
 'half_true_counts': 1.0,
 'mostly_true_counts': 3.0,
 'pants_on_fire_counts': 0.0,
 'context': 'a speech'}

In [14]:
dataset[11]['text']

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nClassify the truthfulness of the following statement: "Since 2000, nearly 12 million Americans have slipped out of the middle class and into poverty."\n\nRespond with only one word: Real or Fake.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nFake<|eot_id|>'

# --- 7: Load Tokenizer and Model ---

In [15]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [16]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True
)
model.config.use_cache = False
model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [17]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

# --- 8: Load LoRA and Training Args ---

In [18]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# --- 9: Set up Trainer ---

In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    # tokenizer=tokenizer,
    processing_class=tokenizer,
    args=training_arguments,
    # packing=packing,
)

Adding EOS to train dataset:   0%|          | 0/10240 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10240 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10240 [00:00<?, ? examples/s]

# --- 10: Train Model ---

In [20]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
25,3.9138
50,2.8037
75,1.8871
100,1.5159
125,1.7364
150,1.4527
175,1.7255
200,1.4565
225,1.7495
250,1.4644


TrainOutput(global_step=2560, training_loss=1.2642575670033693, metrics={'train_runtime': 2700.3243, 'train_samples_per_second': 3.792, 'train_steps_per_second': 0.948, 'total_flos': 9854799923865600.0, 'train_loss': 1.2642575670033693, 'entropy': 1.135284850001335, 'num_tokens': 578925.0, 'mean_token_accuracy': 0.752059605717659, 'epoch': 1.0})

# --- 11: Save Checkpoint After Epoch 1 ---

In [25]:
print("Saving checkpoint...")

# Define a path in Google Drive for this checkpoint
checkpoint_path = "/content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00/checkpoint-epoch-1"

# Save the model adapter (LoRA weights)
trainer.save_model(checkpoint_path)

print(f"Checkpoint for Epoch 1 saved to: {checkpoint_path}")

Saving checkpoint...
Checkpoint for Epoch 1 saved to: /content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00/checkpoint-epoch-1


In [29]:
trainer.save_state()
print(f"Trainer state saved to {trainer.args.output_dir}")

Trainer state saved to ./results


# --- 10 (Again): Resume Training for Epoch 2 ---

In [28]:
# The path to the checkpoint
checkpoint_path = "/content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00/checkpoint-epoch-1"

print(f"Resuming training from: {checkpoint_path}")

# This loads the state from checkpoint and continues training.
# It will run for one more epoch, as defined in your TrainingArguments.
trainer.train(resume_from_checkpoint=checkpoint_path)

print("Epoch 2 training complete.")

Resuming training from: /content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00/checkpoint-epoch-1


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00/checkpoint-epoch-1/trainer_state.json'