<a href="https://colab.research.google.com/github/Nobobi-Hasan/FND-Llama/blob/main/FND_Llama_01_Fine_Tuning_1_00_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# --- 1: Mount Google Drive ---

In [1]:
# Mount Google Drive at /content/drive so that files written there outlive the
# Colab runtime (used to save the trained model permanently).
from google.colab import drive
drive.mount('/content/drive')

# Create the project directory if it doesn't exist.
# `-p` also creates missing parent folders and is a no-op when the directory
# is already present, so this cell can be re-run safely.
!mkdir -p /content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00

Mounted at /content/drive


# --- 2: Install Dependencies ---

In [2]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets

In [3]:
!pip install -q accelerate peft bitsandbytes transformers trl datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# --- 3: Hugging Face Login ---

In [4]:
# Authenticate with the Hugging Face Hub. Called without arguments, login()
# asks for an access token interactively (a widget in Colab).
# NOTE(review): the base checkpoint is presumably gated and therefore needs a
# token whose account has been granted access — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# --- 4: Imports ---

In [5]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# --- 5: Model & Dataset Parameters ---

In [6]:
# Hub id of the base checkpoint that gets fine-tuned.
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Dataset identifier. In the cells visible here it is only referenced by the
# disabled Hub-loading code; the active loader reads a TSV file from Drive.
dataset_name = "liar"

# Fine-tuned model name (for temporary adapter storage)
new_model_adapter = "Llama-3.2-3B-fake-news-adapter"

# Drive project directory. This is the same folder that section 1 creates with
# `mkdir -p`, so everything the notebook persists ends up in one place.
project_dir = "/content/drive/MyDrive/ML-Models/FND_Llama_01_Fine_Tuning_1.00"

# Permanent model path in Google Drive. Rooted in project_dir so the merged
# model is written into the directory prepared in section 1 instead of an
# unrelated folder that nothing else in the notebook creates or uses.
output_model_path = f"{project_dir}/Llama-3.2-3B-fake-news-classifier-merged"

In [7]:
# QLoRA parameters
# NOTE(review): presumably forwarded to peft.LoraConfig (r, lora_alpha,
# lora_dropout) by a later cell that is not part of this view — confirm.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

In [8]:
# bitsandbytes parameters
# NOTE(review): presumably consumed by BitsAndBytesConfig in a later cell — confirm.
use_4bit = True
# Stored as a string, so the consuming cell has to resolve it to a torch dtype
# itself (e.g. via getattr(torch, ...)) — TODO confirm.
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

In [9]:
# TrainingArguments parameters
# NOTE(review): presumably forwarded to transformers.TrainingArguments by a
# later cell that is not part of this view — confirm.
output_dir = "./results"
num_train_epochs = 1
# Both mixed-precision switches are off.
fp16 = False
bf16 = False
per_device_train_batch_size = 2 # Reduced for T4 stability
per_device_eval_batch_size = 2
# With a per-device batch of 2 this gives 2 x 2 = 4 samples per optimizer
# update, assuming standard gradient-accumulation semantics.
gradient_accumulation_steps = 2 # Increased to compensate for batch size
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
# NOTE(review): -1 presumably leaves the run length to num_train_epochs — confirm.
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
# NOTE(review): 0 presumably disables periodic checkpointing — confirm against
# the installed transformers version.
save_steps = 0
logging_steps = 25


In [10]:
# SFT parameters
# NOTE(review): presumably passed to SFTTrainer / the model loader by a later
# cell that is not part of this view — confirm.
max_seq_length = None
packing = False
# Maps the empty module prefix to device index 0.
# NOTE(review): presumably places the whole model on the first GPU — confirm.
device_map = {"": 0}

# --- 6: Data Preprocessing ---

In [11]:

# # Format the LIAR dataset into an instruction format for SFT
# def format_instruction(sample):
#     # Simplify to Real (True, Mostly-true) vs. Fake (all others)
#     label = "Real" if sample['label'] in [2, 3] else "Fake"

#     # Use the Llama 3.2 Instruct chat template
#     prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nClassify the truthfulness of the following statement: \"{sample['statement']}\"\n\nRespond with only one word: Real or Fake.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{label}<|eot_id|>"
#     return {"text": prompt}

# # Load dataset
# dataset = load_dataset(dataset_name, split="train", trust_remote_code=True)

# # Apply formatting
# dataset = dataset.map(format_instruction, remove_columns=list(dataset.features))

In [42]:
from datasets import load_dataset

# Define the formatting function (same as before)
def format_instruction(sample):
    """Turn one LIAR row into a single-turn Llama 3.2 chat training example.

    The fine-grained verdict is collapsed to a binary target: "Real" for
    'true' and 'mostly-true', "Fake" for every other value (including a
    missing label).

    Args:
        sample: Mapping with a 'label' entry (verdict string such as
            'mostly-true') and a 'statement' entry (the claim text).

    Returns:
        dict with the single key "text": the user prompt followed by the
        expected one-word answer, wrapped in Llama 3.2 Instruct special tokens.
    """
    # Labels are strings (e.g. 'mostly-true'), not numeric class ids. Normalise
    # case and surrounding whitespace before comparing, so a cosmetic difference
    # in the TSV cannot silently turn a true statement into a "Fake" target.
    verdict = str(sample['label']).strip().lower()
    label = "Real" if verdict in ('true', 'mostly-true') else "Fake"

    # Use the Llama 3.2 Instruct chat template
    # NOTE(review): the text already begins with <|begin_of_text|>. If the
    # trainer's tokenizer also prepends a BOS token, every example would carry
    # two of them — confirm the add_special_tokens setting used downstream.
    prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nClassify the truthfulness of the following statement: \"{sample['statement']}\"\n\nRespond with only one word: Real or Fake.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{label}<|eot_id|>"
    return {"text": prompt}

# Location of the LIAR training split (tab-separated) on the mounted Drive.
data_path = "/content/drive/MyDrive/ML-Datasets/liar_dataset/train.tsv"

# Read the TSV through the generic "csv" builder. No explicit column names are
# passed, so the column names come from the file's own header row.
raw_dataset = load_dataset(
    "csv",
    data_files={"train": data_path},
    split="train",
    delimiter="\t",
)

# Build the SFT text for every row and drop all source columns, so the
# resulting dataset only carries the "text" field.
dataset = raw_dataset.map(
    format_instruction,
    remove_columns=raw_dataset.column_names,
)

print("\nSuccessfully loaded and formatted dataset from Google Drive.")

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


Successfully loaded and formatted dataset from Google Drive.


In [43]:
# Spot-check one raw row to see the column names and the string form of the label.
raw_dataset[10]

{'id': '7115.json',
 'label': 'mostly-true',
 'statement': 'For the first time in history, the share of the national popular vote margin is smaller than the Latino vote margin.',
 'subject': 'elections',
 'speaker': 'robert-menendez',
 "speaker's_job_title": 'U.S. Senator',
 'state_info': 'New Jersey',
 'party_affiliation': 'democrat',
 'barely_true_counts': 1.0,
 'false_counts': 3.0,
 'half_true_counts': 1.0,
 'mostly_true_counts': 3.0,
 'pants_on_fire_counts': 0.0,
 'context': 'a speech'}

In [46]:
# Spot-check one formatted training example.
# NOTE(review): this reads index 11 while the raw-row check reads index 10, so
# the two outputs show different statements.
dataset[11]['text']

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nClassify the truthfulness of the following statement: "Since 2000, nearly 12 million Americans have slipped out of the middle class and into poverty."\n\nRespond with only one word: Real or Fake.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nFake<|eot_id|>'