In [None]:
!pip3 install -q bitsandbytes datasets fsspec==2025.3.2 evaluate GPUtil rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for GPUtil (setup.py) ... [?2

In [None]:
import os
import time
import json
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainerCallback

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, EvalPrediction

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from matplotlib import pyplot as plt

import evaluate
import gc
import psutil
import GPUtil
from tqdm import tqdm
import wandb


from google.colab import drive


In [None]:

# Google Drive paths
DRIVE_MOUNT_PATH = "/content/drive"
ALPACA_JSON_PATH = "/MyDrive/ETHER/Dataset/alpaca_processed.json"


In [None]:

def load_preprocessed_dataset():
    """Load preprocessed Alpaca dataset (JSON Lines format) from Google Drive."""

    print("Mounting Google Drive...")
    drive.mount(DRIVE_MOUNT_PATH, force_remount=True)

    alpaca_json_full_path = DRIVE_MOUNT_PATH + ALPACA_JSON_PATH
    print(f"Looking for dataset at: {alpaca_json_full_path}")

    if not os.path.exists(alpaca_json_full_path):
        raise FileNotFoundError(f"File not found: {alpaca_json_full_path}")

    print("Loading Alpaca dataset from JSON Lines...")
    # Load each line as a JSON object
    with open(alpaca_json_full_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    # Convert to HuggingFace Dataset
    dataset = Dataset.from_list(data)
    print("\nSample entry:")
    for _ in range(5):
      print(dataset[_])




    dataset = dataset.train_test_split(test_size=0.3)

    return dataset["train"], dataset["test"]

In [None]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}%"
    )
    return trainable_params, all_param

In [None]:


# Load and process datasets
train_dataset, eval_dataset = load_preprocessed_dataset()


print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

train_dataset = train_dataset.select(range(min(10000, len(train_dataset))))
eval_dataset = eval_dataset.select(range(min(100, len(eval_dataset))))

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")


Mounting Google Drive...
Mounted at /content/drive
Looking for dataset at: /content/drive/MyDrive/ETHER/Dataset/alpaca_processed.json
Loading Alpaca dataset from JSON Lines...

Sample entry:
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'llama3_prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.<|start_header_id|>user<|end_header_id|>\nGive three tips for staying healthy.<|start_header_id|>assistant<|end_header_id|>\n', 'llama3_output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent

In [None]:
# Configure BitsAndBytes for quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_params = 0
    for name, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params:,} || "
        f"all params: {all_params:,} || "
        f"trainable%: {100 * trainable_params / all_params:.2f}"
    )


    return trainable_params, all_params

In [None]:
def tokenize_function(examples):
    """
    Tokenize the data for Llama-3.2 training.

    """
    # Combine prompt and output for complete sequences
    complete_samples = [prompt + output for prompt, output in zip(examples["llama3_prompt"], examples["llama3_output"])]

    # Tokenize the complete samples
    model_inputs = tokenizer(
        complete_samples,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Create labels (same as input_ids, as we're doing causal language modeling)
    labels = model_inputs["input_ids"].clone()

    # Create attention masks for padding tokens
    attention_mask = model_inputs["attention_mask"]

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": attention_mask,
        "labels": labels
    }

In [None]:
train_dataset[0]

{'instruction': 'Name a sport which is played indoors.',
 'input': '',
 'output': 'Basketball.',
 'llama3_prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.<|start_header_id|>user<|end_header_id|>\nName a sport which is played indoors.<|start_header_id|>assistant<|end_header_id|>\n',
 'llama3_output': 'Basketball.<|end_of_text|>',
 'llama3_complete': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.<|start_header_id|>user<|end_header_id|>\nName a sport which is played indoors.<|start_header_id|>assistant<|end_header_id|>\nBasketball.<|end_of_text|>',
 'instruction_length': 37,
 'output_length': 11,
 'instruction_start': 'Name a sport'}

In [None]:


class Logger(TrainerCallback):
    """
    Custom callback for Hugging Face Trainer to track memory usage during training.
    """
    def __init__(self):
        self.memory_metrics = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """
        Event called after logging the last logs.
        Logs GPU and CPU memory usage.
        """
        if logs is None:
            return

        # Get GPU memory stats if available
        if torch.cuda.is_available():
            gpu_allocated = torch.cuda.memory_allocated() / (1024 * 1024)  # MB
            gpu_reserved = torch.cuda.memory_reserved() / (1024 * 1024)    # MB
            gpu_max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024)  # MB
            gpu_max_reserved = torch.cuda.max_memory_reserved() / (1024 * 1024)    # MB

            logs["gpu_allocated_mb"] = round(gpu_allocated, 2)
            logs["gpu_reserved_mb"] = round(gpu_reserved, 2)
            logs["gpu_max_allocated_mb"] = round(gpu_max_allocated, 2)
            logs["gpu_max_reserved_mb"] = round(gpu_max_reserved, 2)

            # Store the metrics
            self.memory_metrics.append({
                "step": state.global_step,
                "gpu_allocated_mb": round(gpu_allocated, 2),
                "gpu_max_allocated_mb": round(gpu_max_allocated, 2)
            })

        # Get CPU memory stats
        cpu_percent = psutil.cpu_percent()
        ram_percent = psutil.virtual_memory().percent

        logs["cpu_percent"] = cpu_percent
        logs["ram_percent"] = ram_percent

    def on_step_end(self, args, state, control, **kwargs):
        """
        Event called at the end of a training step.
        Perform garbage collection every 100 steps to manage memory.
        """
        if state.global_step % 100 == 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def get_memory_report(self):
        """
        Return a summary of memory usage during training.
        """
        if not self.memory_metrics:
            return "No memory metrics collected."

        peak_allocated = max(metric["gpu_max_allocated_mb"] for metric in self.memory_metrics)
        last_allocated = self.memory_metrics[-1]["gpu_allocated_mb"]

        report = f"Memory Usage Summary:\n"
        report += f"- Peak GPU memory allocated: {peak_allocated:.2f} MB\n"
        report += f"- Final GPU memory allocated: {last_allocated:.2f} MB\n"

        return report


In [None]:
def print_gpu_utilization():
    """
    Function to print GPU memory usage.
    Returns peak memory usage in MB.
    """
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)
        current_memory = torch.cuda.memory_allocated() / (1024 * 1024)
        print(f"GPU Memory: {current_memory:.2f} MB (Current) / {peak_memory:.2f} MB (Peak)")
        return peak_memory
    else:
        print("No GPU available")
        return 0

In [None]:

# Measure GPU usage before fine-tuning
print("\nGPU Memory Usage Before Fine-tuning:")
before_peak_gpu = print_gpu_utilization()



GPU Memory Usage Before Fine-tuning:
GPU Memory: 0.00 MB (Current) / 0.00 MB (Peak)


In [None]:
def compute_metrics(eval_preds, tokenizer):
    """
    Compute evaluation metrics for fine-tuning.
    """
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    # Unpack predictions and labels
    predictions, labels = eval_preds

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # If predictions are logits, convert to token IDs
    if len(predictions.shape) == 3:
        predictions = np.argmax(predictions, axis=-1)

    # Replace -100 padding token IDs in labels with the pad token ID
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and labels
    decoded_preds = []
    decoded_labels = []

    for pred, label in zip(predictions, labels):
        # Skip padding in preds and labels
        pred_tokens = [t for t in pred if t != tokenizer.pad_token_id]
        label_tokens = [t for t in label if t != tokenizer.pad_token_id]

        pred_text = tokenizer.decode(pred_tokens, skip_special_tokens=True)
        label_text = tokenizer.decode(label_tokens, skip_special_tokens=True)

        decoded_preds.append(pred_text)
        decoded_labels.append(label_text)


    # Calculate ROUGE scores
    rouge_output = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Calculate BLEU score
    bleu_output = bleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
        "bleu": bleu_output["bleu"]
    }

In [None]:

MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"

LORA_RANK = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4

NUM_EPOCHS = 1
MAX_SEQ_LENGTH = 256


OUTPUT_DIR = "./lora_finetuned_llama32_3b"

WANDB_PROJECT = "llama-3.2-3B-lora-finetuning-2"
WANDB_NAME = "alpaca-lora-2"
USE_WANDB = True

In [None]:


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
tokenized_eval_dataset = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns= eval_dataset.column_names
)


Tokenizing datasets...


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Load model
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading model...


config.json:   0%|          | 0.00/945 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [None]:
trainable_params, all_params = print_trainable_parameters(model)

trainable params: 394,177,536 || all params: 1,803,463,680 || trainable%: 21.86


In [None]:
global tokenizer

# Initialize wandb
if USE_WANDB:
  wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_NAME,
        config={
            "model": MODEL_NAME,
            "lora_r": LORA_RANK,
            "lora_alpha": LORA_ALPHA,
            "lora_dropout": LORA_DROPOUT,
            "batch_size": BATCH_SIZE,
            "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
            "learning_rate": LEARNING_RATE,
            "num_epochs": NUM_EPOCHS,
            "max_seq_length": MAX_SEQ_LENGTH,
            "data_source": "Google Drive"
        }
    )


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdinesh-te[0m ([33mdinesh-te-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:


# Prepare model for training
model = prepare_model_for_kbit_training(model)


In [None]:

# Define LoRA configuration
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)


In [None]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [None]:
# Create data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Setting Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    evaluation_strategy="epoch",
    eval_accumulation_steps=2,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,  # Log more frequently
    logging_first_step=True,  # Log the first step
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,

    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="cosine",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="wandb",
    logging_strategy="steps",
    push_to_hub=False,
    save_strategy="epoch",

)

metrics_callback = Logger()

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),
    callbacks=[metrics_callback]
)

# Train the model
start_time = time.time()
print("\nStarting fine-tuning...")
print("=" * 50)
import sys
sys.stdout.flush()
train_result = trainer.train()

# Calculate time taken
total_time = time.time() - start_time
hours, remainder = divmod(total_time, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Training took {int(hours)}h {int(minutes)}m {int(seconds)}s")

# Save model
trainer.save_model(OUTPUT_DIR)

# Get final metrics
final_eval_metrics = trainer.evaluate()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Starting fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Allocated Mb,Reserved Mb,Max Allocated Mb,Max Reserved Mb,Percent
0,1.0869,1.009993,0.738482,0.544044,0.701625,0.503131,3549.63,5158.0,7550.26,10690.0,21.3


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Training took 0h 14m 12s


In [None]:

# Print summary
print("\n" + "="*50)
print("FINE-TUNING SUMMARY")
print("="*50)
print(f"Model: {MODEL_NAME}")
print(f"LoRA r={LORA_RANK}, alpha={LORA_ALPHA}, dropout={LORA_DROPOUT}")

print(f"Time taken: {int(hours)}h {int(minutes)}m {int(seconds)}s")
print(f"Train loss: {train_result.training_loss:.4f}")
print(f"Eval loss: {final_eval_metrics['eval_loss']:.4f}")
print(f"ROUGE-1: {final_eval_metrics['eval_rouge1']:.4f}")
print(f"ROUGE-2: {final_eval_metrics['eval_rouge2']:.4f}")
print(f"ROUGE-L: {final_eval_metrics['eval_rougeL']:.4f}")
print(f"BLEU: {final_eval_metrics['eval_bleu']:.4f}")
print("="*50)



FINE-TUNING SUMMARY
Model: unsloth/Llama-3.2-3B-Instruct
LoRA r=8, alpha=16, dropout=0.05
Time taken: 0h 14m 12s
Train loss: 1.1803
Eval loss: 1.0100
ROUGE-1: 0.7385
ROUGE-2: 0.5440
ROUGE-L: 0.7016
BLEU: 0.5031


In [None]:

MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

In [None]:

predictions = []
references = []

for example in tqdm(eval_dataset):
    input_text = example["llama3_prompt"]
    reference_text = example["llama3_output"].replace("<|end_of_text|>", "").strip()

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=128,
            do_sample=False
        )

    decoded_pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    predictions.append(decoded_pred)
    references.append(reference_text)

100%|██████████| 100/100 [07:54<00:00,  4.75s/it]


In [None]:

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

rouge_scores = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

bleu_scores = bleu.compute(
    predictions=predictions,
    references=[[ref] for ref in references]
)


print("ROUGE:")
for k, v in rouge_scores.items():
    print(f"{k}: {v:.4f}")

print(f"\nBLEU: {bleu_scores['bleu']:.4f}")

ROUGE:
rouge1: 0.2467
rouge2: 0.0938
rougeL: 0.1695
rougeLsum: 0.2089

BLEU: 0.0533
