In [1]:
!pip3 install -q bitsandbytes datasets fsspec==2025.3.2 evaluate GPUtil rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
import os
import time
import json
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainerCallback
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import OFTConfig, get_peft_model, TaskType
from matplotlib import pyplot as plt
import evaluate
import gc
import psutil
import wandb
from tqdm import tqdm
from google.colab import drive

In [3]:

# Google Drive paths
DRIVE_MOUNT_PATH = "/content/drive"
ALPACA_JSON_PATH = "/MyDrive/ETHER/Dataset/alpaca_processed.json"


In [4]:

def load_preprocessed_dataset():
    """Load preprocessed Alpaca dataset (JSON Lines format) from Google Drive."""

    print("Mounting Google Drive...")
    drive.mount(DRIVE_MOUNT_PATH, force_remount=True)

    alpaca_json_full_path = DRIVE_MOUNT_PATH + ALPACA_JSON_PATH
    print(f"Looking for dataset at: {alpaca_json_full_path}")

    if not os.path.exists(alpaca_json_full_path):
        raise FileNotFoundError(f"File not found: {alpaca_json_full_path}")

    print("Loading Alpaca dataset from JSON Lines...")
    # Load each line as a JSON object
    with open(alpaca_json_full_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    # Convert to HuggingFace Dataset
    dataset = Dataset.from_list(data)
    print("\nSample entry:")
    for _ in range(5):
      print(dataset[_])




    dataset = dataset.train_test_split(test_size=0.3)

    return dataset["train"], dataset["test"]

In [5]:


# Load data
train_dataset, eval_dataset = load_preprocessed_dataset()

# Take a subset of data for faster experimentation
train_dataset = train_dataset.select(range(min(10000, len(train_dataset))))
eval_dataset = eval_dataset.select(range(min(100, len(eval_dataset))))

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Mounting Google Drive...
Mounted at /content/drive
Looking for dataset at: /content/drive/MyDrive/ETHER/Dataset/alpaca_processed.json
Loading Alpaca dataset from JSON Lines...

Sample entry:
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'llama3_prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.<|start_header_id|>user<|end_header_id|>\nGive three tips for staying healthy.<|start_header_id|>assistant<|end_header_id|>\n', 'llama3_output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_params = 0
    for name, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params:,} || "
        f"all params: {all_params:,} || "
        f"trainable%: {100 * trainable_params / all_params:.2f}"
    )


    return trainable_params, all_params

In [7]:

# Tokenize function for Llama-3.2 model
def tokenize_function(examples):
    # Combine prompt and output for complete sequences
    complete_samples = [prompt + output for prompt, output in zip(examples["llama3_prompt"], examples["llama3_output"])]

    # Tokenize the complete samples
    model_inputs = tokenizer(
        complete_samples,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Create labels (same as input_ids for causal LM)
    labels = model_inputs["input_ids"].clone()

    # Create attention masks for padding tokens
    attention_mask = model_inputs["attention_mask"]

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": attention_mask,
        "labels": labels
    }

In [8]:
train_dataset[0]

{'instruction': 'Name a game that can be played with cards.',
 'input': '',
 'output': 'One game that can be played with cards is Go Fish. This game requires a standard deck of cards ( usually the least number of cards possible- usually four players ) in which the objective of the game is to make the most pairs of matched cards. The player with the most matched pairs wins.',
 'llama3_prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.<|start_header_id|>user<|end_header_id|>\nName a game that can be played with cards.<|start_header_id|>assistant<|end_header_id|>\n',
 'llama3_output': 'One game that can be played with cards is Go Fish. This game requires a standard deck of cards ( usually the least number of cards possible- usually four players ) in which the objective of the game is to make the most pairs of matched cards. The player with the most matched pair

In [9]:


# Custom callback to monitor memory usage
class Logger(TrainerCallback):
    def __init__(self):
        self.memory_metrics = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return

        if torch.cuda.is_available():
            gpu_allocated = torch.cuda.memory_allocated() / (1024 * 1024)  # MB
            gpu_reserved = torch.cuda.memory_reserved() / (1024 * 1024)    # MB
            gpu_max_allocated = torch.cuda.max_memory_allocated() / (1024 * 1024)  # MB
            gpu_max_reserved = torch.cuda.max_memory_reserved() / (1024 * 1024)    # MB

            logs["gpu_allocated_mb"] = round(gpu_allocated, 2)
            logs["gpu_reserved_mb"] = round(gpu_reserved, 2)
            logs["gpu_max_allocated_mb"] = round(gpu_max_allocated, 2)
            logs["gpu_max_reserved_mb"] = round(gpu_max_reserved, 2)

            self.memory_metrics.append({
                "step": state.global_step,
                "gpu_allocated_mb": round(gpu_allocated, 2),
                "gpu_max_allocated_mb": round(gpu_max_allocated, 2)
            })

        # Get CPU memory stats
        cpu_percent = psutil.cpu_percent()
        ram_percent = psutil.virtual_memory().percent

        logs["cpu_percent"] = cpu_percent
        logs["ram_percent"] = ram_percent

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def get_memory_report(self):
        if not self.memory_metrics:
            return "No memory metrics collected."

        peak_allocated = max(metric["gpu_max_allocated_mb"] for metric in self.memory_metrics)
        last_allocated = self.memory_metrics[-1]["gpu_allocated_mb"]

        report = f"Memory Usage Summary:\n"
        report += f"- Peak GPU memory allocated: {peak_allocated:.2f} MB\n"
        report += f"- Final GPU memory allocated: {last_allocated:.2f} MB\n"

        return report

In [10]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)
        current_memory = torch.cuda.memory_allocated() / (1024 * 1024)
        print(f"GPU Memory: {current_memory:.2f} MB (Current) / {peak_memory:.2f} MB (Peak)")
        return peak_memory
    else:
        print("No GPU available")
        return 0


In [11]:

# Measure GPU usage before fine-tuning
print("\nGPU Memory Usage Before Fine-tuning:")
before_peak_gpu = print_gpu_utilization()



GPU Memory Usage Before Fine-tuning:
GPU Memory: 0.00 MB (Current) / 0.00 MB (Peak)


In [12]:

# Compute evaluation metrics
def compute_metrics(eval_preds, tokenizer):
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    # Unpack predictions and labels
    predictions, labels = eval_preds

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Convert logits to token IDs if needed
    if len(predictions.shape) == 3:
        predictions = np.argmax(predictions, axis=-1)

    # Replace -100 padding token IDs with the pad token ID
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and labels
    decoded_preds = []
    decoded_labels = []

    for pred, label in zip(predictions, labels):
        # Skip padding tokens
        pred_tokens = [t for t in pred if t != tokenizer.pad_token_id]
        label_tokens = [t for t in label if t != tokenizer.pad_token_id]

        pred_text = tokenizer.decode(pred_tokens, skip_special_tokens=True)
        label_text = tokenizer.decode(label_tokens, skip_special_tokens=True)

        decoded_preds.append(pred_text)
        decoded_labels.append(label_text)

    # Calculate ROUGE scores
    rouge_output = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Calculate BLEU score
    bleu_output = bleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
        "bleu": bleu_output["bleu"]
    }

In [13]:
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"
OFT_RANK = 16
MODULE_DROPOUT = 0.05

BATCH_SIZE = 4   # Reduced batch size for full precision model
GRADIENT_ACCUMULATION_STEPS = 16
LEARNING_RATE = 2e-4

NUM_EPOCHS = 1
MAX_SEQ_LENGTH = 256

OUTPUT_DIR = "./oft_finetuned_llama32_3b"

WANDB_PROJECT = "llama-3.2-3B-oft-finetuning-2"
WANDB_NAME = "alpaca-oft-3"
USE_WANDB = True


In [14]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [15]:
# Tokenize datasets
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_eval_dataset = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)

Tokenizing datasets...


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [16]:
print("\nGPU Memory Usage Before Model Loading:")
print_gpu_utilization()



GPU Memory Usage Before Model Loading:
GPU Memory: 0.00 MB (Current) / 0.00 MB (Peak)


0.0

In [17]:
print("Loading model!")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,  # Use float16 instead of full precision for reasonable memory usage
    trust_remote_code=True
)

Loading model!


config.json:   0%|          | 0.00/945 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [18]:
trainable_params, all_params = print_trainable_parameters(model)

trainable params: 3,212,749,824 || all params: 3,212,749,824 || trainable%: 100.00


In [19]:
# Initialize wandb
if USE_WANDB:
    wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_NAME,
        config={
            "model": MODEL_NAME,
            "oft_r": OFT_RANK,
            "module_dropout": MODULE_DROPOUT,
            "batch_size": BATCH_SIZE,
            "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
            "learning_rate": LEARNING_RATE,
            "num_epochs": NUM_EPOCHS,
            "max_seq_length": MAX_SEQ_LENGTH,

        }
    )

    # Print GPU usage after loading model
print("\nGPU Memory Usage After Model Loading:")
print_gpu_utilization()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdinesh-te[0m ([33mdinesh-te-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



GPU Memory Usage After Model Loading:
GPU Memory: 6127.83 MB (Current) / 6880.00 MB (Peak)


6880.0

In [20]:

# Configure OFT
oft_config = OFTConfig(
    task_type=TaskType.CAUSAL_LM,
    r=OFT_RANK,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    module_dropout=MODULE_DROPOUT,
    init_weights=True,
    inference_mode=False
)

In [21]:


# Apply OFT to the model
model = get_peft_model(model, oft_config)

In [22]:
# Print model architecture
for name, module in model.named_modules():
    if any(target in name for target in ["q_proj", "k_proj", "v_proj", "o_proj"]):
        print(f"Module: {name}, Shape: {module.weight.shape if hasattr(module, 'weight') else 'No weight'}")



Module: base_model.model.model.layers.0.self_attn.q_proj, Shape: torch.Size([3072, 3072])
Module: base_model.model.model.layers.0.self_attn.q_proj.base_layer, Shape: torch.Size([3072, 3072])
Module: base_model.model.model.layers.0.self_attn.q_proj.oft_r, Shape: No weight
Module: base_model.model.model.layers.0.self_attn.q_proj.oft_s, Shape: No weight
Module: base_model.model.model.layers.0.self_attn.q_proj.oft_dropout, Shape: No weight
Module: base_model.model.model.layers.0.self_attn.q_proj.oft_dropout.default, Shape: No weight
Module: base_model.model.model.layers.0.self_attn.k_proj, Shape: torch.Size([1024, 3072])
Module: base_model.model.model.layers.0.self_attn.k_proj.base_layer, Shape: torch.Size([1024, 3072])
Module: base_model.model.model.layers.0.self_attn.k_proj.oft_r, Shape: No weight
Module: base_model.model.model.layers.0.self_attn.k_proj.oft_s, Shape: No weight
Module: base_model.model.model.layers.0.self_attn.k_proj.oft_dropout, Shape: No weight
Module: base_model.model.

In [23]:
trainable_params, all_params = print_trainable_parameters(model)

trainable params: 66,289,664 || all params: 3,279,039,488 || trainable%: 2.02


In [24]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,

    eval_accumulation_steps=2,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    logging_first_step=True,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,

    bf16=False,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="cosine",
    save_total_limit=3,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="wandb" if USE_WANDB else "none",
    logging_strategy="steps",
    push_to_hub=False,
    save_strategy="epoch",
)

# Create memory tracking callback
metrics_callback = Logger()

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),
    callbacks=[metrics_callback]
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [25]:
# Train the model
start_time = time.time()
print("\nStarting fine-tuning...")
print("=" * 50)

train_result = trainer.train()

# Calculate time taken
total_time = time.time() - start_time
hours, remainder = divmod(total_time, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Training took {int(hours)}h {int(minutes)}m {int(seconds)}s")


Starting fine-tuning...




Step,Training Loss
1,3.552
10,2.2197
20,1.3263
30,1.2984
40,1.295
50,1.3035
60,1.2245
70,1.259
80,1.2354
90,1.2193


Training took 0h 28m 8s


In [26]:
#Eavluate
final_eval_metrics = trainer.evaluate()

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [27]:
print("\n" + "=" * 50)
print("FINE-TUNING SUMMARY")
print("=" * 50)
print(f"Model: {MODEL_NAME}")
print(f"OFT r={OFT_RANK}, module_dropout={MODULE_DROPOUT}")
print(f"Time taken: {int(hours)}h {int(minutes)}m {int(seconds)}s")
print(f"Train loss: {train_result.training_loss:.4f}")
print(f"Eval loss: {final_eval_metrics['eval_loss']:.4f}")
print(f"ROUGE-1: {final_eval_metrics['eval_rouge1']:.4f}")
print(f"ROUGE-2: {final_eval_metrics['eval_rouge2']:.4f}")
print(f"ROUGE-L: {final_eval_metrics['eval_rougeL']:.4f}")
print(f"BLEU: {final_eval_metrics['eval_bleu']:.4f}")
print("=" * 50)

# Close wandb
if USE_WANDB:
    wandb.finish()


FINE-TUNING SUMMARY
Model: unsloth/Llama-3.2-3B-Instruct
OFT r=16, module_dropout=0.05
Time taken: 0h 28m 8s
Train loss: 1.3149
Eval loss: 1.2817
ROUGE-1: 0.7176
ROUGE-2: 0.5064
ROUGE-L: 0.6684
BLEU: 0.4579


0,1
eval/bleu,▁
eval/loss,▁
eval/rouge1,▁
eval/rouge2,▁
eval/rougeL,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇███

0,1
eval/bleu,0.45785
eval/loss,1.28171
eval/rouge1,0.71757
eval/rouge2,0.50636
eval/rougeL,0.6684
eval/runtime,59.1944
eval/samples_per_second,1.689
eval/steps_per_second,0.422
total_flos,4.424326650947174e+16
train/epoch,0.9984


In [28]:
trainer.save_model(OUTPUT_DIR)