In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
!pip install transformers datasets peft accelerate bitsandbytes xformers huggingface-hub

Mounted at /content/drive
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting torch>=1.13.0 (from peft)
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch>=1.

In [None]:
# 1. Clean uninstall
!pip uninstall -y torch torchvision torchaudio

# 2. Reinstall with CUDA 11.8 support (suitable for Colab Pro/A100)
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118



Found existing installation: torch 2.7.0
Uninstalling torch-2.7.0:
  Successfully uninstalled torch-2.7.0
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pyto

In [None]:
import os
import math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from huggingface_hub import login
from transformers import DataCollatorForLanguageModeling

# Login to Hugging Face
# SECURITY: never paste an access token into the notebook source. A token that
# has ever been saved in a notebook must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
#
# The token is read from the HF_TOKEN environment variable when it is set
# (on Colab you can export a notebook secret into os.environ before this cell);
# otherwise it is requested interactively and is never echoed or stored in the file.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    from getpass import getpass

    hf_token = getpass("Hugging Face access token: ").strip()

if not hf_token:
    raise ValueError(
        "No Hugging Face token provided. Set the HF_TOKEN environment variable "
        "or enter a token when prompted."
    )

login(token=hf_token)

# Specify the model name
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit (QLoRA-style) quantization so the 8B model fits in limited GPU memory.
# Delete or comment out this block to load the model unquantized instead.
#
# Native bfloat16 arithmetic requires an Ampere-or-newer GPU (compute capability
# >= 8.0, e.g. A100). Older cards such as the T4 (7.5) do not support it, so the
# compute dtype falls back to float16 there. The capability is checked directly
# because it reflects hardware support rather than software emulation.
gpu_supports_bf16 = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if gpu_supports_bf16 else torch.float16,
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",  # nf4 is recommended for LLMs
)

In [None]:

# Load the tokenizer
# use_auth_token is deprecated, use token=hf_token instead
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Load the model
# Arguments shared by both loading modes. trust_remote_code is deliberately not
# enabled: it lets a model repository execute arbitrary Python on this machine,
# and this architecture is implemented natively in transformers.
model_load_kwargs = {
    "device_map": "auto",  # Automatically distributes model across available GPUs
    "token": hf_token,  # Needed when the model requires authentication
}

# Use 4-bit quantization when a quantization_config has been defined in an
# earlier cell; otherwise load the weights directly in bf16.
active_quantization_config = globals().get("quantization_config")
if active_quantization_config is not None:
    model_load_kwargs["quantization_config"] = active_quantization_config
else:
    model_load_kwargs["torch_dtype"] = torch.bfloat16

model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Configure LoRA
peft_config = LoraConfig(
    r=32,  # LoRA attention dimension (rank of the update matrices)
    lora_alpha=64,  # Alpha parameter for LoRA scaling
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Attention projections to adapt
    lora_dropout=0.05,  # Dropout probability for LoRA layers
    bias="none",  # Do not train bias terms
    task_type=TaskType.CAUSAL_LM,  # Task type for Causal Language Modeling
)

# Wrap the base model so that only the LoRA adapter weights are trainable,
# then report how many parameters that leaves to train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

trainable params: 27,262,976 || all params: 8,057,524,224 || trainable%: 0.3384


In [None]:


# Load the dataset
# The JSONL file is read from the mounted Google Drive; adjust the path below
# if your copy of the dataset is stored somewhere else.
dataset_path = "/content/drive/MyDrive/alumni_smart_fine_tuning_dataset_full.jsonl" # <--- UPDATE THIS PATH

# The "json" loader exposes a single "train" split containing every record.
full_dataset = load_dataset("json", data_files=dataset_path)["train"]

# Hold out 10% of the records for evaluation; the fixed seed keeps the split
# identical from one run to the next.
dataset = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:


# Render a chat transcript in the Llama 3 Instruct prompt layout
def convert_messages_to_text(example):
    """Flatten ``example["messages"]`` into one Llama 3 Instruct formatted string.

    Reference: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/

    Args:
        example: A record whose "messages" entry is a sequence of dicts, each
            with a "role" and a "content" string.

    Returns:
        A dict ``{"text": prompt}``. The prompt starts with ``<|begin_of_text|>``
        and contains one header/body/``<|eot_id|>`` segment per message, in order.
        Message content is stripped of surrounding whitespace.

    Every message is rendered as a complete turn and no trailing assistant
    header is appended, so the records are expected to be full dialogues that
    already contain the assistant replies.
    """
    rendered_turns = [
        f"<|start_header_id|>{turn['role']}<|end_header_id|>\n\n{turn['content'].strip()}<|eot_id|>"
        for turn in example["messages"]
    ]
    return {"text": "<|begin_of_text|>" + "".join(rendered_turns)}

# Render every record of both splits into its prompt string; the mapped
# function returns {"text": ...}, which is stored alongside the original fields.
train_dataset, eval_dataset = (
    split.map(convert_messages_to_text) for split in (train_dataset, eval_dataset)
)

def tokenize(example):
    """Tokenize the rendered prompt(s) stored under ``example["text"]``.

    Args:
        example: A record (or a batch of records when used with
            ``map(..., batched=True)``) containing a "text" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.

    The text is truncated to 2048 tokens and left unpadded, so the data
    collator can pad each batch to its own longest sequence.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding=False,          # Dynamic padding is handled per batch by the collator
        max_length=2048,        # Upper bound used for truncation
        # The prompt text already begins with <|begin_of_text|>; letting the
        # tokenizer add its own special tokens would prepend a second BOS.
        add_special_tokens=False,
    )

# Set the pad token for the tokenizer
# Do NOT reuse the EOS token for padding: the language-modeling collator masks
# every label equal to the pad token id, so if EOS is the end-of-turn token that
# closes each message, those positions would be dropped from the loss and the
# model would never be trained to stop generating. Llama 3.1 ships a reserved
# token meant for exactly this purpose, so no embedding resize is needed.
pad_token = "<|finetune_right_pad_id|>"
if pad_token not in tokenizer.get_vocab():
    raise ValueError(
        f"Tokenizer has no {pad_token!r} token; choose a dedicated padding token "
        "that never appears in the training text."
    )
tokenizer.pad_token = pad_token

# Tokenize both splits in batches; sequences stay unpadded at this stage.
train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/64743 [00:00<?, ? examples/s]

Map:   0%|          | 0/7194 [00:00<?, ? examples/s]

Map:   0%|          | 0/64743 [00:00<?, ? examples/s]

Map:   0%|          | 0/7194 [00:00<?, ? examples/s]

In [None]:

# Data collator for language modeling
# mlm=False selects causal-LM behavior: each batch is padded dynamically and the
# labels are derived from input_ids, with padding positions excluded from the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps * number_of_gpus
# (4 * 4 = 16 on a single GPU). Tune the two factors to fit the available VRAM.
# IMPORTANT: Set the output_dir to a path within your mounted Google Drive
output_dir = "/content/drive/MyDrive/llama3_8b_finetune_checkpoints" # <--- UPDATE THIS PATH IN YOUR DRIVE

# Mixed precision: bf16 needs an Ampere-or-newer GPU (compute capability >= 8.0,
# e.g. A100). Older GPUs such as the T4 (7.5) must use fp16 instead.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = cuda_available and not use_bf16

training_args = TrainingArguments(
    output_dir=output_dir, # Directory to save checkpoints in Google Drive
    num_train_epochs=5, # Total number of training epochs
    per_device_train_batch_size=4, # Batch size per device
    gradient_accumulation_steps=4, # Accumulate gradients to reach the effective batch size
    learning_rate=2e-5, # Learning rate
    warmup_steps=100, # Number of steps for the warmup phase
    logging_steps=10, # Log training metrics every X steps
    save_strategy="steps", # Save checkpoint based on steps
    save_steps=500, # Save checkpoint every X steps
    # Without an explicit strategy evaluation stays disabled and eval_steps is ignored.
    # (Older transformers releases call this argument `evaluation_strategy`.)
    eval_strategy="steps",
    eval_steps=500, # Run evaluation every X steps
    # Only the loss is needed from evaluation. Keeping the logits would mean
    # accumulating a (tokens x vocabulary) array for the whole eval split.
    prediction_loss_only=True,
    # PEFT wrappers hide the base model's signature, so the Trainer cannot infer
    # the label column by itself; naming it lets eval_loss be computed and reported.
    label_names=["labels"],
    save_total_limit=3, # Limit the total number of checkpoints to save
    lr_scheduler_type="cosine", # Learning rate scheduler type
    bf16=use_bf16, # bf16 mixed precision where the GPU supports it
    fp16=use_fp16, # fp16 mixed precision on older CUDA GPUs
    logging_dir=f"{output_dir}/logs", # Directory for logging in Google Drive
    report_to="none", # Do not report to external services like W&B
    load_best_model_at_end=False, # Set to True if you want to load the best model based on eval metric
    # metric_for_best_model="eval_loss", # Metric to use if load_best_model_at_end is True
    # greater_is_better=False, # For eval_loss, smaller is better
)


def compute_metrics(eval_pred):
    """Turn a mean evaluation loss into a loss/perplexity pair.

    This helper is intentionally NOT passed to the Trainer: the object the
    Trainer hands to ``compute_metrics`` carries predictions and label ids, not
    a scalar loss, and supplying a metrics callback forces the Trainer to keep
    all logits in memory. Call it on evaluation results instead, e.g.
    ``compute_metrics(trainer.evaluate())``.

    Args:
        eval_pred: Either a metrics dict containing "eval_loss", or any object
            exposing a scalar ``loss`` attribute.

    Returns:
        ``{"loss": loss, "perplexity": exp(loss)}``; perplexity is reported as
        infinity for losses of 100 or more to avoid float overflow.
    """
    loss = eval_pred["eval_loss"] if isinstance(eval_pred, dict) else eval_pred.loss
    perplexity = math.exp(loss) if loss < 100 else float("inf")
    return {"loss": loss, "perplexity": perplexity}


# Initialize the Trainer
# eval_loss is logged at every evaluation step; perplexity is exp(eval_loss).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:

# --- Check for existing checkpoints and resume training ---
# Looks for saved checkpoints in the output directory (in Google Drive) and
# resumes from the one with the highest step number, if any exist.
import glob
import re

# Ensure the output directory exists in Google Drive
os.makedirs(output_dir, exist_ok=True)

# Accept only directories named exactly "checkpoint-<step>". Anything else that
# matches the glob (stray files, "checkpoint-123-backup", partially named
# folders) is ignored rather than crashing the integer conversion.
checkpoint_name = re.compile(r"checkpoint-(\d+)")
numbered_checkpoints = []
for candidate in glob.glob(f"{output_dir}/checkpoint-*"):
    name_match = checkpoint_name.fullmatch(os.path.basename(candidate))
    if name_match and os.path.isdir(candidate):
        numbered_checkpoints.append((int(name_match.group(1)), candidate))

# Valid checkpoint paths ordered by step number, oldest first.
checkpoints = [path for _, path in sorted(numbered_checkpoints)]

latest_checkpoint = None
if checkpoints:
    latest_checkpoint = checkpoints[-1]
    print(f"Resuming training from checkpoint: {latest_checkpoint}")
    # Continue training from the state stored in the checkpoint directory.
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    print("No checkpoint found, starting training from scratch.")
    # Start training from scratch
    trainer.train()

Resuming training from checkpoint: /content/drive/MyDrive/llama3_8b_finetune_checkpoints/checkpoint-19500


Step,Training Loss
19510,0.7996
19520,0.7891
19530,0.7501
19540,0.7861
19550,0.7585
19560,0.7854
19570,0.7965
19580,0.825
19590,0.7561
19600,0.7872


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Write the trained model into a "final_model" folder inside the checkpoint
# directory, separate from the numbered intermediate checkpoints.
final_model_dir = f"{output_dir}/final_model"
trainer.save_model(final_model_dir)