In [None]:
# Install necessary libraries for Unsloth, Hugging Face, etc.
# Uses specific versions optimized for Colab in the 'else' block.
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to fine-tune. The dataset is in chat format, so an 'Instruct'
# variant is required; this is Unsloth's pre-quantized 4-bit build, chosen for
# its lower memory usage.
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

# Loading options.
#   load_in_4bit   - use 4-bit quantization for memory efficiency.
#   dtype          - None lets the loader auto-detect the precision.
#   max_seq_length - upper bound on tokens per sample; sized to fit long paper
#                    sections plus the JSON output. It mainly reserves VRAM;
#                    actual training time follows the real sample lengths.
load_in_4bit = True
dtype = None
max_seq_length = 30000

# Load model and tokenizer through Unsloth's optimized loader.
# For gated models, additionally pass token = "hf_..." (your Hugging Face token).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Layers that receive LoRA adapters: the attention projections and the MLP
# projections, a common selection for Llama models.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the quantized base model with LoRA adapters (QLoRA) for efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,            # adapter rank: balances capacity against memory use and overfitting risk
    lora_alpha=16,   # scaling factor applied to the LoRA updates
    lora_dropout=0,  # 0 is the setting Unsloth optimizes for
    bias="none",     # "none" is the setting Unsloth optimizes for
    use_gradient_checkpointing="unsloth",  # Unsloth's checkpointing, better memory use on long sequences
    random_state=3407,   # fixed for reproducibility
    use_rslora=False,    # Rank Stabilized LoRA left off (optional)
    loftq_config=None,   # no LoftQ initialization (optional)
)

In [None]:
from datasets import load_dataset

# --- IMPORTANT ---
# Upload your 'train.jsonl' file to the Colab environment before running this
# cell (use the "Files" tab on the left sidebar).
# -------------------

# The custom dataset is read directly from the JSON Lines file; the "json"
# loader takes care of the .jsonl format.
dataset_path = "train.jsonl"  # must match the uploaded filename
dataset = load_dataset("json", split="train", data_files=dataset_path)

# Optional sanity check: show the first record between two marker lines.
print(
    "--- Example Data Sample ---",
    dataset[0],
    "---------------------------",
    sep="\n",
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: BF16 where the GPU supports it,
# FP16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyperparameters for the run.
training_args = TrainingArguments(
    output_dir="outputs",   # checkpoints are written here
    seed=3407,              # fixed for reproducibility
    report_to="none",       # no external experiment tracking (could be "wandb")

    # 2 samples per device x 4 accumulation steps = effective batch size 8;
    # keeps memory use down while stabilizing training.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,

    # Three full passes over the dataset (about 350 samples per the notebook
    # author) as a starting point for a small dataset.
    num_train_epochs=3,

    learning_rate=2e-4,          # standard learning rate for LoRA
    lr_scheduler_type="linear",  # simple linear decay
    warmup_steps=5,              # learning-rate warmup steps
    weight_decay=0.01,
    optim="adamw_8bit",          # memory-efficient AdamW

    bf16=use_bf16,
    fp16=not use_bf16,

    logging_steps=10,            # log the training loss every 10 steps
)

# Supervised fine-tuning trainer around the QLoRA-adapted model.
# No 'dataset_text_field' is passed: this relies on SFTTrainer picking up the
# chat-style 'messages' column on its own.
# NOTE(review): confirm that the installed trl/Unsloth versions accept these
# keyword arguments and handle the 'messages' format.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    max_seq_length=max_seq_length,  # same token limit the model was loaded with
    dataset_num_proc=2,             # CPU processes for preprocessing; adjust to the instance
    # packing=True can speed up training when most samples are far shorter than
    # max_seq_length; False is the safer choice for potentially long sequences.
    packing=False,
    args=training_args,
)

In [None]:
# Baseline GPU memory figures taken before training starts. Both values are
# converted from bytes to GB (1024^3 bytes) and rounded to three decimals.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)

print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run the fine-tuning loop using the configuration held by `trainer`.
# The object returned by train() is kept in `trainer_stats`; the reporting cell
# further down reads its `.metrics` dict (e.g. 'train_runtime').
print("Starting fine-tuning...")
trainer_stats = trainer.train()
print("Fine-tuning finished!")

In [None]:
# Post-training report: peak reserved GPU memory (absolute, relative to the
# pre-training baseline, and as a share of total GPU memory) plus training time.
bytes_per_gib = 1024 ** 3
used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# 'train_runtime' may be missing from the metrics (e.g. training stopped
# early); fall back to 0 in that case instead of failing.
train_runtime = trainer_stats.metrics.get('train_runtime', 0)
if train_runtime > 0:
    train_runtime_minutes = round(train_runtime / 60, 2)
else:
    train_runtime_minutes = 0

print(
    f"{train_runtime:.4f} seconds used for training.",
    f"{train_runtime_minutes} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

In [None]:
from unsloth import FastLanguageModel
# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# --- Define your test input ---
system_prompt = "You are an expert AI assistant. Your task is to read the provided research paper text and generate a JSON object that represents the core logic of the paper as a flowchart, including nodes and edges."
user_input = "The Transformer architecture relies on self-attention mechanisms..." # Replace with a short snippet from a paper NOT in your training set

# --- Format using the chat template ---
# Instruct models expect their exact chat format, so the prompt is built with
# the tokenizer's chat template rather than by hand. The assistant turn is left
# out; add_generation_prompt=True appends the header that starts it.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_input},
]

# return_dict=True is requested explicitly so that the result always carries
# both `input_ids` and `attention_mask`. The mask is then passed to generate()
# instead of being left for it to infer.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    return_dict = True,
).to("cuda")
prompt_length = inputs["input_ids"].shape[1]

# --- Generate the response ---
# max_new_tokens is set to 1024 to leave room for a potentially long JSON answer.
outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)

# --- Keep only the generated part ---
# The output sequence starts with the prompt tokens, so everything after
# `prompt_length` is the assistant's reply. Slicing on token count avoids
# searching the decoded text for a template-specific header string, which
# would match the wrong place if the prompt itself contained that string.
generated_ids = outputs[0][prompt_length:]
assistant_response = tokenizer.decode(generated_ids, skip_special_tokens=False)
# Remove the end-of-turn token if present
assistant_response = assistant_response.replace("<|eot_id|>", "").strip()

print("--- Model Input Prompt ---")
print(tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False)) # Show the formatted input
print("\n--- Generated Response ---")
print(assistant_response)

In [None]:
from transformers import TextStreamer
FastLanguageModel.for_inference(model) # Ensure model is in inference mode

# --- Define another test input ---
messages_stream = [
    {"role": "system", "content": "You are an expert AI assistant..."}, # Use your system prompt
    {"role": "user", "content": "Provide a brief summary of Proximal Policy Optimization (PPO)."}, # Another test query
]

# --- Apply template ---
# return_dict=True is requested explicitly so the result carries both
# `input_ids` and `attention_mask`; both are forwarded to generate() below
# rather than passing the token ids alone.
inputs_stream = tokenizer.apply_chat_template(
    messages_stream,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
    return_dict = True,
).to("cuda")

# --- Setup streamer and generate ---
# skip_prompt=True ensures only the newly generated tokens are printed
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
print("--- Streaming Response ---")
_ = model.generate(**inputs_stream, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)

In [None]:
# Write the trained LoRA adapter weights and the tokenizer into the local
# 'lora_model' directory.
print("Saving LoRA adapters...")
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")
print("Adapters saved locally to 'lora_model'.")

# --- Optional: Push to Hugging Face Hub ---
# Log in first with !huggingface-cli login, and replace
# "your_username/your_model_name" with the repository name you want to use.
# model.push_to_hub("your_username/your_model_name", token = "YOUR_HF_TOKEN")
# tokenizer.push_to_hub("your_username/your_model_name", token = "YOUR_HF_TOKEN")
# print("Adapters pushed to Hugging Face Hub.")