In [11]:
import os

# Set the ACCELERATE_MIXED_PRECISION environment variable to "fp16" for this process.
# NOTE(review): this cell carries execution count 11, i.e. it was last run after
# the training cell (In [10]), and the training config in this notebook sets
# bf16=True — confirm which precision is intended and when this should run.
os.environ.update({"ACCELERATE_MIXED_PRECISION": "fp16"})

In [2]:
%pip install datasets transformers trl peft huggingface_hub dotenv bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Pull the variables defined in .env into the kernel's environment ...
load_dotenv()

# ... then authenticate against the Hugging Face Hub with the HF_TOKEN value.
login(token=os.environ.get("HF_TOKEN"))


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
# --- Paths and model identifiers -------------------------------------------
# Directory holding the preprocessed JSONL training files.
preprocessed_data_dir = "/home/dinesh/Downloads/dataset/processed-IN-Ext/"
# Base model (gated: make sure the HF login cell has run).
model_name = "meta-llama/Llama-2-7b-hf"
# Where to save LoRA adapter + tokenizer; the directory is created up front.
adapter_save_dir = "/home/dinesh/Downloads/dataset/fine_tuned_lora_adapter"
os.makedirs(adapter_save_dir, exist_ok=True)

# (Optional) If you want a merged full model later (BIG disk usage)
# merged_model_dir = "../fine_tuned_lora_merged_model"

# --- Tokenizer + 8-bit base model ------------------------------------------
# Only `load_in_8bit` is passed. BitsAndBytesConfig exposes a compute-dtype
# option for 4-bit loading (`bnb_4bit_compute_dtype`) but has no
# `bnb_8bit_compute_dtype`; an unknown keyword is ignored rather than applied.
# fp16 for the non-quantized modules is requested through `dtype` below.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16,  # `torch_dtype` is deprecated in favour of `dtype`
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.55s/it]


In [4]:
# LoRA adapter configuration targeting the attention projection modules.
lora_config = LoraConfig(
    r=8,                    # Rank (size of low-rank matrices)
    lora_alpha=16,          # Scaling factor for low-rank matrices
    lora_dropout=0.1,       # Dropout rate for LoRA layers
    bias="none",            # No bias in LoRA layers
    task_type="CAUSAL_LM",  # Task type for causal language modeling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Target modules for LoRA
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# Cast any trainable bf16 parameter to fp16. The dtype report printed below
# shows only float32 for this setup, so the loop currently changes nothing.
for param in model.parameters():
    if param.requires_grad and param.dtype == torch.bfloat16:
        param.data = param.data.to(torch.float16)

model.print_trainable_parameters()

# torch.dtype objects define no ordering, so a bare sorted() raises TypeError
# as soon as more than one dtype is present; sort by their string form instead.
trainable_dtypes = sorted(
    {param.dtype for param in model.parameters() if param.requires_grad},
    key=str,
)
print("Trainable dtypes:", trainable_dtypes)


trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243
Trainable dtypes: [torch.float32]


In [5]:
# 3) Apply LoRA — only if it has not been applied yet.
#
# `model` is already wrapped by the get_peft_model call in the LoRA setup cell,
# so it must not be wrapped a second time: calling get_peft_model on a PeftModel
# nests one adapter wrapper inside another. Wrapping happens here only when that
# earlier cell was skipped.
#
# The torch.set_default_dtype(torch.float16) wrapper that used to surround the
# call is dropped: with it in place the dtype report below still printed
# torch.float32, and the default dtype was not restored if wrapping raised.
# NOTE(review): PEFT presumably up-casts fp16/bf16 adapter weights to fp32 by
# default (`autocast_adapter_dtype`) — confirm against the PEFT docs.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

# key=str: torch.dtype objects cannot be ordered with "<".
print(sorted({p.dtype for p in model.parameters() if p.requires_grad}, key=str))

[torch.float32]




In [None]:
# # Runtime guard: convert bf16 grads if they appear
# def fp16_grad_hook(grad):
#     if grad is not None and grad.dtype == torch.bfloat16:
#         return grad.to(torch.float16)
#     return grad

In [None]:
# for n, p in model.named_parameters():
#     if p.requires_grad:
#         p.register_hook(fp16_grad_hook)

In [6]:
# Load and preprocess dataset
# -------------------------
def load_dataset(jsonl_file, max_input_chars=10000):
    """Build a single-column prompt dataset from a JSON Lines file.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object that must
    contain the string fields ``"judgement"`` and ``"summary"``. Each record is
    rendered into one instruction / input / response prompt.

    Args:
        jsonl_file: Path to a UTF-8 encoded JSON Lines file.
        max_input_chars: Maximum number of characters of the stripped judgement
            kept in the prompt. Defaults to 10000; pass ``None`` to keep the
            whole text.

    Returns:
        The result of ``Dataset.from_dict({"text": prompts})``, with prompts in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"judgement"`` or ``"summary"``.
    """
    system_prompt = "Summarize the following legal text."

    texts = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue

            item = json.loads(line)
            judgement = item["judgement"].strip()[:max_input_chars]
            summary = item["summary"].strip()

            # The two separator lines consist of a single space; they are
            # spelled out explicitly so the prompt format cannot be altered by
            # an editor trimming trailing whitespace.
            text = (
                f"### Instruction: {system_prompt}\n"
                " \n"
                "### Input:\n"
                f"{judgement}\n"
                " \n"
                "### Response:\n"
                f"{summary}\n"
            ).strip()
            texts.append(text)

    return Dataset.from_dict({"text": texts})
 
# Resolve the two summary files (A1 and A2) inside the preprocessed data directory.
train_file_A1, train_file_A2 = (
    os.path.join(preprocessed_data_dir, file_name)
    for file_name in ("full_summaries_A1.jsonl", "full_summaries_A2.jsonl")
)

# Turn each file into a prompt dataset, then join them into one training set.
train_dataset_A1, train_dataset_A2 = map(load_dataset, (train_file_A1, train_file_A2))
train_data = concatenate_datasets([train_dataset_A1, train_dataset_A2])

In [9]:
# 7) Training config (TRL 0.26.2)
# =========================
train_params = SFTConfig(
    # --- Output, logging and checkpointing ---
    output_dir="/home/dinesh/Documents/vs code/results_lora",
    report_to="tensorboard",
    logging_steps=50,
    save_steps=50,

    # --- Dataset handling (TRL 0.26.x wants these in SFTConfig) ---
    dataset_text_field="text",
    # NOTE(review): the training cell's output warns that packing without a
    # flash-attention implementation may lead to unexpected behavior.
    packing=True,
    group_by_length=True,

    # --- Run length and batch shape ---
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,

    # --- Optimiser and schedule ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,       # lowered from 5e-3, which is too high for LoRA on LLaMA-2
    weight_decay=0.001,
    lr_scheduler_type="constant",
    # NOTE(review): a plain "constant" schedule presumably ignores warmup_ratio;
    # "constant_with_warmup" would be needed if warmup is intended — confirm.
    warmup_ratio=0.03,

    # --- Precision: bf16 is enabled, fp16 is disabled ---
    fp16=False,
    bf16=True,

    # IMPORTANT: 0.0 disables clip_grad_norm_ (extra safety).
    # If you want clipping, set back to 0.3 AFTER this works.
    max_grad_norm=0.0,
)

In [None]:
# # Training config
# # -------------------------
# train_params = SFTConfig(
#     output_dir="../results_lora",
#     num_train_epochs=3,
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=1,
#     optim="paged_adamw_32bit",
#     save_steps=50,
#     logging_steps=50,
#     learning_rate=5e-3,
#     weight_decay=0.001,
#     fp16=True,
#     bf16=False,
#     bf16_full_eval=False,
#     fp16_full_eval=False,
#     max_grad_norm=0.3,
#     warmup_ratio=0.03,
#     group_by_length=True,
#     lr_scheduler_type="constant",
#     report_to="tensorboard",
#     dataset_text_field="text"
# )

In [10]:
from trl import SFTTrainer

# Assemble the supervised fine-tuning trainer from the model, the prompt
# dataset, the training configuration and the tokenizer.
fine_tuning = SFTTrainer(
    args=train_params,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_data,
)

# Run training; as the cell's last expression, the returned result is displayed.
print("Starting fine-tuning...")
fine_tuning.train()

Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn2, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn2, kernels-community/flash-attn3, kernels-community/vllm-f

Starting fine-tuning...


Step,Training Loss
50,1.6383
100,1.5431
150,1.3481
200,1.2747
250,0.9671
300,0.9232


TrainOutput(global_step=300, training_loss=1.282414639790853, metrics={'train_runtime': 193.7853, 'train_samples_per_second': 1.548, 'train_steps_per_second': 1.548, 'total_flos': 1.21941176352768e+16, 'train_loss': 1.282414639790853, 'epoch': 3.0})

In [12]:
# Write the trained model and the tokenizer side by side into one directory.
# NOTE(review): `adapter_save_dir` is defined (and created) in the setup cell
# but is not the directory used here — confirm which location is intended.
print("Saving the fine-tuned model...")
for artifact in (model, tokenizer):
    artifact.save_pretrained("/home/dinesh/Downloads/dataset/fine_tuned_lora_model")
print("Fine-tuned model saved at '/home/dinesh/Downloads/dataset/fine_tuned_lora_model'")

Saving the fine-tuned model...
Fine-tuned model saved at '/home/dinesh/Downloads/dataset/fine_tuned_lora_model'
