In [1]:
pip install -U transformers==4.44.2 accelerate==0.34.2 peft==0.11.1 bitsandbytes sentence-transformers


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting peft==0.11.1
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accele

In [None]:
# Hugging Face authentication
from huggingface_hub import login

# Replace 'YOUR_HF_TOKEN' with your Hugging Face token
login(token="hf_pqJVpTilNYQokegiazQfTTGEnsdffdfdjbskjbdfbshbcsdfsf")

In [8]:
# -------------------------------
# Cell 1: Setup + Model + Tokenizer + QLoRA (COPY-PASTE)
# -------------------------------
# (Run this after you've installed the right packages:
#  pip install "transformers==4.40.2" "peft==0.11.1" "accelerate==0.30.1" bitsandbytes -q )

from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import os, torch
# reduce fragmentation (helpful but optional)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"  # or "expandable_segments:True"

# clear cuda cache before heavy loads
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

model_name = "microsoft/Phi-3-mini-4k-instruct"

# --- load tokenizer first (avoids extra warnings) ---
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# --- 4-bit Quantization Config (QLoRA) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Force entire model on GPU (works reliably on Colab GPUs)
device_map = {"": 0}

# Load model in safe 4-bit mode (IMPORTANT: use device_map, not "auto")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    low_cpu_mem_usage=True,
    max_memory={0: "14GiB", "cpu": "6GiB"},  # adjust if you have different GPU
    trust_remote_code=True
)

# Wrap forward to accept extra kwargs Trainer may pass (prevents num_items_in_batch errors)
def forward_with_kwargs(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
    return self.__class__.forward(self, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
model.forward = forward_with_kwargs.__get__(model, model.__class__)

# LoRA config — r reduced to 8 for stability on Colab; increase if you have more GPU RAM
lora_config = LoraConfig(
    r=8,                    # safer default for 16GB GPU — set to 4/8/16 depending on memory
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

model = get_peft_model(model, lora_config)

# Disable use_cache to avoid DynamicCache problems with Phi-3 during training
model.config.use_cache = False

print("Model + QLoRA setup complete!")
print("Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# -------------------------------
# Cell 2: Dataset + Tokenization + Trainer (FIXED)
# -------------------------------
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load dataset
dataset = load_dataset("json", data_files="/content/drive/MyDrive/legal_finetune_dataset.json")["train"]

# Preprocess (combine instruction + input)
def preprocess(example):
    text = example["instruction"]
    if example["input"] and example["input"].strip():
        text += "\n\n<input>\n" + example["input"]

    example["text"] = text
    example["labels_text"] = example["response"]
    return example

dataset = dataset.map(preprocess)

# Tokenizer function
def tokenize(batch):
    # Tokenize inputs
    inputs = tokenizer(
        batch["text"],
        truncation=True,
        max_length=512,
        padding=False
    )

    # Tokenize outputs
    labels = tokenizer(
        batch["labels_text"],
        truncation=True,
        max_length=512,
        padding=False
    )

    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

# Data collator for CAUSAL LM (IMPORTANT)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM, not masked LM
)

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/phi3-legal-lora",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=10,
    learning_rate=2e-4,
    fp16=False,               # DISABLE FP16
    bf16=True,                # ENABLE BF16 (matches your model)
    logging_steps=5,
    save_steps=200,
    optim="paged_adamw_8bit", # Best optimizer for QLoRA
    report_to="none",
    remove_unused_columns=False,
    save_strategy="steps",
    save_total_limit=5
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

print("Trainer ready!")


In [None]:
import torch, numpy as np

torch.serialization.add_safe_globals([
    np.core.multiarray._reconstruct,
    np.dtype,
    np.ndarray,
    np.integer,
    np.floating,
    np.bool_,
    np.bytes_,
    np.str_,
    np.void,
    np.generic,
    np.complexfloating,
    # new required dtype class
    np.dtype(np.uint32).type,   # UInt32DType
    np.dtype(np.uint64).type,
    np.dtype(np.int32).type,
    np.dtype(np.int64).type,
    np.dtype(np.float32).type,
    np.dtype(np.float64).type,
])


In [None]:
import torch
import torch.serialization

_orignal_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
  kwargs['weights_only'] = False
  return _orignal_torch_load(*args, **kwargs)

torch.load = patched_torch_load

In [None]:
last_ckpt = max(
    [os.path.join("/content/drive/MyDrive/phi3-legal-lora", d)
     for d in os.listdir("/content/drive/MyDrive/phi3-legal-lora")
     if d.startswith("checkpoint")],
    key=os.path.getmtime
)

trainer.train(resume_from_checkpoint=last_ckpt)


In [None]:
# Save LoRA + tokenizer
model.save_pretrained("/content/drive/MyDrive/phi3-legal-lora")
tokenizer.save_pretrained("/content/drive/MyDrive/phi3-legal-lora")

# Quick inference test
pipe_input = tokenizer(
    "<instruction>\nExtract the penalty clause.\n\n<input>\nThe contractor shall pay a fine of ₹25,000 for delay.\n",
    return_tensors="pt"
).to("cuda")

output = model.generate(**pipe_input, max_new_tokens=100)
print("=== Model Output ===")
print(tokenizer.decode(output[0], skip_special_tokens=True))

# /content/drive/MyDrive/phi3-legal-lora/checkpoint-400
# trainer.train(resume_from_checkpoint="/content/drive/MyDrive/phi3-legal-lora/checkpoint-400")
# trainer.train(resume_from_checkpoint=True)
