#### Convert input-and-label pairs to a single train and a eval jsonl

In [19]:
import json
import glob
import os
from sklearn.model_selection import train_test_split
import random



In [20]:
filtered_pairs= "./InputNLabel/filtred_labels"
file_and_limits = {
    "./InputNLabel/filtered_labels/filtered_formatted.json": 30,
    "./InputNLabel/filtered_labels/filtered_randoms.json": 30,
    "./InputNLabel/filtered_labels/filtered_receipts.json": 30,
    "./InputNLabel/filtered_labels/filtered_reports.json": 10,
    "./InputNLabel/prompt_sensitive_translated/formatted_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/random_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/receipts_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/reports_2.json": 300,
}


train_output_file = "train.jsonl"
eval_output_file = "eval.jsonl"

all_data = []


for file_path, limit in file_and_limits.items():
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read().strip()

        # Fix concatenated JSON objects
        objs = data.split("}{")

        parsed_objs = []
        for i, obj in enumerate(objs):
            if not obj.startswith("{"):
                obj = "{" + obj
            if not obj.endswith("}"):
                obj = obj + "}"
            try:
                parsed = json.loads(obj)
                parsed_objs.append(parsed)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {file_path}, object {i}: {e}")

        print(f"Loaded {len(parsed_objs)} valid objects from {file_path}")

        # Shuffle and select specified limit
        random.shuffle(parsed_objs)
        limited_data = parsed_objs[:limit]

        print(f"Using {len(limited_data)} objects from {file_path}")
        all_data.extend(limited_data)

print(f"Total combined dataset size: {len(all_data)}")

# Split combined data (90% train, 10% eval)
train_data, eval_data = train_test_split(all_data, test_size=0.1, random_state=42)


# Save train data
with open(train_output_file, "w", encoding="utf-8") as f:
    for item in train_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

# Save eval data
with open(eval_output_file, "w", encoding="utf-8") as f:
    for item in eval_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"Train data saved to: {train_output_file} ({len(train_data)} entries)")
print(f"Eval data saved to: {eval_output_file} ({len(eval_data)} entries)")

Loaded 99 valid objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Using 30 objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Using 30 objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Using 30 objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Loaded 30 valid objects from ./InputNLabel/filtered_labels/filtered_reports.json
Using 10 objects from ./InputNLabel/filtered_labels/filtered_reports.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/formatted_2.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/formatted_2.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/random_2.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/random_2.json
Loaded 300 valid objects from ./Inpu

### Training LoRA for immediate summariztion on receiving text

In [21]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    AutoConfig,
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from transformers import BitsAndBytesConfig
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import evaluate


In [22]:
from huggingface_hub import login
#login()

In [23]:

# Load datasets

# Data preprocessing
def preprocess(tokenizer, example):
    prompt = f"Summarize:\n{example['input']}\n\nSummary:\n"
    full_text = prompt + example["output"]
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=2048)
    labels = tokenized["input_ids"].copy()
    
    # Mask input prompt tokens in loss calculation
    prompt_token_len = len(tokenizer(prompt, truncation=True)["input_ids"])
    labels[:prompt_token_len] = [-100] * prompt_token_len
    tokenized["labels"] = labels
    
    return tokenized




In [18]:

#compute different metrics
def compute_metrics(tokenizer, eval_pred, rouge, bleu):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.split("Summary:\n")[-1].strip() for pred in decoded_preds]
    decoded_labels = [label.split("Summary:\n")[-1].strip() for label in decoded_labels]

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_result = bleu.compute(predictions=[p.split() for p in decoded_preds],
                               references=[[l.split()] for l in decoded_labels])

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"],
    }


In [None]:
from functools import partial


dataset = load_dataset("json", data_files={"train": "train.jsonl", "eval": "eval.jsonl"})

# Evaluation metrics setup
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
# Model setup
MODEL_NAME ="meta-llama/Llama-3.2-3B-Instruct"  

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
config = AutoConfig.from_pretrained(MODEL_NAME)
# manually set rope_scaling to supported structure:
#config.rope_scaling = {"type": "dynamic", "factor": 2.0}
config.inference_mode = False


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

preprocess = partial(preprocess, tokenizer)

train_data = dataset["train"].map(preprocess)
eval_data = dataset["eval"].map(preprocess)

train_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
eval_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    return_dict=True
)

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')




Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

TypeError: preprocess() missing 1 required positional argument: 'example'

In [None]:
# training_args = TrainingArguments(
#     output_dir="./summarizer_eval_output",
#     per_device_train_batch_size=2,         # Small batch fits easily 
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=4,         # Effective batch size: 2*4=8
#     eval_strategy="steps",
#     eval_steps=50,                         # Frequent evaluation for 1,000 samples
#     logging_steps=20,
#     save_steps=50,                         # More frequent checkpoints
#     learning_rate=2e-4,
#     num_train_epochs=5,                    # Slightly more epochs due to small dataset
#     fp16=True,
#     report_to="none",
#     load_best_model_at_end=True,
#     metric_for_best_model="rougeL",
#     greater_is_better=True,
#     save_total_limit=3
# )



# training_args = TrainingArguments(
#     output_dir="./summarizer_eval_output",
#     per_device_train_batch_size=1,         # <-- Reduce batch size to 1
#     per_device_eval_batch_size=1,          # <-- Reduce eval batch size
#     gradient_accumulation_steps=8,         # <-- Increase to compensate (effective batch size = 8)
#     gradient_checkpointing=True,           # <-- Enable gradient checkpointing (huge memory savings!)
#     eval_strategy="steps",
#     eval_steps=100,                        # <-- Slightly reduce eval frequency (less memory overhead)
#     logging_steps=50,                      # <-- Reduce logging overhead
#     save_steps=100,                        # <-- Fewer checkpoints (saves disk + memory)
#     learning_rate=1e-4,                    # <-- Lower LR can help stability in small batch setups
#     num_train_epochs=5,
#     fp16=True,
#     report_to="none",
#     load_best_model_at_end=True,
#     greater_is_better=True,
#     save_total_limit=2                     # <-- Fewer saved models (disk/memory efficient)
# )
training_args = TrainingArguments(
        output_dir="./llama-3.2-3b-instruct-lora",
        per_device_train_batch_size=4,       # Small due to memory limits
        gradient_accumulation_steps=8,      # 64 is high; 32 balances speed & memory
        num_train_epochs=2,                  # Same as before
        learning_rate=3e-4,                   # 6: Standard for LoRA tuning
        lr_scheduler_type="cosine",           # Good decay schedule
        warmup_ratio=0.01,                    # Slightly lower to prevent slow start
        weight_decay=0.005,                    # More balanced
        fp16=True,                            # Mixed precision training
        gradient_checkpointing=True,          # Reduces memory usage
        max_grad_norm=1.0,                    # More stable than 0.8
        save_total_limit=2,                    # Keep last 2 checkpoints
        save_steps=100,                        # Save frequently for monitoring
        logging_dir="./logs",
        logging_steps=1,                       # Logs every step
        eval_steps=100,                        # <-- Slightly reduce eval frequency (less memory overhead)
        eval_strategy="steps",
        optim="adamw_bnb_8bit",             # Best optimizer for quantized models
        report_to="none",                       # No external logging
        metric_for_best_model="rougeL",
        load_best_model_at_end=True,
        greater_is_better=True


)


training = True
if training:
    model.config.use_cache = False

compute_metrics = partial(compute_metrics, tokenizer, rouge, bleu)
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    compute_metrics=compute_metrics,
)





No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
import torch, gc

torch.cuda.empty_cache()  # Clears GPU memory immediately
gc.collect()              # Collects any lingering objects in Python memory



trainer.train()

# Save final model
model.save_pretrained("final_adapter_with_eval")
tokenizer.save_pretrained("final_adapter_with_eval")



RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn