#### Convert input-and-label pairs into a single train JSONL file and a single eval JSONL file

In [None]:
import json
import glob
import os
from sklearn.model_selection import train_test_split
import random



In [2]:
import re  # stdlib; used only to skip whitespace and resynchronise after a bad object

# Directory holding the filtered input/label pairs.
# (Previously misspelled as "filtred_labels"; the files below live in "filtered_labels".)
filtered_pairs = "./InputNLabel/filtered_labels"

# Source file -> maximum number of records sampled from that file.
file_and_limits = {
    "./InputNLabel/filtered_labels/filtered_formatted.json": 30,
    "./InputNLabel/filtered_labels/filtered_randoms.json": 30,
    "./InputNLabel/filtered_labels/filtered_receipts.json": 30,
    "./InputNLabel/filtered_labels/filtered_reports.json": 10,
    "./InputNLabel/prompt_sensitive_translated/formatted_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/random_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/receipts_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/reports_2.json": 300,
}


train_output_file = "train.jsonl"
eval_output_file = "eval.jsonl"

# One seed for both the per-file sampling and the train/eval split, so that
# re-running the cell reproduces the same train.jsonl / eval.jsonl.
SEED = 42

_LEADING_WS = re.compile(r"\s*")
_OBJECT_BOUNDARY = re.compile(r"\}\s*\{")


def _parse_concatenated_json(text, source):
    """Parse *text* holding several JSON values written back to back.

    Values are decoded one after another with ``JSONDecoder.raw_decode``, so
    nested objects, ``}{`` sequences inside strings and whitespace between
    values are all handled. A top-level JSON array is flattened into its
    elements. Empty (or whitespace-only) text yields an empty list.

    Args:
        text: Raw file content.
        source: Label used in error messages (the file path).

    Returns:
        List of decoded records, in file order.
    """
    decoder = json.JSONDecoder()
    records = []
    index = 0
    pos = _LEADING_WS.match(text, 0).end()
    while pos < len(text):
        try:
            value, pos = decoder.raw_decode(text, pos)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {source}, object {index}: {e}")
            # Best effort: skip ahead to the next "}{" seam and retry from its "{".
            # The seam may sit inside the malformed object, so what is recovered
            # after an error should be treated with suspicion.
            boundary = _OBJECT_BOUNDARY.search(text, pos)
            if boundary is None:
                break
            pos = boundary.end() - 1
        else:
            if isinstance(value, list):
                records.extend(value)
            else:
                records.append(value)
        index += 1
        pos = _LEADING_WS.match(text, pos).end()
    return records


def _write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line (UTF-8, non-ASCII kept)."""
    with open(path, "w", encoding="utf-8") as f:
        for item in records:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")


all_data = []
rng = random.Random(SEED)

for file_path, limit in file_and_limits.items():
    # Read first, parse afterwards: the handle is not held open while decoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    parsed_objs = _parse_concatenated_json(data, file_path)
    print(f"Loaded {len(parsed_objs)} valid objects from {file_path}")

    # Shuffle and select specified limit
    rng.shuffle(parsed_objs)
    limited_data = parsed_objs[:limit]

    print(f"Using {len(limited_data)} objects from {file_path}")
    all_data.extend(limited_data)

print(f"Total combined dataset size: {len(all_data)}")

# Split combined data (90% train, 10% eval)
train_data, eval_data = train_test_split(all_data, test_size=0.1, random_state=SEED)

_write_jsonl(train_output_file, train_data)
_write_jsonl(eval_output_file, eval_data)

print(f"Train data saved to: {train_output_file} ({len(train_data)} entries)")
print(f"Eval data saved to: {eval_output_file} ({len(eval_data)} entries)")

Loaded 99 valid objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Using 30 objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Using 30 objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Using 30 objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Loaded 30 valid objects from ./InputNLabel/filtered_labels/filtered_reports.json
Using 10 objects from ./InputNLabel/filtered_labels/filtered_reports.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/formatted_2.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/formatted_2.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/random_2.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/random_2.json
Loaded 300 valid objects from ./Inpu

### Training LoRA for immediate summarization on receiving text

In [3]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from transformers import BitsAndBytesConfig
import numpy as np
import evaluate

# Model setup
# Identifier handed to every from_pretrained() call below; change it to use another base model.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Quantization settings: 4-bit NF4 with double quantization, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.float16,
    }
)

# Base model loaded with the quantization config above.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# The tokenizer reuses its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# LoRA configuration
# Names of the modules that receive LoRA adapters.
_lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=_lora_targets,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model; from here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, lora_config)

# Load datasets
# Split name -> JSONL file; the split names are used as keys of `dataset`.
_split_files = {"train": "train.jsonl", "eval": "eval.jsonl"}
dataset = load_dataset("json", data_files=_split_files)

# Data preprocessing
def preprocess(example, max_length=2048):
    """Tokenize one input/output pair and build loss labels for the summary only.

    The text is ``"Summarize:\\n<input>\\n\\nSummary:\\n<output>"``, truncated and
    padded to ``max_length`` tokens. Labels equal ``input_ids`` on the summary
    tokens and ``-100`` (ignored by the loss) on prompt and padding positions.

    Args:
        example: Mapping with string fields ``"input"`` and ``"output"``.
        max_length: Fixed sequence length; the default matches the previous
            hard-coded value.

    Returns:
        The tokenizer output with an added ``"labels"`` list of the same
        length as ``"input_ids"``.
    """
    prompt = f"Summarize:\n{example['input']}\n\nSummary:\n"
    full_text = prompt + example["output"]
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=max_length)

    input_ids = tokenized["input_ids"]
    attention = tokenized["attention_mask"]

    # Length of the prompt in tokens, capped at the sequence length. The old
    # slice assignment `labels[:n] = [-100] * n` silently *lengthened* labels
    # whenever the prompt alone was longer than the truncated sequence.
    prompt_ids = tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"]

    # Real tokens start at index 0 with right padding; with left padding they
    # start after the pad run, so locate the first attended position.
    first_real = attention.index(1) if 1 in attention else len(attention)
    prompt_end = min(first_real + len(prompt_ids), len(input_ids))

    # Mask input prompt tokens and padding in loss calculation.
    # NOTE(review): if the prompt fills the whole window every label is -100
    # and the example contributes nothing to the loss — consider filtering.
    # NOTE(review): no EOS token is appended to the summary, so the model is
    # never shown where to stop — confirm whether that is intended.
    tokenized["labels"] = [
        token_id if (attended and position >= prompt_end) else -100
        for position, (token_id, attended) in enumerate(zip(input_ids, attention))
    ]

    return tokenized

# Tokenize both splits with preprocess(), then expose only the model inputs as torch tensors.
_model_columns = ("input_ids", "attention_mask", "labels")

train_data = dataset["train"].map(preprocess)
eval_data = dataset["eval"].map(preprocess)

for _split in (train_data, eval_data):
    _split.set_format("torch", columns=list(_model_columns))

def data_collator(features):
    """Batch pre-padded features while keeping the labels built by ``preprocess``.

    ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds ``labels`` from
    ``input_ids``, which throws away the prompt masking done in ``preprocess``.
    It also masks by pad-token id, and the pad token here is the EOS token.
    This collator stacks the existing columns instead and masks padding by
    ``attention_mask``.

    Args:
        features: List of mappings with equally long ``input_ids`` and
            ``attention_mask`` (tensors or lists) and optionally ``labels``.

    Returns:
        Dict with stacked ``input_ids``, ``attention_mask`` and ``labels``
        tensors; labels are ``-100`` wherever ``attention_mask`` is 0.
    """
    batch = {
        key: torch.stack([torch.as_tensor(feature[key]) for feature in features])
        for key in ("input_ids", "attention_mask")
    }
    if "labels" in features[0]:
        labels = torch.stack([torch.as_tensor(feature["labels"]) for feature in features])
    else:
        # Same fallback as the previous collator: predict every input token.
        labels = batch["input_ids"].clone()
    batch["labels"] = labels.masked_fill(batch["attention_mask"] == 0, -100)
    return batch

# Evaluation metrics setup
# Load both scorers once, up front; compute_metrics() reads them as globals.
rouge, bleu = (evaluate.load(_metric_name) for _metric_name in ("rouge", "bleu"))

def compute_metrics(eval_pred):
    """Score teacher-forced predictions against the reference summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            per-token logits ``(batch, seq, vocab)`` or already-argmaxed token
            ids ``(batch, seq)``; ``labels`` is ``(batch, seq)`` with ``-100``
            on ignored positions.

    Returns:
        Dict with ``rouge1``, ``rouge2``, ``rougeL`` and ``bleu`` scores.

    Note:
        These are next-token predictions given the gold prefix, not free
        generation, so the scores are an optimistic proxy.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        # Some models return extra outputs alongside the logits.
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.ndim == labels.ndim + 1:
        preds = np.argmax(preds, axis=-1)

    # A causal LM's output at position t predicts token t+1, so align
    # predictions[:, :-1] with labels[:, 1:] before comparing.
    preds = preds[:, :-1]
    labels = labels[:, 1:]

    # -100 is not a vocabulary id and cannot be decoded. Swap ignored positions
    # for the pad token on both sides; skip_special_tokens then drops them.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    ignored = labels == -100
    labels = np.where(ignored, pad_id, labels)
    preds = np.where(ignored, pad_id, preds)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Still needed when the labels include the prompt (no prompt masking).
    decoded_preds = [pred.split("Summary:\n")[-1].strip() for pred in decoded_preds]
    decoded_labels = [label.split("Summary:\n")[-1].strip() for label in decoded_labels]

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # The `evaluate` BLEU metric takes raw strings and tokenizes them itself;
    # each prediction gets a list of reference strings.
    try:
        bleu_result = bleu.compute(
            predictions=decoded_preds,
            references=[[reference] for reference in decoded_labels],
        )
        bleu_score = bleu_result["bleu"]
    except ZeroDivisionError:
        # Raised when every prediction (or reference) decodes to empty text.
        bleu_score = 0.0

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_score,
    }

# Training arguments
# An earlier configuration (kept here for reference) was identical except for
# eval_steps=200, save_steps=200 and num_train_epochs=3.

# Batch sizing — author's note: a small batch fits easily on an RTX 3060.
# Effective batch size is 2 * 4 = 8.
_batching = {
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 4,
}

# Evaluate, log and checkpoint often — author's note: sized for ~1,000 samples.
_cadence = {
    "evaluation_strategy": "steps",
    "eval_steps": 50,
    "logging_steps": 20,
    "save_steps": 50,
    "save_total_limit": 3,
}

# Optimisation — author's note: slightly more epochs because the dataset is small.
_optimisation = {
    "learning_rate": 2e-4,
    "num_train_epochs": 5,
    "fp16": True,
}

# Keep the checkpoint with the highest rougeL and reload it when training ends.
_model_selection = {
    "load_best_model_at_end": True,
    "metric_for_best_model": "rougeL",
    "greater_is_better": True,
}

training_args = TrainingArguments(
    output_dir="./summarizer_eval_output",
    report_to="none",
    **_batching,
    **_cadence,
    **_optimisation,
    **_model_selection,
)

# Trainer setup
# Gather everything the Trainer needs: model, hyper-parameters, both splits,
# the batching function and the metric callback.
_trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_data,
    "eval_dataset": eval_data,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**_trainer_inputs)


RuntimeError: Failed to import transformers.models.auto.modeling_auto because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
register_pytree_node() got an unexpected keyword argument 'flatten_with_keys_fn'

In [None]:
# Run the fine-tuning loop
trainer.train()

# Write the model and the tokenizer into the same output directory.
# `model` is the PEFT-wrapped model, so presumably only adapter weights are saved — confirm.
_final_dir = "final_adapter_with_eval"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(_final_dir)