#### Convert input-and-label pairs to a single train and a eval jsonl

In [26]:
import json
import glob
import os
from sklearn.model_selection import train_test_split
import random



In [39]:
filtered_pairs= "./InputNLabel/filtred_labels"
file_and_limits = {
    "./InputNLabel/filtered_labels/filtered_formatted.json": -1,
    "./InputNLabel/filtered_labels/filtered_randoms.json": -1,
    "./InputNLabel/filtered_labels/filtered_receipts.json": -1,
    "./InputNLabel/filtered_labels/filtered_reports.json": -1,
    "./InputNLabel/prompt_sensitive_translated/formatted_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/random_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/receipts_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/reports_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/formatted_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/random_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/receipts_2.json": 300,
    "./InputNLabel/prompt_sensitive_translated/reports_2.json": 300,
}


train_output_file = "train_600_1_2.jsonl"
eval_output_file = "eval_600_1_2.jsonl"
def get_train_and_eval_data(file_and_limits, train_output_file, eval_output_file):
    all_data = []


    for file_path, limit in file_and_limits.items():
        with open(file_path, "r", encoding="utf-8") as file:
            data = file.read().strip()

            # Fix concatenated JSON objects
            objs = data.split("}{")

            parsed_objs = []
            for i, obj in enumerate(objs):
                if not obj.startswith("{"):
                    obj = "{" + obj
                if not obj.endswith("}"):
                    obj = obj + "}"
                try:
                    parsed = json.loads(obj)
                    parsed_objs.append(parsed)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON in file {file_path}, object {i}: {e}")

            print(f"Loaded {len(parsed_objs)} valid objects from {file_path}")

            # Shuffle and select specified limit
            random.shuffle(parsed_objs)
            limited_data = parsed_objs[:limit]

            print(f"Using {len(limited_data)} objects from {file_path}")
            all_data.extend(limited_data)

    # Split combined data (90% train, 10% eval)

    train_data, eval_data =  train_test_split(all_data, test_size=0.1, random_state=42)
    if train_output_file is not None:
        try:
            print(f"Total combined dataset size: {len(all_data)}")
            with open(train_output_file, "w", encoding="utf-8") as f:
                for item in train_data:
                    json.dump(item, f, ensure_ascii=False)
                    f.write("\n")
        except Exception as e:
            print(f"Error saving train data: {e}")
    if eval_output_file is not None:
    # Save eval data
        try: 
            with open(eval_output_file, "w", encoding="utf-8") as f: 
                for item in eval_data:
                    json.dump(item, f, ensure_ascii=False)
                    f.write("\n")
        except Exception as e:
            print(f"Error saving eval data: {e}") 

    print(f"Train data saved to: {train_output_file} ({len(train_data)} entries)")
    print(f"Eval data saved to: {eval_output_file} ({len(eval_data)} entries)")
    return train_data, eval_data


train_data, eval_data = get_train_and_eval_data(file_and_limits, train_output_file, eval_output_file) 



Loaded 99 valid objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Using 98 objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Using 99 objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Using 99 objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Loaded 30 valid objects from ./InputNLabel/filtered_labels/filtered_reports.json
Using 29 objects from ./InputNLabel/filtered_labels/filtered_reports.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/formatted_1.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/formatted_1.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/random_1.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/random_1.json
Loaded 300 valid objects from ./Inpu

### Training LoRA for immediate summariztion on receiving text

In [28]:
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    AutoConfig,
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from transformers import BitsAndBytesConfig
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel, PeftConfig
import evaluate


In [29]:
from huggingface_hub import login
#login()
MODEL_NAME ="meta-llama/Llama-3.2-3B-Instruct"  


In [30]:

# Load datasets

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
# Data preprocessing
def preprocess( example):
    prompt = f"Summarize:\n{example['input']}\n\nSummary:\n"
    full_text = prompt + example["output"]
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=2048)
    labels = tokenized["input_ids"].copy()
    
    # Mask input prompt tokens in loss calculation
    prompt_token_len = len(tokenizer(prompt, truncation=True)["input_ids"])
    labels[:prompt_token_len] = [-100] * prompt_token_len
    tokenized["labels"] = labels
    
    return tokenized




In [31]:

#compute different metrics
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
def compute_metrics( eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.split("Summary:\n")[-1].strip() for pred in decoded_preds]
    decoded_labels = [label.split("Summary:\n")[-1].strip() for label in decoded_labels]

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_result = bleu.compute(predictions=[p.split() for p in decoded_preds],
                               references=[[l.split()] for l in decoded_labels])

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"],
    }


In [32]:

def format_supervised_data(train_file, eval_file ):
    dataset = load_dataset("json", data_files={"train": train_file, "eval": eval_file})

    train_data = dataset["train"].map(preprocess)
    eval_data = dataset["eval"].map(preprocess)

    train_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    eval_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return train_data, eval_data

train_data, eval_data = format_supervised_data( "train_600_1_2.jsonl", "eval_600_1_2.jsonl")


Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2452 [00:00<?, ? examples/s]

Map:   0%|          | 0/273 [00:00<?, ? examples/s]

In [35]:
from functools import partial
def set_up_lora(MODEL_NAME, lora_config="", training = False):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    config = AutoConfig.from_pretrained(MODEL_NAME)
    # manually set rope_scaling to supported structure:
    config.rope_scaling = {"type": "dynamic", "factor": 2.0}
    if training:
        config.inference_mode = False
        config.use_cache = False
    else:
        config.inference_mode = True
        config.use_cache = True


    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        config=config,
        device_map="auto"
    )
    #model.get_input_embeddings().weight.requires_grad = True

    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    # LoRA configuration
    if lora_config == "":
        lora_config = LoraConfig( r=16,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.CAUSAL_LM
        )
        
        model = get_peft_model(model, lora_config)
    else:
        model = PeftModel.from_pretrained(model,lora_config)

    return model







In [36]:

# from bitsandbytes.optim import AdamW8bit
# optimizer = AdamW8bit(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-8)
# from transformers import get_cosine_schedule_with_warmup
batch_size = 4
calculation_steps = 8
num_epochs = 2
# total_training_steps = (len(train_data) // (batch_size * calculation_steps)) * num_epochs
# num_training_steps = total_training_steps  # calculate this
# num_warmup_steps = int(0.03 * num_training_steps)

# lr_scheduler = get_cosine_schedule_with_warmup(
#     optimizer=optimizer,
#     num_warmup_steps=num_warmup_steps,
#     num_training_steps=num_training_steps
# )

def set_up_training_args(batch_size, calculation_steps, num_epochs):

    training_args = TrainingArguments(
            output_dir="./llama-3.2-3b-instruct-lora",
            per_device_train_batch_size=batch_size,       # Small due to memory limits
            gradient_accumulation_steps=calculation_steps,      # 64 is high; 32 balances speed & memory
            num_train_epochs=num_epochs,                  # Same as before
            learning_rate=3e-4,                   # 6: Standard for LoRA tuning
            lr_scheduler_type="cosine",           # Good decay schedule
            warmup_ratio=0.01,                    # Slightly lower to prevent slow start
            weight_decay=0.005,                    # More balanced
            fp16=True,                            # Mixed precision training
            gradient_checkpointing=True,          # Reduces memory usage
            max_grad_norm=1.0,                    # More stable than 0.8
            save_total_limit=2,                    # Keep last 2 checkpoints
            save_steps=100,                        # Save frequently for monitoring
            logging_dir="./logs",
            logging_steps=1,                       # Logs every step
            eval_steps=100,                        # <-- Slightly reduce eval frequency (less memory overhead)
            eval_strategy="steps",                  #not enough of data for "epoch"
            #optimizer=(optimizer,lr_scheduler),             # Best optimizer for quantized models
            optim="adamw_bnb_8bit",
            #optim_args={"lr_scheduler": lr_scheduler, "optimizer": optimizer},
            report_to="none",                       # No external logging
            metric_for_best_model="rougeL",
            load_best_model_at_end=True,
            greater_is_better=True
    )
    return training_args


model = set_up_lora(MODEL_NAME)

training_args = set_up_training_args(batch_size, calculation_steps, num_epochs)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    compute_metrics=compute_metrics,
)





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# import torch, gc

# torch.cuda.empty_cache()  # Clears GPU memory immediately
# gc.collect()              # Collects any lingering objects in Python memory



# #trainer.train()

# # Save final model
# model.save_pretrained("final_adapter_with_eval")
# tokenizer.save_pretrained("final_adapter_with_eval")

In [None]:


# import torch, gc



# torch.cuda.empty_cache()  # Clears GPU memory immediately
# gc.collect()              # Collects any lingering objects in Python memory

# trainer.evaluate()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ValueError: Can't find 'adapter_config.json' at './../Training/final_adapter_with_eval'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch
file_and_limits = {
    "./InputNLabel/filtered_labels/filtered_formatted.json": -1,
    "./InputNLabel/filtered_labels/filtered_randoms.json": -1,
    "./InputNLabel/filtered_labels/filtered_receipts.json": -1,
    "./InputNLabel/filtered_labels/filtered_reports.json": -1,
    "./InputNLabel/prompt_sensitive_translated/formatted_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/random_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/receipts_1.json": 300,
    "./InputNLabel/prompt_sensitive_translated/reports_1.json": 300,
}
train_data, eval_data = get_train_and_eval_data(file_and_limits, "train_1.jsonl", "eval_1.jsonl")
train_data, eval_data = format_supervised_data( "train_1.jsonl", "eval_1.jsonl")



# Load tokenizer
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # or the base model you trained on
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load your LoRA adapter
#previously trained adapter
adapter_path = "./../Training/final_adapter_with_eval_0"  # or wherever your adapter_model.safetensors is
model = set_up_lora(MODEL_NAME, lora_config=adapter_path, training = False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    compute_metrics=compute_metrics,
)
trainer.train()
model.save_pretrained("final_adapter_with_eval_1")
tokenizer.save_pretrained("final_adapter_with_eval_1")


Loaded 99 valid objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Using 98 objects from ./InputNLabel/filtered_labels/filtered_formatted.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Using 99 objects from ./InputNLabel/filtered_labels/filtered_randoms.json
Loaded 100 valid objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Using 99 objects from ./InputNLabel/filtered_labels/filtered_receipts.json
Loaded 30 valid objects from ./InputNLabel/filtered_labels/filtered_reports.json
Using 29 objects from ./InputNLabel/filtered_labels/filtered_reports.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/formatted_1.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/formatted_1.json
Loaded 300 valid objects from ./InputNLabel/prompt_sensitive_translated/random_1.json
Using 300 objects from ./InputNLabel/prompt_sensitive_translated/random_1.json
Loaded 300 valid objects from ./Inpu

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1372 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


KeyboardInterrupt: 