In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch

# Load the tokenizer that matches the base checkpoint.
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # or the base model you trained on
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token, so tokenizer.pad_token_id
# equals tokenizer.eos_token_id from here on.
tokenizer.pad_token = tokenizer.eos_token

# Load base model in 4-bit if you used quantization
from transformers import BitsAndBytesConfig

# Quantization settings: 4-bit weights, "nf4" quant type, double quantization
# enabled, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Base causal-LM checkpoint, loaded with the quantization config above;
# device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16
)
# Load your LoRA adapter
adapter_path = "./../Training/final_adapter_with_eval"  # or wherever your adapter_model.safetensors is
# `model` is the adapter-wrapped model built on top of `base_model`.
# NOTE(review): PEFT is understood to inject the adapter layers into
# `base_model` in place, so `base_model` may no longer be an untouched
# baseline after this call -- confirm before using it for comparisons.
model = PeftModel.from_pretrained(base_model, adapter_path)
# Turn on the use_cache flag in the model config for generation.
model.config.use_cache = True


2025-03-24 19:31:25.882470: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def _generate_text(prompt, max_new_tokens, use_adapter, include_prompt):
    """Sample a completion for ``prompt`` and decode it to a string.

    Shared implementation behind ``generate_summary`` and
    ``generate_base_summary`` so both use identical sampling settings.

    Args:
        prompt: Fully formatted prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        use_adapter: When False, generation runs with the adapter layers
            temporarily disabled, i.e. on the plain base weights.
        include_prompt: When True the decoded text contains the prompt
            followed by the completion; when False only the completion.

    Returns:
        The decoded text with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Passing the pad id explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" warning.
        pad_token_id=tokenizer.pad_token_id,
    )

    with torch.no_grad():
        if use_adapter:
            outputs = model.generate(**inputs, **generation_kwargs)
        else:
            # PeftModel.from_pretrained injects the adapter layers into the
            # base model object itself, so calling base_model.generate would
            # still apply the adapter. disable_adapter() switches the adapter
            # off for the duration of the call and restores it afterwards.
            with model.disable_adapter():
                outputs = model.generate(**inputs, **generation_kwargs)

    token_ids = outputs[0]
    if not include_prompt:
        # The returned sequence starts with the prompt tokens; drop them.
        prompt_length = inputs["input_ids"].shape[1]
        token_ids = token_ids[prompt_length:]
    return tokenizer.decode(token_ids, skip_special_tokens=True)


def generate_summary(input_text, max_new_tokens=150, include_prompt=True):
    """Summarize ``input_text`` with the fine-tuned (adapter-enabled) model.

    Args:
        input_text: Text to summarize.
        max_new_tokens: Upper bound on the number of generated tokens.
        include_prompt: Keep the echoed prompt in the returned text
            (default, matches the previous behaviour) or return only the
            newly generated part.

    Returns:
        The decoded model output as a string.
    """
    prompt = f"summarize:{input_text}"
    return _generate_text(
        prompt, max_new_tokens, use_adapter=True, include_prompt=include_prompt
    )


def generate_base_summary(input_text, max_new_tokens=150, include_prompt=True):
    """Summarize ``input_text`` with the base model, adapter switched off.

    Note that the prompt template here differs from ``generate_summary``
    (it ends with an explicit "summary:" cue); it is kept unchanged.

    Args:
        input_text: Text to summarize.
        max_new_tokens: Upper bound on the number of generated tokens.
        include_prompt: Keep the echoed prompt in the returned text
            (default, matches the previous behaviour) or return only the
            newly generated part.

    Returns:
        The decoded model output as a string.
    """
    prompt = f"summarize :{input_text}\n summary:"
    return _generate_text(
        prompt, max_new_tokens, use_adapter=False, include_prompt=include_prompt
    )

In [3]:
import json
def get_json_from_file(file_path):
    """Read a file holding back-to-back JSON objects and return them as a list.

    The file is expected to contain JSON objects written one after another
    (``{...}{...}``), optionally separated by whitespace. Objects are decoded
    one at a time with ``json.JSONDecoder.raw_decode``, so nested objects and
    string values that happen to contain ``}{`` are handled correctly.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list with every successfully decoded value, in file order. An empty
        (or whitespace-only) file yields an empty list.

    A fragment that cannot be decoded is reported on stdout and skipped;
    parsing resumes at the next ``}{`` boundary, if there is one.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read().strip()

    if not data:
        return []

    # Same leniency as before: tolerate a file whose outermost opening or
    # closing brace was cut off.
    if not data.startswith("{"):
        data = "{" + data
    if not data.endswith("}"):
        data = data + "}"

    decoder = json.JSONDecoder()
    parsed_objs = []
    length = len(data)
    pos = 0
    obj_index = 0
    while pos < length:
        try:
            parsed, end = decoder.raw_decode(data, pos)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {file_path}, object {obj_index}: {e}")
            # Best-effort recovery: jump to the next object boundary.
            boundary = data.find("}{", pos)
            if boundary == -1:
                break
            pos = boundary + 1
        else:
            parsed_objs.append(parsed)
            pos = end
            # raw_decode does not skip leading whitespace, so do it here.
            while pos < length and data[pos].isspace():
                pos += 1
        obj_index += 1
    return parsed_objs

In [8]:

# example_file = "./../Training/../Training/InputNLabel/prompt_sensitive_translated/receipts_2.json"
# file_and_limits = {
#     "./../Training/InputNLabel/filtered_labels/filtered_formatted.json": 30,
#     "./../Training/InputNLabel/filtered_labels/filtered_randoms.json": 30,
#     "./../Training/InputNLabel/filtered_labels/filtered_receipts.json": 30,
#     "./../Training/InputNLabel/filtered_labels/filtered_reports.json": 10,
#     "./../Training/InputNLabel/prompt_sensitive_translated/formatted_2.json": 300,
#     "./../Training/InputNLabel/prompt_sensitive_translated/random_2.json": 300,
#     "./../Training/InputNLabel/prompt_sensitive_translated/receipts_2.json": 300,
#     "./../Training/InputNLabel/prompt_sensitive_translated/reports_2.json": 300,
# }
# Paths of the four input files, one per document category. All of them live
# in the prompt_sensitive_translated folder under ../Training/InputNLabel.
formatted_chunks = "./../Training/InputNLabel/prompt_sensitive_translated/formatted_2.json"
receipt_chunks = "./../Training/InputNLabel/prompt_sensitive_translated/receipts_2.json"
report_chunks = "./../Training/InputNLabel/prompt_sensitive_translated/reports_2.json"
random_chunks = "./../Training/InputNLabel/prompt_sensitive_translated/random_2.json"

# Parse each file into a list of records with get_json_from_file.
formatted = get_json_from_file(formatted_chunks)
receipts = get_json_from_file(receipt_chunks)
reports = get_json_from_file(report_chunks)
randoms = get_json_from_file(random_chunks)

# Report how many records were loaded per category.
print("PDFs with formats",len(formatted))
print("PDFs of recipts that need ocr", len(receipts))
print("PDFs of well written long papers, no ocr",len(reports))
print("PDFs of all sorts",len(randoms))


# Show the "input" field of the first formatted record as a sanity check.
print(formatted[0]["input"])



PDFs with formats 300
PDFs of recipts that need ocr 300
PDFs of well written long papers, no ocr 300
PDFs of all sorts 300
-08
Shipped Date: 2017-08-11
Products:
Product: Jack's New England Glam Chowder
‘Quantity: 20
Unit Pie:
9.65,

--------------------------------------------------


In [22]:
# Qualitative comparison on one long input built by joining the "input" text
# of the first three formatted records. The result of generate_base_summary
# is printed first, then the result of generate_summary.
# Observation recorded for trained adapter 0 (after 72 examples): the output
# still contains original text, but the summary is now more concise.
input_text = "".join(formatted[idx]["input"] for idx in range(3))

print("!!!!<----------------------->!!!!")
print(generate_base_summary(input_text))

print("!!!!<----------------------->!!!!")
print(generate_summary(input_text))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


!!!!<----------------------->!!!!


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


summarize :-08
Shipped Date: 2017-08-11
Products:
Product: Jack's New England Glam Chowder
‘Quantity: 20
Unit Pie:
9.65,

--------------------------------------------------7-08-21
Shipped Date: 2017-08-28
Products:
--------------------------------------------------------------------------------------------------
Product: Gudbrandsdalsost
Quantity: 20
Unit Price: 36.0
Total: 720.0
--------------------------------------------------------------------------------------------------
Product: Outback Lager
Quantity: 15
Unit Price: 15.0
Total: 225.0
Total Price:

--------------------------------------------------
Total Price: 945.0

---------------------------------------Order ID: 10481
Shipping Details:
Ship Name: Ricardo Adocicados
Ship Address: Av. Copacabana, 257
Ship Cy: de Janeiro.
Ship Region: South America
Ship Postal Code: 02389-890
Ship County: Brazil
Customer Details:
CustomerID: RICAR
Customer Name:
Ricardo Adocicados
Employee Details:
Employee Name: Laura Callahan
Shipper Details: