In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from peft import PeftModel, PeftConfig
import torch
import gc

# Load tokenizer
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # or the base model you trained on
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
gc.collect()

# Pick an available device instead of assuming Apple Silicon ("mps"), so the
# notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Half precision on an accelerator; float32 on CPU, where float16 support is
# less complete.
model_dtype = torch.float16 if device != "cpu" else torch.float32

# Load the base model configuration. No quantization is configured here: the
# weights are loaded in floating point (see `torch_dtype` below).
config = AutoConfig.from_pretrained(model_name)
# manually set rope_scaling to supported structure:
# NOTE(review): this replaces whatever rope_scaling the checkpoint's config
# ships with — confirm the adapter was trained with the same override.
config.rope_scaling = {"type": "dynamic", "factor": 2.0}
config.use_cache = True

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    config=config,
    torch_dtype=model_dtype,
)

# Load your LoRA adapter
adapter_path = "./../Training/final_adapter_with_eval_1"  # or wherever your adapter_model.safetensors is
# NOTE(review): PEFT is understood to inject the adapter layers into the wrapped
# model in place, so `base_model` may no longer behave like the un-adapted base
# after this call — confirm, and consider `adapted_model.disable_adapter()` when
# a true base-model comparison is needed.
adapted_model = PeftModel.from_pretrained(base_model, adapter_path)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [None]:
# Need to empty_cache, otherwise the two models' results will bleed into each other.
# Only slice and decode the new tokens.
def generate_summary(input_text, adapted_model, max_new_tokens=150):
    """Generate a sampled summary of ``input_text`` with ``adapted_model``.

    The text is wrapped in a ``Summarize: ... Summary:`` prompt, tokenized with
    the module-level ``tokenizer`` and moved to the model's device. Generation
    uses sampling (temperature 0.7, top-p 0.9).

    Args:
        input_text: Text to summarize.
        adapted_model: Model exposing ``.device`` and ``.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded newly generated tokens only (the prompt is sliced off),
        with special tokens skipped.
    """
    prompt = f"""Summarize:\n{input_text} Summary:\n"""

    # Alternative prompt tried earlier, kept for reference:
    #     prompt = f"""Without commentary, from its original language summarize to English on useful information including sensitive data, below 100 words. If no meaning return <NULL>
    # Text:
    # {input_text}
    # """

    inputs = tokenizer(prompt, return_tensors="pt").to(adapted_model.device)

    # Pass the pad token explicitly so generate() does not emit the
    # "Setting `pad_token_id` to `eos_token_id`" warning on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = adapted_model.generate(
            **inputs,
            do_sample=True,
            temperature=0.7,
            max_new_tokens=max_new_tokens,
            top_p=0.9,
            pad_token_id=pad_token_id,
        )

    # The output sequence starts with the prompt tokens; keep only what follows.
    input_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][input_len:]  # exclude prompt
    summary = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return summary

def generate_base_summary(input_text, base_model, max_new_tokens=150):
    """Generate a sampled summary of ``input_text`` with ``base_model``.

    Uses exactly the same prompt and sampling settings as ``generate_summary``,
    so it delegates to that function instead of duplicating its body.

    Args:
        input_text: Text to summarize.
        base_model: Model exposing ``.device`` and ``.generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded newly generated tokens (prompt excluded, special tokens
        skipped).
    """
    # NOTE(review): if `base_model` is the same object that was wrapped by
    # PeftModel.from_pretrained, it may already contain the injected adapter
    # layers, in which case this is not a true base-model output — confirm.
    return generate_summary(
        input_text,
        adapted_model=base_model,
        max_new_tokens=max_new_tokens,
    )

def chunk_text_with_overlap(txt_file="", chunk_size=500, overlap=100, string=""):
    """Split text into overlapping character chunks.

    The text comes from ``string`` when it is non-empty, otherwise from the
    UTF-8 file at ``txt_file``. When neither is given the text is empty.

    Args:
        txt_file: Path of a UTF-8 text file to read (used when ``string`` is empty).
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.
        string: Text to split; takes precedence over ``txt_file``.

    Returns:
        A list of chunks in document order; empty when the text is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance, or would skip text).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if string:
        text = string
    elif txt_file:
        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()
    else:
        # No source supplied: treat as empty text instead of failing on an
        # unbound local variable.
        text = ""

    step = chunk_size - overlap  # Move forward while keeping overlap
    chunks = []
    start = 0
    while start < len(text):
        end_ptr = min(start + chunk_size, len(text))
        chunks.append(text[start:end_ptr])
        if end_ptr == len(text):
            # The text is fully covered; another window would only repeat
            # the tail of this chunk.
            break
        start += step
    return chunks


def rate(summary):
    """Ask the module-level ``base_model`` whether ``summary`` is useful.

    Builds a short question prompt around ``summary``, samples at most 10 new
    tokens (temperature 0.7, top-p 0.9), prints the decoded reply and returns
    it as text.

    Args:
        summary: The summary text to be rated.

    Returns:
        The decoded reply as a string (prompt tokens excluded, special tokens
        skipped). It is not converted to a number.
    """
    question = f"?Is this useful:{summary}"
    encoded = tokenizer(question, return_tensors="pt").to(base_model.device)
    prompt_length = encoded["input_ids"].shape[1]

    with torch.no_grad():
        generated = base_model.generate(
            **encoded,
            do_sample=True,
            temperature=0.7,
            max_new_tokens=10,
            top_p=0.9
        )

    # Everything before `prompt_length` is the echoed prompt; decode the rest.
    reply_tokens = generated[0][prompt_length:]
    score = tokenizer.decode(reply_tokens, skip_special_tokens=True)
    print("rating:", score)
    # Numeric parsing of the reply is currently disabled:
    # try:
    #     score  = float(score)
    # except :
    #     score = 0
    return score

    

In [None]:
def clean_summary(text):
    """Strip prompt/markup artefacts from a generated summary.

    Removes the literal markers ``NULL``, ``<``, ``>``, ``--`` and
    ``Summary:``, then collapses every run of whitespace (including newlines)
    into a single space and trims the ends.

    Args:
        text: Raw decoded model output.

    Returns:
        The cleaned, single-line summary; an empty string when nothing is left.
    """
    for marker in ('NULL', '<', '>', '--', 'Summary:'):
        text = text.replace(marker, '')
    # Newlines become spaces rather than being deleted, so the last word of
    # one line is not fused with the first word of the next.
    return " ".join(text.split())
class IterativeSummarizer:
    """Summarize chunks, join the summaries and re-chunk until one chunk is left."""

    def __init__(self, model):
        """Store the model handed to ``generate_summary`` for every chunk."""
        self.model = model

    def summarize(self, chunks, max_token):
        """Reduce ``chunks`` to a single piece of text.

        Each round summarizes every chunk with ``generate_summary``, cleans the
        result with ``clean_summary``, joins the summaries with spaces and
        splits the joined text again with ``chunk_text_with_overlap``
        (``chunk_size=900``). The loop ends when one chunk remains.

        Args:
            chunks: List of text chunks to reduce.
            max_token: ``max_new_tokens`` passed to ``generate_summary``.

        Returns:
            The single remaining chunk. A one-element ``chunks`` list is
            returned as-is without being summarized. An empty string is
            returned when ``chunks`` is empty or every summary cleans down to
            nothing.
        """
        # NOTE(review): there is no cap on the number of rounds; if the
        # summaries do not get shorter, this loop does not terminate.
        while True:
            if not chunks:
                # Nothing to summarize; previously this recursed without end.
                return ""
            if len(chunks) == 1:
                return chunks[0]  # Base case: only one chunk remains

            summarized_chunks = []
            for chunk in chunks:
                summary = generate_summary(chunk, adapted_model=self.model, max_new_tokens=max_token)
                summarized_chunks.append(clean_summary(summary))

            # Combine and re-chunk
            combined = " ".join(summarized_chunks).strip()
            print("<----Combined--->", combined)
            if not combined:
                # Every summary was empty after cleaning; stop here instead of
                # asking the chunker to split an empty string.
                return ""

            chunks = chunk_text_with_overlap(string=combined, chunk_size=900)


In [None]:
import json
from pathlib import Path
evaluate_data_folder = Path("./Data/PDFs")
# (file name, path) pairs for every .txt file under the folder, recursively.
pdf_names = [
    (str(txt_file.name), txt_file)
    for txt_file in evaluate_data_folder.rglob("*.txt")
]

# Sort by file name so the processing order is stable between runs.
pdf_names.sort(key=lambda x: x[0])
print(len(pdf_names))
for p in pdf_names:
    print(p)

summarizer = IterativeSummarizer(model=adapted_model)
results = []
# Resume point: processing starts at index page * page_size.
page = 0
page_size = 0
with open("adapter_inference.jsonl", "a", encoding="utf-8") as f:
    for i in range(page * page_size, len(pdf_names)):
        file_name, file_path = pdf_names[i]
        print(f"At file {i}of{len(pdf_names)}")
        chunks = chunk_text_with_overlap(txt_file=file_path)
        print(len(chunks))
        # Named `summary` rather than `sum` so the builtin is not shadowed.
        summary = summarizer.summarize(chunks, max_token=120)
        # list.append takes a single argument: store the pair as one tuple.
        results.append((file_name, summary))
        record = {
            "file_name": file_name,
            "summary": summary,
        }
        f.write(json.dumps(record) + "\n")
        # Flush after every record so an interrupted run keeps finished work.
        f.flush()
#For pdf[0] after 14m on M1pro:
#<----Combined--->    The text discusses effective data incident management in healthcare, emphasizing email notification and adherence to organization-specific procedures. It also highlights understanding the causal chain of data incidents, identifying threat actors and vulnerabilities, and managing existing risks. Organizations are advised to strengthen their data incident management by implementing security controls and sharing best practices. Vulnerabilities are shared with trusted entities. The European Union's funding for humanitarian aid is mentioned, with the Centre for Humanitarian Data undertaking humanitarian activities with key partners.

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


25
('0e21835a42a6df2405496f62647058ff855743c1_normal.txt', PosixPath('Data/PDFs/0e21835a42a6df2405496f62647058ff855743c1_normal.txt'))
('1dcf57a5007b56254583423ba31107d22459bccf_normal.txt', PosixPath('Data/PDFs/1dcf57a5007b56254583423ba31107d22459bccf_normal.txt'))
('2009.08453v2_normal.txt', PosixPath('Data/PDFs/2009.08453v2_normal.txt'))
('2010.05981v2_normal.txt', PosixPath('Data/PDFs/2010.05981v2_normal.txt'))
('2010.11929v2_normal.txt', PosixPath('Data/PDFs/2010.11929v2_normal.txt'))
('2544CYX3TC3T5QB2NTVXD3IUFM654GXK_normal.txt', PosixPath('Data/PDFs/2544CYX3TC3T5QB2NTVXD3IUFM654GXK_normal.txt'))
('281928eff64137efdd144a833c81ad0ee45284c1_normal.txt', PosixPath('Data/PDFs/281928eff64137efdd144a833c81ad0ee45284c1_normal.txt'))
('2FDPTMT2NZDE6RIJSZZXGBMD7LYL7YHV_ocr.txt', PosixPath('Data/PDFs/2FDPTMT2NZDE6RIJSZZXGBMD7LYL7YHV_ocr.txt'))
('2G54QACZZK5MIIKK25USTLNPN66FST63_normal.txt', PosixPath('Data/PDFs/2G54QACZZK5MIIKK25USTLNPN66FST63_normal.txt'))
('2YS3ALM6OTD5ENWN4Z5LOBWG73575

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
# GPT-2-specific names are used so this experiment does not rebind the global
# `tokenizer` that generate_summary / generate_base_summary / rate read.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2")

text1 = "The cat sat on the"
text2 = "The cat sat on the mat"

inputs = gpt2_tokenizer(text2, return_tensors="pt")
with torch.no_grad():
    # Supplying the input ids as labels makes the forward pass return a loss.
    outputs = gpt2_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()

print(f"Cross-entropy loss: {loss}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


!!!!<----------------------->!!!!
!!!!<----------------------->!!!!


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


0: On August 11, 2017, 20 units of Jack's New England Glam Chowder were shipped. The unit price was $9.65. The shipped date was 2017-08-11.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


1: On 2017-08-28, two products were shipped. The first, Gudbrandsdalsost, was shipped in a quantity of 20 at a unit price of 36.0, totaling 720.0. The second, Outback Lager, was shipped in a quantity of 15 at a unit price of 15.0, totaling 225.0. The total price for all products shipped was 945.0. Shipping took place on 2017-08-28.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


2: Order ID 10481 was placed by Ricardo Adocicados (Customer ID: RICAR) on 2017-03-20 and shipped by Laura Callahan via United Package (Shipper ID: 2) on 2017-03-25. The order included 26 units of Maxiak at a unit price of 160. The order was shipped to Av. Copacabana, 257, Cidade de Janeiro, Brazil, 02389-890.
On August 11, 2017, 20 units of Jack's New England Glam Chowder were shipped at a unit price of $9.65. On August 28, 2017, two products were shipped: 20 units of Gudbrandsdalsost at a unit price of 36.0, totaling 720.0, and 15 units of Outback Lager at a unit price of 15.0, totaling 225.0. The total price for all products shipped was 945.0. Order ID 10481 was placed by Ricardo Adocicados (Customer ID: RICAR) on March 20, 2017, and shipped by Laura Callahan via United Package (Shipper ID: 2) on March 25, 2017. The order was shipped to Av. Copacabana, 257, Cidade de Janeiro, Brazil, 02389-890.
