In [None]:
import fitz  
import json
import spacy
from gliner_spacy.pipeline import GlinerSpacy
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Load T5 model for text structuring
model_name = "t5-base"  # You can experiment with different T5 models
t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
t5_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load zero-shot classification pipeline with BART
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

nlp=spacy.load("en_core_web_sm")

# Define entity labels for zero-shot classification
LABELS = ["INVOICE_NUMBER", "DATE", "TOTAL_AMOUNT", "COMPANY_NAME", "ADDRESS"]

def extract_text_from_pdf(pdf_path):
    """Extracts raw text from a PDF file."""
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

def structure_text_with_t5(raw_text):
    """Structures raw text into sentences using T5 model."""
    input_text = f"structure: {raw_text}"
    input_ids = t5_tokenizer.encode(input_text, return_tensors="pt")
    outputs = t5_model.generate(input_ids)
    structured_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return structured_text

def extract_entities_zero_shot(text):
    """Extracts entities from text using zero-shot classification."""
    sentences = text.split(". ")  # Assuming sentences end with '. '
    entities = []
    for sentence in sentences:
        results = classifier(sentence, LABELS)
        top_label = results["labels"][0]
        top_score = results["scores"][0]
        if top_score > 0.7:  # Adjust threshold as needed
            start_char = sentence.find(results["sequence"])
            end_char = start_char + len(results["sequence"])
            entities.append((results["sequence"], top_label, start_char, end_char))
    return entities

def extract_entities_ner(text, ner_model):
    """Extracts entities from text using a trained NER model (optional)."""
    doc = ner_model(text)
    entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]
    return entities

def process_pdf(pdf_path, use_ner=False):
    """Processes a PDF file, extracts entities, and returns a structured dictionary."""
    raw_text = extract_text_from_pdf(pdf_path)
    structured_text = structure_text_with_t5(raw_text)

    if use_ner and nlp:
        entities = extract_entities_ner(structured_text, nlp)
    else:
        entities = extract_entities_zero_shot(structured_text)

    # Organize extracted entities into a dictionary
    extracted_data = {}
    for entity_text, entity_type, start, end in entities:
        if entity_type not in extracted_data:
            extracted_data[entity_type] = []
        extracted_data[entity_type].append(entity_text)

    return extracted_data



In [1]:
from transformers import pipeline

nlp = pipeline(
    "document-question-answering",
    model="impira/layoutlm-document-qa",
)

nlp(
    "https://templates.invoicehome.com/invoice-template-us-neat-750px.png",
    "What is the invoice number?"
)
# {'score': 0.9943977, 'answer': 'us-001', 'start': 15, 'end': 15}

nlp(
    "https://miro.medium.com/max/787/1*iECQRIiOGTmEFLdWkVIH2g.jpeg",
    "What is the purchase amount?"
)
# {'score': 0.9912159, 'answer': '$1,000,000,000', 'start': 97, 'end': 97}

nlp(
    "https://www.accountingcoach.com/wp-content/uploads/2013/10/income-statement-example@2x.png",
    "What are the 2020 net sales?"
)
# {'score': 0.59147286, 'answer': '$ 3,750', 'start': 19, 'end': 20}

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/511M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



[{'score': 0.9726561307907104, 'answer': '$ 3,980', 'start': 11, 'end': 12}]