In [1]:
!pip install paddleocr
!pip install pypdf
!pip install paddlepaddle
!pip install bitsandbytes==0.44.2
!pip install accelerate
!apt-get install -y poppler-utils
!pip install pdf2image

Collecting paddleocr
  Downloading paddleocr-2.9.0-py3-none-any.whl.metadata (8.4 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading paddleocr-2.9.0-py3-none-any.whl (544 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5

In [2]:
!pip install bitsandbytes
!pip install --upgrade accelerate
!pip install --upgrade transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting accelerate
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-1.0.1
Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.m

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, BitsAndBytesConfig
from paddleocr import PaddleOCR
from pypdf import PdfReader
import json
import bitsandbytes
from pdf2image import convert_from_path
import numpy as np
from tqdm.auto import tqdm

In [4]:
# Initialize PaddleOCR for English text extraction
paddleocr = PaddleOCR(lang="en", ocr_version="PP-OCRv4", show_log=False, use_gpu=True)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:00<00:00, 6178.61it/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:00<00:00, 11637.04it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:02<00:00, 881.09it/s] 


In [5]:
# Define the scan function for PaddleOCR (similar to what you had for receipt images)
def paddle_scan(paddleocr, img_path_or_nparray):
    result = paddleocr.ocr(img_path_or_nparray, cls=True)
    result = result[0]
    boxes = [line[0] for line in result]       # bounding box
    txts = [line[1][0] for line in result]     # raw text
    scores = [line[1][1] for line in result]   # scores
    return txts, result

In [6]:
# Function to extract text from PDF using PaddleOCR
def extract_text_from_pdf(pdf_path):
    ocr_results = []

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    for page_image in images:
        # Convert the PIL image to a NumPy array (required by PaddleOCR)
        page_image_np = np.array(page_image.convert('RGB'))

        # Use PaddleOCR to scan the image
        ocr_texts, ocr_boxes = paddle_scan(paddleocr, page_image_np)
        ocr_results.append(ocr_texts)

    return ocr_results

In [7]:
# Initialize the model using 4-bit quantization and Mistral fine-tuned model
# Configure 4-bit quantization with bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit quantization
    llm_int8_enable_fp32_cpu_offload=True,  # Enable FP32 offload for better precision on CPU
)


In [8]:
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": 0,
    "transformer.h": 0,
    "transformer.ln_f": 0,
    "model.embed_tokens": 0,
    "model.layers": 0,
    "model.norm": 0
}

In [9]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [11]:
import re
def process_invoice_with_llm(ocr_results):
    prompt =f"""### Instruction:
    You are POS receipt data expert, parse, detect, recognize and convert following receipt OCR image result into structure receipt data object.
    Strictly Ensure that include only these fields:-

    - 'sgst_amounts'
    - 'cgst_amounts'
    - 'igst_amount' (if igst explicitly mentioned in the ocr results otherwise null)
    - 'sgst_rate'
    - 'cgst_rate'
    - 'igst_rate'(if igst explicitly mentioned in the ocr results otherwise none)
    - 'tax_amount' (calculated as cgst_amount + sgst_amount + igst_amount)
    - 'tax_rate'
    - 'final_amount'
    - 'taxable_value'(calculated as final_amount - (cgst_amount + sgst_amount + igst_amount))
    - 'invoice_number'
    - 'invoice_date'
    - 'place_of_supply'
    - 'place_of_origin'(First place name of indian city, state in the ocr results)
    - 'gstin_supplier'
    - 'gstin_recipient'
    Don't make up value not in the Input and do not put any field in json other than above mentioned. Output must be a well-formed JSON object.```json
    ### Input:
    {ocr_results}

    ### Output:
    """


    # Tokenize the prompt and run the model to generate the output
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    # Move input tensors to the same device as the model
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=512)

    # Decode the output into text
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(output_text)
# Attempt to convert the output to JSON
    try:
        # Find the first '{' and first '}' and extract the JSON string between them
        json_text = output_text[output_text.index("{"):output_text.index("}")+1]
        json_clean = json.loads(json_text)
        # Filter out any fields that are not in the allowed set
        filtered_data = {key: value for key, value in json_clean.items() if key in ALLOWED_FIELDS}

        return filtered_data
    except Exception as e:
        print(f"Error converting to JSON: {e}")
        return {}



    return result_json

In [12]:
# Main function to extract data from the PDF
def process_pdf_invoice(pdf_path):
    # Step 1: Extract OCR data from the PDF
    ocr_results = extract_text_from_pdf(pdf_path)
    print(ocr_results)
    # Step 2: Process OCR results with the LLM to extract structured invoice data
    structured_data = process_invoice_with_llm(ocr_results)

    return structured_data

In [None]:
import csv
import os
ALLOWED_FIELDS = set([
     'taxable_value', 'sgst_amounts', 'cgst_amounts', 'igst_amount',
    'sgst_rate', 'cgst_rate', 'igst_rate', 'tax_amount', 'tax_rate', 'final_amount','invoice_number', 'invoice_date',
    'place_of_supply', 'place_of_origin', 'gstin_supplier', 'gstin_recipient'
])
# Recursive function to process all PDFs in the directory structure
def process_pdfs_in_directory(directory, csv_file):
    fieldnames = ['taxable_value', 'sgst_amounts', 'cgst_amounts', 'igst_amount', 'sgst_rate', 'cgst_rate', 'igst_rate', 'tax_amount', 'tax_rate', 'final_amount', 'invoice_number', 'invoice_date', 'place_of_supply', 'place_of_origin', 'gstin_supplier', 'gstin_recipient']

    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # Walk through the directory structure
        for root, _, files in os.walk(directory):
            for file in tqdm(files):
                if file.endswith(".pdf"):
                    pdf_path = os.path.join(root, file)
                    print(f"Processing {pdf_path}...")

                    # Step 1: Extract OCR data from each page of the PDF
                    ocr_results_by_page = extract_text_from_pdf(pdf_path)

                    # Step 2: Process each page and accumulate the data
                    for page_result in tqdm(enumerate(ocr_results_by_page)):
                        structured_data = process_invoice_with_llm(page_result)

                        # Write the structured data to the CSV file
                        if structured_data:
                            writer.writerow(structured_data)

# Main execution
directory = '/content/sample_data/Untitled Folder'  # Replace with your actual directory path
csv_file = '/content/sample_data/2_invoices.csv'  # Save the CSV to Kaggle's working directory

process_pdfs_in_directory(directory, csv_file)
print(f"Extraction completed. Data saved to {csv_file}.")

  0%|          | 0/13 [00:00<?, ?it/s]

Processing /content/sample_data/Untitled Folder/INV-149_Karishma Bande.pdf...


0it [00:00, ?it/s]

### Instruction:
    You are POS receipt data expert, parse, detect, recognize and convert following receipt OCR image result into structure receipt data object.
    Strictly Ensure that include only these fields:-
    - 'invoice_number'
    - 'invoice_date'
    -'sgst_amounts'
    - 'cgst_amounts'
    - 'igst_amount' (if igst explicitly mentioned in the ocr results otherwise null)
    -'sgst_rate'
    - 'cgst_rate'
    - 'igst_rate'(if igst explicitly mentioned in the ocr results otherwise none)
    - 'tax_amount' (calculated as cgst_amount + sgst_amount + igst_amount)
    - 'tax_rate'
    - 'final_amount'
    - 'taxable_value'(calculated as final_amount - (cgst_amount + sgst_amount + igst_amount))
    - 'place_of_supply'
    - 'place_of_origin'(First place name of indian city, state in the ocr results)
    - 'gstin_supplier'
    - 'gstin_recipient'
    Don't make up value not in the Input and do not put any field in json other than above mentioned. Output must be a well-formed JSON o

0it [00:00, ?it/s]