# Requirements

In [1]:
# Zephyr-7B Installation
%pip install git+https://github.com/huggingface/transformers.git
%pip install accelerate

# PaddleOCR Installation
!git clone https://github.com/PaddlePaddle/PaddleOCR.git
%pip install paddlepaddle-gpu
%pip install "paddleocr>=2.0.1"

# Fix locale settings for PaddleOCR
import locale
def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

# Install PyMuPDF (specific version)
!pip uninstall -y fitz PyMuPDF
!pip install PyMuPDF==1.20.0

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-8w2lp28k
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-8w2lp28k
  Resolved https://github.com/huggingface/transformers.git to commit 816f4424964c1a1631e303b663fc3d68f731e923
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.46.0.dev0-py3-none-any.whl size=9991549 sha256=470cc490c3bf5ad4a71a5c47020f498947a7b68b845071c9201862dd15997497
  Stored in directory: /tmp/pip-ephem-wheel-cache-e97ylpvn/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16
Successfully built tr

# Libraries

In [2]:
# Import libraries required for OCR and PDF handling
import fitz  # PyMuPDF for working with PDFs
from paddleocr import PaddleOCR  # OCR library from Paddle
import numpy as np  # NumPy for numerical operations

# Import libraries required for handling LLM
import torch  # PyTorch for handling deep learning models
from transformers import pipeline  # Hugging Face's Transformers for Zephyr-7B
import json # Convert extracted data into json format

# Model identifier that the main cell passes to extract_key_info_using_llm as `model_name`
CHECKPOINT = "HuggingFaceH4/zephyr-7b-alpha"

In [3]:
# Build the shared PaddleOCR engine once (English model, since the documents
# are in English); extract_text_from_pdf reuses this module-level instance.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    page_num=1,
    use_gpu=0,
)

# ---- 1. Extract Information Using PaddleOCR with Average Confidence ----
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a given PDF using PaddleOCR and calculates the average OCR confidence.

    Every page present in the OCR output is processed. Pages on which nothing
    was detected (reported as ``None`` or an empty list) are skipped instead of
    raising, so a document without recognizable text yields ``""`` and ``0.0``.

    Args:
    - pdf_path (str): Path to the input PDF file.

    Returns:
    - result (list): Raw OCR output, returned unchanged.
    - ocr_string (str): Extracted text from the PDF; every recognized line is
      followed by a single space.
    - avg_confidence (float): Average OCR confidence score over all recognized
      lines, or 0.0 when there are none.
    """

    # Perform OCR with the module-level `ocr` engine
    result = ocr.ocr(pdf_path, cls=True)

    texts = []
    confidences = []

    # `result` holds one entry per processed page
    for page in result or []:
        # A page without detections is None/empty; there is nothing to collect
        if not page:
            continue
        for line in page:
            # line[1] is the (text, confidence) pair of one recognized line
            recognized = line[1]
            texts.append(recognized[0])
            confidences.append(recognized[1])

    # Same layout as before: each line's text followed by one space
    ocr_string = "".join(text + " " for text in texts)

    # Calculate the average confidence, guarding against division by zero
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

    return result, ocr_string, avg_confidence

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:17<00:00, 224.94it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:17<00:00, 577.63it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:15<00:00, 137.04it/s]

[2024/10/20 06:47:55] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=0, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=1, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_char_




In [4]:
# ---- 2. Extract Key Information Using Zephyr-7B (LLM) ----
def extract_key_info_using_llm(text, model_name):
    """
    Extracts key information from the OCR text using Zephyr-7B (via Hugging Face Transformers).

    Args:
    - text (str): OCR-processed text.
    - model_name (str): Model identifier passed as `model` to the
      text-generation pipeline.

    Returns:
    - extracted_data (str): Content of the last message of the generated
      conversation, i.e. the model's reply. The prompt asks for JSON, but the
      reply is returned as-is without being parsed or validated.
    """
    # Load the model (a new pipeline is built on every call)
    generator = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto")

    messages = [
        {
            "role": "system",
            # The input is invoice OCR text, so the system prompt must say so
            "content": "You are a JSON converter which receives raw invoice OCR information as a string and returns a structured JSON output by organizing the information in the string."
        },
        {
            "role": "user", 
            "content": f"""
            Extract the following details and save them in a JSON structure:
            
            invoice_data = {{
                "GSTIN": "",
                "Invoice Number": "",
                "Invoice Date": "",
                "Due Date": "",
                "Total Amount": "",
                "Taxable Amount": "",
                "Place of Supply": "",
                "Customer Details": "",
                "Phone": "",
                "Item Details": ""
            }}
            
            Extracted information:
            GSTIN, Invoice number, Invoice Date, Due Date, Total Amount, Taxable Amount, Place of Supply, Customer Details, Phone number, and Item Details from the provided text: {text}
            """
        }
    ]
    # Sampling is enabled, so repeated calls may produce different replies
    outputs = generator(messages, max_new_tokens=1000, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    # generated_text is the message list; its last entry carries the reply
    return outputs[0]['generated_text'][-1]['content']

In [5]:
# ---- 3. Save Data as JSON ----
def save_data_as_json(data, output_path):
    """
    Writes the extracted data to disk as JSON indented by four spaces.

    Args:
    - data: Extracted data; it is handed to `json.dump` unchanged.
    - output_path (str): Destination of the JSON file. The file is opened in
      write mode, so existing content is replaced.
    """
    with open(output_path, mode='w') as sink:
        json.dump(data, fp=sink, indent=4)

In [6]:
import time  # Import the time module

if __name__ == "__main__":
    pdf_path = "/kaggle/input/zolvit-task/Jan to Mar/INV-100_Agrani Kandele.pdf"  # Path to your PDF file
    
    # Start the timer (perf_counter is monotonic, so system clock adjustments
    # during the run cannot distort the measured duration)
    start_time = time.perf_counter()
    
    # Step 1: Extract text using PaddleOCR
    result, ocr_text, avg_confidence = extract_text_from_pdf(pdf_path)
    print("OCR Text Extracted:\n", ocr_text)
    print("\n")
    # This is the mean recognition confidence reported by the OCR engine,
    # not an accuracy measured against ground truth
    print("Average OCR confidence (%): ", round(avg_confidence * 100, 2))
    print("\n")
    
    # Step 2: Extract key information using Zephyr-7B
    key_info = extract_key_info_using_llm(ocr_text, CHECKPOINT)
    print("Key Information Extracted:\n", key_info)
    print("\n")
    
    # Step 3: Save the data as a JSON file
    # The LLM reply is a string. Parse it first so the file contains a real
    # JSON object rather than one escaped string; if the reply is not valid
    # JSON, keep the raw reply so nothing is lost.
    try:
        extracted_data = json.loads(key_info)
    except (TypeError, ValueError):
        print("Warning: LLM output is not valid JSON; saving the raw text instead.")
        extracted_data = key_info
    save_data_as_json(extracted_data, "extracted_data.json")
    print("Data has been saved as extracted_data.json")
    print("\n")
    
    # End the timer
    end_time = time.perf_counter()
    
    # Calculate the total time taken
    time_taken = end_time - start_time
    print(f"Time taken for the complete process: {time_taken:.2f} seconds")


[2024/10/20 06:50:03] ppocr DEBUG: dt_boxes num : 91, elapsed : 0.38840794563293457
[2024/10/20 06:50:03] ppocr DEBUG: cls num  : 91, elapsed : 0.2009563446044922
[2024/10/20 06:50:10] ppocr DEBUG: rec_res num  : 91, elapsed : 6.590212106704712
OCR Text Extracted:
 TAX INVOICE ORIGINAL FOR RECIPIENT UNCUE DERMACARE PRIVATE LIMITED GSTIN23AADCU2395N1ZY C/o KARUNA GUPTA KURELE, 1st Floor S.P Bungalow Ke Pichhe, Shoagpur Shahdol, Shahdol Shahdol, MADHYA PRADESH, 484001 Mobile +91 8585960963 Email ruhi@dermaq.in Invoice #: INV-100 Invoice Date:24 Jan 2024 Due Date:24 Jan 2024 Customer Details: Shipping Address: Agrani Kandele vadandana beauty parlour Ph: 8120482988 Murawara Katni, MADHYA PRADESH, 483501 Place of Supply: 23-MADHYA PRADESH # Item  Rate / Item Qty Taxable Value Tax Amount Amount 531.99 1 Acne UV Gel - 30 SPF 1 UNT 531.99 95.76 (18%) 627.75 709.32 (-25%) 392.07 2 Clindac-A mist spray 1 BTL 392.07 47.05 (12%) 439.12 445.54 (-12%) 184.20 3 AKNAYBAR soap 1 BOX 184.20 33.16 (18%) 

config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Key Information Extracted:
 {
  "GSTIN": "23AADCU2395N1ZY",
  "Invoice Number": "INV-100",
  "Invoice Date": "24 Jan 2024",
  "Due Date": "24 Jan 2024",
  "Total Amount": "1,284.00",
  "Taxable Amount": "1,108.27",
  "Place of Supply": "23-MADHYA PRADESH",
  "Customer Details": {
    "Shipping Address": "Agrani Kandele vadandana beauty parlour\nPh: 8120482988\nMurawara Katni,\nMADHYA PRADESH,\n483501"
  },
  "Phone": "+91 8585960963",
  "Item Details": [
    {
      "Item": "Acne UV Gel - 30 SPF 1 UNT",
      "Rate": "531.99",
      "Qty": "1",
      "Taxable Value": "531.99",
      "Tax Amount": "95.76",
      "Amount": "627.75",
      "Discount": "-25%",
      "Total": "392.07"
    },
    {
      "Item": "Clindac-A mist spray 1 BTL",
      "Rate": "392.07",
      "Qty": "1",
      "Taxable Value": "392.07",
      "Tax Amount": "47.05",
      "Amount": "439.12",
      "Discount": "-12%",
      "Total": "184.20"
    },
    {
      "Item": "AKNAYBAR soap 1 BOX",
      "Rate": "184.20",


____