## Library Imports

In [1]:
from OCR import OCR_Model
from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from PIL import Image
from groq import Groq
import numpy as np
import torch
import pickle
import json
import os

# Populate the process environment from a local .env file, when one is present.
load_dotenv()

# The Groq key comes from the environment so it never has to live in the notebook.
# `client` is the shared handle the extraction function uses for its LLM calls.
api_key = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=api_key)

## LLM setup: extracting structured fields from OCR text

In [36]:
def _extract_json_text(raw_reply):
    """Return the outermost ``{...}`` span of an LLM reply, or the stripped reply.

    Small chat models frequently wrap the requested JSON in Markdown code
    fences (the prompt itself shows a fenced example) or add a sentence before
    or after it. Slicing from the first ``{`` to the last ``}`` removes both
    kinds of wrapper without needing to know which one was used.

    Args:
        raw_reply: Text returned by the model; ``None`` is treated as empty.

    Returns:
        The brace-delimited substring when one exists, otherwise the reply
        with surrounding whitespace removed (possibly an empty string).
    """
    text = (raw_reply or "").strip()
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def extract_fields_from_ocr(all_text_lines, model="llama3-8b-8192"):
    """Ask the Groq-hosted LLM to pull the land-record fields out of OCR text.

    The OCR lines are joined with newlines, embedded at the end of a fixed
    extraction prompt, and sent as a single user message with temperature 0.

    Args:
        all_text_lines: Iterable of strings, one per recognised OCR text line.
        model: Model identifier passed to the chat-completions endpoint.
            Defaults to the model this notebook was developed with; pass a
            different id to experiment or if that model is unavailable.

    Returns:
        str: The model's reply reduced to its JSON object text (code fences
        and surrounding chatter removed), ready for ``json.loads``. If the
        reply contains no ``{...}`` span, the stripped reply is returned
        unchanged, so the caller's JSON parsing will report the problem.

    Note:
        Relies on the module-level ``client`` created in the setup cell.
    """
    joined_text = "\n".join(all_text_lines)

    prompt = f"""
        You are a structured data extractor for Marathi/Hindi land record OCR text.

        Given the OCR text of a legal land document, extract the following fields **precisely**:

        - "अहवाल दिनांक" → from a line like "अहवाल दिनांक : DD/MM/YYYY"
        - "गाव" → word after गाव: (may be in brackets or followed by a number)
        - "तालुका" → word after तालुका :
        - "जिल्हा" → word after जिल्हा :
        - "ULPIN" or "PU-ID" → numerical code following "ULPIN" or "PU-ID"
        - "Owner Name" → all person names in Marathi/Hindi script that are **not** struck-through (i.e., ignore names with `<del>...</del>`), and not phrases like "हे आर.चौ.मी"
        - "प्रलंबित फ़ेरफ़ार" -> नाही.

        ### Rules for Owner Names:
        - Must be a **human name** (e.g., दिपक वावुलाल गोयकर)
        - Ignore any name with `<del>`, or anything that does not resemble a person
        - Return multiple names as a **list**

        ### Output Format:
        Respond with **only** the following JSON object (no extra explanation):

        ```json
        {{
        "अहवाल दिनांक": "DD/MM/YYYY",
        "गाव": "Village Name",
        "तालुका": "Taluka Name",
        "जिल्हा": "District Name",
        "ULPIN/PU-ID": "Number",
        "Owner Name": ["Name 1", "Name 2"],
        "प्रलंबित फ़ेरफ़ार" : "नाही"
        }}
        ```
        Only provide the JSON object without any additional text or explanation.
        \"\"\"{joined_text}\"\"\"
    """

    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You extract structured data from OCR of Marathi legal documents."},
            {"role": "user", "content": prompt}
        ],
        # Temperature 0 keeps the extraction as repeatable as the service allows.
        temperature=0
    )

    return _extract_json_text(chat_completion.choices[0].message.content)

## Bulk processing and storing the results

In [37]:
# Folder holding the scanned land records, and root folder for generated artifacts.
doc_dir = "MH_LandRecords_PDFs"
save_dir = "outputs"

# File types this loop can turn into an image (matched case-insensitively).
supported_exts = (".pdf", ".png", ".jpg")

# Create both artifact folders up front so the first save cannot fail on a missing path.
os.makedirs(os.path.join(save_dir, "images"), exist_ok=True)
os.makedirs(os.path.join(save_dir, "jsons"), exist_ok=True)

# Parsed field dicts for documents processed in THIS run; documents whose
# pickle already exists are skipped and therefore not added here.
outputs = []

# List the folder once: the listing is reused for both ordering and the progress total.
doc_names = sorted(os.listdir(doc_dir))
total_docs = len(doc_names)

for idx, doc_name in enumerate(doc_names):
    # Ignore anything that is not a supported document (hidden files, sub-folders, ...).
    # Without this guard such entries would reach the OCR step with `image`
    # either undefined or still holding the previous document's page.
    if not doc_name.lower().endswith(supported_exts):
        continue
    print(f"Processing {idx+1}/{total_docs} , {doc_name}")

    # The per-document pickle doubles as the "already done" marker, which makes
    # the cell safe to re-run after an interruption.
    output_path = os.path.join(save_dir, "jsons", f"{doc_name}.pkl")
    if os.path.exists(output_path):
        print(f"\tSkipping {doc_name}, already processed.")
        continue

    doc_path = os.path.join(doc_dir, doc_name)

    if doc_name.lower().endswith(".pdf"):
        # Only the first page is used, so only the first page is rendered.
        image = convert_from_path(doc_path, dpi=300, first_page=1, last_page=1)[0]
    else:
        # Close the file handle as soon as the RGB copy has been made.
        with Image.open(doc_path) as source_image:
            image = source_image.convert('RGB')

    ocr = OCR_Model()
    print("\tRunning OCR...")
    predictions = ocr.predict(images = [image])

    # Save the page with the OCR bounding boxes drawn on it.
    ocr.plot_prediction(image=image, predictions= predictions, show=False,save_path = os.path.join(save_dir, "images", f"{doc_name}.png"))

    # Drop the model and release cached GPU memory before the next document.
    del ocr
    torch.cuda.empty_cache()

    All_text = [line.text for line in predictions[0].text_lines]

    print("\tFetching relevant fields from OCR...")
    info = extract_fields_from_ocr(All_text)

    # A malformed LLM reply should cost one document, not the whole batch.
    # No pickle is written, so the document is retried on the next run.
    try:
        parsed = json.loads(info)
    except (json.JSONDecodeError, TypeError) as err:
        print(f"\tCould not parse LLM response for {doc_name}: {err}")
        continue
    outputs.append(parsed)

    # Persist the parsed fields as a pickle file.
    with open(output_path, 'wb') as f:
        pickle.dump(parsed, f)
    

Processing 1/36 , 1 39.pdf
	Skipping 1 39.pdf, already processed.
Processing 2/36 , 2 30.pdf
	Skipping 2 30.pdf, already processed.
Processing 3/36 , 7-12 1.pdf
	Skipping 7-12 1.pdf, already processed.
Processing 4/36 , 7-12.pdf
	Skipping 7-12.pdf, already processed.
Processing 5/36 , 7_12 7 (1).pdf
	Skipping 7_12 7 (1).pdf, already processed.
Processing 6/36 , 7_12_365 2 (1).pdf
	Skipping 7_12_365 2 (1).pdf, already processed.
Processing 7/36 , Satbara-60-1.pdf
	Skipping Satbara-60-1.pdf, already processed.
Processing 8/36 , Satbara1 4.pdf
	Skipping Satbara1 4.pdf, already processed.
Processing 9/36 , Satbara1.pdf
	Skipping Satbara1.pdf, already processed.
Processing 10/36 , Satbara2.pdf
	Skipping Satbara2.pdf, already processed.
Processing 11/36 , Satbara3.pdf
	Skipping Satbara3.pdf, already processed.
Processing 12/36 , Satbara4.pdf
	Skipping Satbara4.pdf, already processed.
Processing 13/36 , download 17.pdf
	Skipping download 17.pdf, already processed.
Processing 14/36 , download 

Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Recognizing Text: 100%|██████████| 264/264 [04:29<00:00,  1.02s/it]


Image saved to outputs/images/download 22.pdf.png
	Fetching relevant fields from OCR...
Processing 19/36 , download 23 (1).pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Recognizing Text: 100%|██████████| 170/170 [04:24<00:00,  1.55s/it]


Image saved to outputs/images/download 23 (1).pdf.png
	Fetching relevant fields from OCR...
Processing 20/36 , download 23.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Recognizing Text: 100%|██████████| 170/170 [04:26<00:00,  1.56s/it]


Image saved to outputs/images/download 23.pdf.png
	Fetching relevant fields from OCR...
Processing 21/36 , download 24.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Recognizing Text: 100%|██████████| 132/132 [02:51<00:00,  1.30s/it]


Image saved to outputs/images/download 24.pdf.png
	Fetching relevant fields from OCR...
Processing 22/36 , download 25.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
Recognizing Text: 100%|██████████| 102/102 [02:03<00:00,  1.21s/it]


Image saved to outputs/images/download 25.pdf.png
	Fetching relevant fields from OCR...
Processing 23/36 , download 26.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Recognizing Text: 100%|██████████| 137/137 [02:53<00:00,  1.27s/it]


Image saved to outputs/images/download 26.pdf.png
	Fetching relevant fields from OCR...
Processing 24/36 , download 27.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Recognizing Text: 100%|██████████| 357/357 [06:35<00:00,  1.11s/it]


Image saved to outputs/images/download 27.pdf.png
	Fetching relevant fields from OCR...
Processing 25/36 , download 28.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Recognizing Text: 100%|██████████| 357/357 [06:35<00:00,  1.11s/it]


Image saved to outputs/images/download 28.pdf.png
	Fetching relevant fields from OCR...
Processing 26/36 , download 29.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Recognizing Text: 100%|██████████| 232/232 [04:50<00:00,  1.25s/it]


Image saved to outputs/images/download 29.pdf.png
	Fetching relevant fields from OCR...
Processing 27/36 , satbara-101-1 1.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Recognizing Text: 100%|██████████| 140/140 [03:13<00:00,  1.38s/it]


Image saved to outputs/images/satbara-101-1 1.pdf.png
	Fetching relevant fields from OCR...
Processing 28/36 , satbara-864-2 1.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Recognizing Text: 100%|██████████| 191/191 [03:19<00:00,  1.04s/it]


Image saved to outputs/images/satbara-864-2 1.pdf.png
	Fetching relevant fields from OCR...
Processing 29/36 , satbara.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Recognizing Text: 100%|██████████| 130/130 [02:36<00:00,  1.21s/it]


Image saved to outputs/images/satbara.pdf.png
	Fetching relevant fields from OCR...
Processing 30/36 , satbara10.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Recognizing Text: 100%|██████████| 211/211 [05:05<00:00,  1.45s/it]


Image saved to outputs/images/satbara10.pdf.png
	Fetching relevant fields from OCR...
Processing 31/36 , satbara11.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
Recognizing Text: 100%|██████████| 159/159 [02:59<00:00,  1.13s/it]


Image saved to outputs/images/satbara11.pdf.png
	Fetching relevant fields from OCR...
Processing 32/36 , satbara12.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
Recognizing Text: 100%|██████████| 174/174 [02:59<00:00,  1.03s/it]


Image saved to outputs/images/satbara12.pdf.png
	Fetching relevant fields from OCR...
Processing 33/36 , satbara13.pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Recognizing Text: 100%|██████████| 128/128 [02:12<00:00,  1.03s/it]


Image saved to outputs/images/satbara13.pdf.png
	Fetching relevant fields from OCR...
Processing 34/36 , satbara5 (1).pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Recognizing Text: 100%|██████████| 121/121 [02:27<00:00,  1.22s/it]


Image saved to outputs/images/satbara5 (1).pdf.png
	Fetching relevant fields from OCR...
Processing 35/36 , satbara5 (2).pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Recognizing Text: 100%|██████████| 119/119 [02:10<00:00,  1.10s/it]


Image saved to outputs/images/satbara5 (2).pdf.png
	Fetching relevant fields from OCR...
Processing 36/36 , satbara5 (3).pdf
	Running OCR...


Detecting bboxes: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
Recognizing Text: 100%|██████████| 110/110 [02:38<00:00,  1.44s/it]


Image saved to outputs/images/satbara5 (3).pdf.png
	Fetching relevant fields from OCR...
