In [None]:
# !pip install -qqq -U git+https://github.com/huggingface/transformers.git
# !pip install -qqq -U git+https://github.com/huggingface/peft.git
# !pip install -qqq -U git+https://github.com/huggingface/accelerate.git
# !pip install -qqq bitsandbytes
# !pip install scipy
# !pip install google-cloud-vision
# !pip install datasets
# !pip install evaluate

In [None]:
import io
import os
from google.cloud import vision_v1
from google.cloud.vision_v1 import types

# path for the JSON key file
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'your vision key.json'

import json
import re
import time
from pprint import pprint
from datasets import load_dataset
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datetime import datetime
from datasets import load_dataset
import numpy as np
import nltk
import evaluate
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Hugging Face Hub id of the base checkpoint shared by the model and both tokenizers.
base_model_id = "HuggingFaceH4/zephyr-7b-alpha"

# Quantization settings handed to from_pretrained below: 4-bit weights in the
# "nf4" format, double quantization enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Quantized base model; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# `tokenizer` is the one model_output() uses to encode the prompt.
# NOTE(review): `padding=True` is normally a call-time tokenizer argument --
# confirm it has any effect when passed to from_pretrained.
# NOTE(review): `add_eos_token=True` presumably appends EOS to every encoded
# prompt, including at inference time -- confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_eos_token=True,
    padding=True,
    model_max_length=4096,
)
# No separate pad token is configured, so the EOS token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

# `eval_tokenizer` is the one model_output() uses to decode generated ids.
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    add_bos_token=True,
)

# Wrap the base model with the fine-tuned PEFT weights stored at the given
# path (placeholder string: replace with the real checkpoint location).
ft_model = PeftModel.from_pretrained(base_model, "your generated Fine-Tuned Checkpoint")

In [121]:
def model_output(ocr_text, max_new_tokens=600):
    """Run the fine-tuned model on one OCR transcript and return the decoded text.

    The extraction instructions below are combined with ``ocr_text``, encoded
    with the module-level ``tokenizer``, passed to ``ft_model.generate`` and the
    first returned sequence is decoded with ``eval_tokenizer``.

    Args:
        ocr_text: OCR transcript of a marriage record. ``None`` is treated as
            an empty transcript; any other value is converted with ``str``.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``ft_model.generate`` (defaults to the previously hard-coded 600).

    Returns:
        str: The decoded first sequence returned by ``generate`` with special
        tokens skipped. NOTE(review): for a causal LM this sequence presumably
        starts with the prompt itself -- downstream parsing should not assume
        it contains only the answer.
    """
    prompt_template = """Extract specific data from the provided OCR text of a marriage record. Identify and return the following details in a structured JSON format: Application number, date of marriage ( month,day, year), place of marriage, birth details (month,day, year, place) of both spouses, their county, gender, given names, residence, state, and surnames. Only show the extracted information. Don't show the OCR Text again. Also don't show the pattern which you are asked to follow. Concentrate on the data asked to extract . The output must strictly adhere to the given details and key names, starting with an opening curly brace '{' and ending with a closing curly brace '}'. Ensure that each key-value pair contains only accurate and relevant information from the OCR text. Avoid including any extraneous or irrelevant data that does not correspond directly to the specified keys. The structure of the output is as follows:

Application: The marriage Application number.
Spouse1_Given: The first name of the first spouse.
Spouse1_Surname: The last name of the first spouse.
Spouse1_Birth_Day: The birth day of the first spouse.
Spouse1_Birth_Month: The birth month of the first spouse.
Spouse1_Birth_Year: The birth year of the first spouse.
Spouse1_Residence: The residence of the first spouse.
Spouse1_County: The county of the first spouse.
Spouse1_State: The state of the first spouse.
Spouse1_Birthplace: The birthplace of the first spouse.
Spouse1_Gender: The gender of the first spouse.
Spouse2_Given: The first name of the second spouse.
Spouse2_Surname: The last name of the second spouse.
Spouse2_Maiden: The maiden name of the second spouse, if applicable.
Spouse2_Birth_Day: The birth day of the second spouse.
Spouse2_Birth_Month: The birth month of the second spouse.
Spouse2_Birth_Year: The birth year of the second spouse.
Spouse2_Residence: The residence of the second spouse.
Spouse2_County: The county of the second spouse.
Spouse2_State: The state of the second spouse.
Spouse2_Birthplace: The birthplace of the second spouse.
Spouse2_Gender: The gender of the second spouse.
Marriage_Day: The day of the marriage.
Marriage_Month: The month of the marriage.
Marriage_Year: The year of the marriage.
Marriage_Place: The location of the marriage.

    ### OCR Text:
    {ocr_text}
    """
    # The template is a plain string, so the placeholder has to be filled in
    # explicitly; previously it was sent to the model verbatim and the OCR
    # text never reached the prompt. str.replace is used instead of an
    # f-string / str.format because the instructions contain literal '{' and
    # '}' characters that those would try to interpret as fields.
    transcript = "" if ocr_text is None else str(ocr_text)
    eval_prompt = prompt_template.replace("{ocr_text}", transcript)

    model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    ft_model.eval()
    with torch.no_grad():
        generated = ft_model.generate(**model_input, max_new_tokens=max_new_tokens)

    # Only the first returned sequence is decoded (one prompt in, one answer out).
    output = eval_tokenizer.decode(generated[0], skip_special_tokens=True)
    return output

In [122]:
def _word_level_details(response):
    """Collect per-word confidences and bounding boxes from a Vision response."""
    confidences = []
    bounding_boxes = []
    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    # A word is delivered as a list of symbols; join them back together.
                    word_text = ''.join(symbol.text for symbol in word.symbols)
                    confidences.append((word_text, word.confidence))
                    vertices = [(v.x, v.y) for v in word.bounding_box.vertices]
                    bounding_boxes.append((word_text, vertices))
    return confidences, bounding_boxes


def process_images_and_infer(directory_path):
    """OCR every image in a directory and run the fine-tuned model on each result.

    Files whose name ends (case-insensitively) in .png, .jpg, .jpeg or .jfif
    are sent to Vision ``document_text_detection``; the recognised text, with
    newlines replaced by spaces, is passed to ``model_output``.

    Args:
        directory_path: Directory containing the input images.

    Returns:
        list[dict]: One dict per image, in sorted filename order, with keys
        ``'Image'`` (file name without extension), ``'OCR'`` (flattened text,
        ``""`` when nothing was detected), ``'Confidence Score'`` (list of
        ``(word, confidence)``), ``'Bounding Boxes'`` (list of
        ``(word, [(x, y), ...])``) and ``'Model_Output'``.

    Raises:
        OSError: If the directory cannot be listed or an image cannot be read.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.jfif')

    # os.listdir returns entries in arbitrary order; sort so repeated runs
    # produce results in the same order.
    image_files = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith(image_extensions)
    )
    if not image_files:
        # Nothing to do: avoid creating a client (and needing credentials).
        return []

    # The client does not depend on the image, so build it once instead of
    # once per file.
    client = vision_v1.ImageAnnotatorClient()

    inferencing_results = []
    for filename in image_files:
        image_path = os.path.join(directory_path, filename)
        image_name, _ = os.path.splitext(filename)

        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        image = vision_v1.Image(content=content)
        response = client.document_text_detection(image=image)

        # Per-image failures are reported inside the response rather than
        # raised; surface them instead of silently treating the image as blank.
        # The image is still processed with empty OCR text, as before.
        error_message = response.error.message
        if error_message:
            print(f"Vision API error for {filename}: {error_message}")

        if response.text_annotations:
            # The first annotation holds the full detected text.
            ocr_data = response.text_annotations[0].description.replace('\n', ' ')
            confidences, bounding_boxes = _word_level_details(response)
        else:
            ocr_data = ""
            confidences, bounding_boxes = [], []

        # Perform inferencing using the finetuned LLM
        inference_output = model_output(ocr_data)

        inferencing_results.append({
            'Image': image_name,
            'OCR': ocr_data,
            'Confidence Score': confidences,
            'Bounding Boxes': bounding_boxes,
            'Model_Output': inference_output
        })

    return inferencing_results

In [123]:
def extract_json_outputs(final_results):
    """Pull JSON objects out of each model output and tag them with their image.

    Every non-overlapping substring of ``item['Model_Output']`` that matches
    the pattern below is parsed with ``json.loads``. Successfully parsed
    objects get an ``'Image'`` key copied from the item and are collected;
    substrings that fail to parse are reported on stdout and skipped.

    Args:
        final_results: Iterable of dicts carrying ``'Model_Output'`` and
            ``'Image'`` entries.

    Returns:
        list[dict]: Parsed objects in the order they were encountered.
    """
    # The pattern is assembled from three fragments:
    #   head - '{' followed by an "Application number" key with a string value
    #   body - any number of further "key": "value" string pairs (lazy)
    #   tail - a final "Spouse2_surname" key with a string value, then '}'
    head = r'\{\s*"?Application\s*number"?\s*:\s*"[^"]*?"'
    body = r'(?:\s*,\s*"(?:\\.|[^"\\])*?"\s*:\s*"(?:\\.|[^"\\])*")*?'
    tail = r'\s*,\s*"?Spouse2_surname"?\s*:\s*"(?:\\.|[^"\\])*?"\s*\}'
    json_regex = re.compile(head + body + tail)

    collected = []
    for record in final_results:
        generated_text = record['Model_Output']
        for hit in json_regex.finditer(generated_text):
            candidate = hit.group(0)
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError as e:
                # The quotes around key names are optional in the pattern, so a
                # match is not guaranteed to be valid JSON.
                print(f"Error decoding JSON: {e}")
                continue
            parsed['Image'] = record['Image']
            collected.append(parsed)

    return collected

In [None]:

# Run the whole pipeline (OCR + inference, then JSON extraction) and time it.
# start_time / end_time remain epoch timestamps for the log lines; the duration
# is measured with time.perf_counter(), which, unlike time.time(), is not
# affected by system clock adjustments.
start_time = time.time()
_timer_start = time.perf_counter()
print(f"Start time: {start_time} seconds\n")

# Placeholder: replace with the directory that holds the input images.
directory_path = 'your input image directory path'
final_results = process_images_and_infer(directory_path)
llm_json_output = extract_json_outputs(final_results)

# Calculate and log the elapsed time
elapsed_time = time.perf_counter() - _timer_start
end_time = time.time()
print(f"End time: {end_time} seconds\n")
print(f"Execution time: {elapsed_time} seconds")