In [1]:
from config.settings import SRC_DIR, DATA_DIR
import os

# Change working directory
os.chdir(SRC_DIR)

# Folder that holds the invoices to process
file_dir = DATA_DIR / 'invoices' / 'valid_invoices'

# Keep PDF files only: every listed entry is later submitted to the model as
# 'application/pdf', so stray files (e.g. thumbnails, temp files) must be skipped.
# os.listdir returns entries in arbitrary order, so sort case-insensitively to make
# the processing order reproducible across runs and platforms.
files = sorted(
    (name for name in os.listdir(file_dir) if name.lower().endswith('.pdf')),
    key=str.lower,
)
files

['invoice.pdf',
 'ISS.pdf',
 'NEWFO-INV-7478.pdf',
 'PerfectMatch.pdf',
 'PO139090 GP.pdf',
 'SKonica Sit23112709230.pdf',
 'skonica3-4.pdf']

In [2]:
import base64

class DataIngestion:
    '''Turns files on disk into the text payload handed to the model.'''

    def __init__(self):
        # Stateless: nothing to initialise.
        pass

    @staticmethod
    def transform(file_path):
        '''Loads the file at file_path as bytes and returns them as a base64 UTF-8 string.'''
        with open(file_path, "rb") as handle:
            encoded = base64.b64encode(handle.read())
        return encoded.decode("utf-8")

In [3]:
import re
import json

class Invoice:
    '''Thin wrapper that gives subscript-style read access to extracted invoice fields.'''

    def __init__(self, data):
        # Keep a reference to the mapping as-is; it is not copied.
        self.data = data

    def __getitem__(self, key):
        '''Looks up key in the wrapped data; a missing key yields None instead of raising.'''
        return self.data.get(key)
    
class InvoiceExtractor:
    '''Pulls the JSON object(s) out of a free-text model reply and merges them.'''

    def __init__(self):
        pass

    @staticmethod
    def _iter_json_objects(text):
        '''Yields every top-level JSON object (as a dict) embedded in text.

        Scanning is delegated to the JSON decoder itself instead of counting braces,
        so braces that appear inside string values do not split an object, and a
        stray '}' or a brace-delimited fragment that is not JSON is skipped rather
        than derailing the scan.
        '''
        decoder = json.JSONDecoder()
        pos = text.find('{')
        while pos != -1:
            try:
                # raw_decode parses one JSON value starting at pos and reports where it ended.
                obj, end = decoder.raw_decode(text, pos)
            except json.JSONDecodeError:
                # Not a JSON object here; retry from the next opening brace.
                pos = text.find('{', pos + 1)
                continue
            yield obj
            # Resume after the parsed object so nested objects are not yielded twice.
            pos = text.find('{', end)

    def extract(self, response_text):
        '''Extracts and merges JSON data from the model response.

        Args:
            response_text: Raw text returned by the model; may wrap the JSON in
                prose or code fences. None or an empty string is treated as
                containing no JSON.

        Returns:
            An Invoice wrapping a single dict built by merging every JSON object
            found, in order of appearance (later objects override earlier keys).

        Raises:
            ValueError: If no JSON object can be parsed from the response.
        '''
        text = response_text or ""
        combined_data = {}
        found_any = False
        for data in self._iter_json_objects(text):
            combined_data.update(data)
            found_any = True
        if not found_any:
            raise ValueError("No valid JSON found in response")
        return Invoice(combined_data)

In [4]:
import google.generativeai as genai
import os
from IPython.display import display, HTML
from json2html import json2html
from config.invoice_config import get_prompt
from dotenv import load_dotenv
# Pull in environment variables via python-dotenv before reading the key.
load_dotenv()
# API key passed to genai.configure; None when GCP_KEY is not set in the environment.
GCP_KEY = os.environ.get("GCP_KEY")

class OCR_Model:
    '''Wraps a google.generativeai model used to read invoice PDFs.'''

    def __init__(self, model="gemini-2.0-flash-exp"):
        '''Configures the genai client with GCP_KEY and creates the model handle.'''
        genai.configure(api_key=GCP_KEY)
        self.model = genai.GenerativeModel(model)

    def _predict(self, data):
        '''Sends the PDF payload together with the prompt and returns the reply text.'''
        pdf_part = {"mime_type": "application/pdf", "data": data}
        reply = self.model.generate_content([pdf_part, get_prompt()])
        return reply.text

    def extract(self, data):
        '''Runs the model on data and parses its reply into an Invoice.'''
        raw_text = self._predict(data)
        return InvoiceExtractor().extract(raw_text)

    def display(self, invoice, html=False):
        '''Shows the invoice data, as an HTML table when html is true, otherwise as-is.'''
        if not html:
            display(invoice.data)
            return
        rendered = json2html.convert(json=invoice.data)
        display(HTML(rendered))

In [5]:
from config.settings import ROOT_DIR
import os
# Extracted invoices are written under <ROOT_DIR>/invoice_outputs/<model name>/
output_folder = ROOT_DIR / 'invoice_outputs' / 'gemini-2.0-flash-exp'
# parents=True also creates 'invoice_outputs' when it does not exist yet; without it
# mkdir raises FileNotFoundError the first time the notebook runs on a clean checkout.
output_folder.mkdir(parents=True, exist_ok=True)
import json

def save_file(invoice, file_name):
    '''Writes invoice as indented JSON to <output_folder>/<file_name>.json.

    Missing parent directories of the target file are created first, and the
    final location is printed once the file has been written.
    '''
    target = output_folder / f"{file_name}.json"

    # file_name may itself contain sub-folders, so make sure they exist.
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, mode="w", encoding="utf-8") as handle:
        json.dump(invoice, handle, indent=4)

    print(f"JSON saved at: {target}")

In [6]:
from tqdm.auto import tqdm
# Build the helpers once instead of on every iteration: DataIngestion keeps no state
# and OCR_Model only stores the configured model handle, so both can be reused.
data_ingestion = DataIngestion()
ocr_model = OCR_Model()

for file in tqdm(files):
    file_path = file_dir / file

    # PDF -> base64 payload -> extracted invoice
    data = data_ingestion.transform(file_path)
    invoice = ocr_model.extract(data)

    # Name the output after the source file, minus its extension. splitext on the
    # bare file name works on every OS, unlike splitting the full path on '\\',
    # and does not assume the extension is exactly four characters long.
    file_name = os.path.splitext(file)[0]

    save_file(invoice.data, file_name)

  0%|          | 0/7 [00:00<?, ?it/s]

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\ai.users\\imscanCS\\invoice_outputs\\gemini-2.0-flash-exp\\invoice.json'