In [27]:
import re
import os
import streamlit as st
import google.generativeai as genai
import pytesseract
from PIL import Image
from pdf2image import convert_from_bytes
import sqlite3

In [3]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [5]:
# Google Generative AI Configuration
config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192
}
KEY = 'YOUR_GOOGLE_GENERATIVE_AI_API_KEY'  # Replace with your actual API key
genai.configure(api_key=KEY)

In [11]:
def get_db_connection():
    conn = sqlite3.connect('invoices.db')
    return conn

In [13]:
conn = get_db_connection()
cursor = conn.cursor()

In [15]:
cursor.execute('''
    CREATE TABLE IF NOT EXISTS invoices (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        company_name TEXT,
        invoice_date TEXT,
        total_amount REAL
    )
''')
conn.commit()

In [17]:
def extract_text_from_images(pdf_bytes, max_pages=10):
    try:
        # Attempt to convert PDF into images
        pages = convert_from_bytes(pdf_bytes)
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return ""

    extracted_text = ""
    for page_num, page in enumerate(pages, start=1):
        if page_num > max_pages:
            break
        try:
            # Extract text using Tesseract
            text = pytesseract.image_to_string(page)
            extracted_text += text + "\n"
        except Exception as e:
            print(f"Error extracting text from page {page_num}: {e}")
            continue

    return extracted_text

In [19]:
def generate_text(instruction, prompt_parts):
    model = get_model(instruction)
    try:
        response = model.generate_content(prompt_parts)
        if response is None or not response.text:
            if response and hasattr(response, 'candidate') and hasattr(response.candidate, 'safety_ratings'):
                safety_ratings = response.candidate.safety_ratings
                return f'Generation blocked due to safety ratings: {safety_ratings}'
            else:
                return 'No valid response generated.'
        return response.text
    except Exception as ex:
        return str(ex)

In [21]:
def get_model(instruction):
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=config,
        system_instruction=instruction,
    )
    return model

In [31]:
def parse_llm_output(llm_output):
    company_name_match = re.search(r"Company name:\s*([^\n]+?)\s*Invoice date:", llm_output, re.IGNORECASE)
    date_match = re.search(r"Invoice date:\s*([^\n]+?)\s*Total amount:", llm_output, re.IGNORECASE)
    total_amount_match = re.search(r"Total amount:\s*([\d,]+\.\d{2})", llm_output, re.IGNORECASE)

    company_name = company_name_match.group(1).strip() if company_name_match else "Unknown"
    invoice_date = date_match.group(1).strip() if date_match else "Unknown"
    total_amount = total_amount_match.group(1).replace(",", "") if total_amount_match else "Unknown"

    if total_amount != "Unknown":
        try:
            total_amount = float(total_amount)
        except ValueError:
            total_amount = "Unknown"

    return {
        "Company Name": company_name,
        "Invoice Date": invoice_date,
        "Total Amount": total_amount
    }

# Process Multiple PDF Files (you can add more PDF file paths to the list)
pdf_file_paths = ["sample invoices/123.pdf", "sample invoices/-4180389598760657265.MB TIMBER 15.pdf"]  # Replace with your actual file paths


In [33]:
for pdf_file_path in pdf_file_paths:
    if not os.path.exists(pdf_file_path):
        print(f"File {pdf_file_path} does not exist.")
        continue

    print(f"Processing {pdf_file_path}...")
    
    # Read the PDF file
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    if len(pdf_bytes) == 0:
        print(f"The file {pdf_file_path} is empty or could not be read.")
        continue

    # Extract text from the PDF
    invoice_text = extract_text_from_images(pdf_bytes)

    if not invoice_text.strip():
        print(f"No text extracted from {pdf_file_path}.")
        continue

    # Define the query for LLM to extract the relevant information
    prompt = """Extract the company name, invoice date, and total amount from the invoice. 
Only return the required information without adding extra words or sentences.
The output should strictly follow this format:
Company name: <company_name> 
Invoice date: <invoice_date> 
Total amount: <total_amount>

Ensure:
- The company name is enclosed within `Company name:`
- The invoice date is enclosed within `Invoice date:`
- The total amount is enclosed within `Total amount:`
- No extra text or comments are included.
- Use the exact field names and order as provided above.
"""

    instruction = """
You are an invoice examiner. Your job is to interpret the text of an invoice and extract the 
information from the document.
"""

    # Prompt parts: text extracted from invoice and the instruction
    prompt_parts = [invoice_text, prompt]

    # Generate structured text using the LLM
    llm_output = generate_text(instruction, prompt_parts)

    # Parse the LLM output to extract relevant information
    invoice_info = parse_llm_output(llm_output)

    # Show extracted information in Jupyter Notebook
    print(f"Company Name: {invoice_info['Company Name']}")
    print(f"Invoice Date: {invoice_info['Invoice Date']}")
    print(f"Total Amount: {invoice_info['Total Amount']}")


Processing sample invoices/123.pdf...
Company Name: Unknown
Invoice Date: Unknown
Total Amount: Unknown
Processing sample invoices/-4180389598760657265.MB TIMBER 15.pdf...
Company Name: Unknown
Invoice Date: Unknown
Total Amount: Unknown


In [35]:
  cursor.execute('''
        INSERT INTO invoices (company_name, invoice_date, total_amount)
        VALUES (?, ?, ?)
    ''', (invoice_info["Company Name"], invoice_info["Invoice Date"], invoice_info["Total Amount"]))
    conn.commit()

    print(f"Invoice from {invoice_info['Company Name']} saved to the database.")

# Optional: Display existing invoices from the database
cursor.execute("SELECT * FROM invoices")
records = cursor.fetchall()
if records:
    print("Saved Invoices:")
    for record in records:
        print(record)
else:
    print("No invoices saved yet.")

# Close the database connection
conn.close()

IndentationError: unexpected indent (3044707910.py, line 5)