In [9]:
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
import re
import os
import logging
from supabase import create_client, Client

# Initialize Supabase client
url = "https://qumfinzjetrdhzmbuwcz.supabase.co" #"your_supabase_url"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InF1bWZpbnpqZXRyZGh6bWJ1d2N6Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjM3ODU2NTUsImV4cCI6MjAzOTM2MTY1NX0.e8SwMVAOEILIgWk4QaJu0STayCur7uyJEnMXBNUxadg" #"your_supabase_key"

supabase: Client = create_client(url, key)

# Define the Supabase logging handler
class SupabaseHandler(logging.Handler):
    def emit(self, record):
        log_entry = self.format(record)
        # Insert log entry into Supabase
        data = {
            "level": record.levelname,
            "message": log_entry,
            "file_name": record.pathname  # Add more fields if needed
        }
        supabase.table("logs").insert(data).execute()

# Configure logging to use both file and Supabase
supabase_handler = SupabaseHandler()
logging.basicConfig(
    level=logging.ERROR ,  # Change from DEBUG to WARNING
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('process_log.log'),
        supabase_handler
    ]
)


def process_pdf(file):
    try:
        logging.info(f"Processing file: {file.name}")

        # Check if the file is a PDF
        if not file.name.lower().endswith('.pdf'):
            logging.warning("Uploaded file is not a PDF.")
            return "Error: Uploaded file is not a PDF. Please upload a PDF file.", None
        
        # Convert PDF to images
        images = convert_from_path(file.name)
        logging.info(f"PDF converted to {len(images)} image(s).")

        full_text = ""
        for i, img in enumerate(images):
            text = pytesseract.image_to_string(img)
            logging.debug(f"Text extracted from image {i+1}: {text[:100]}...")
            full_text += text + "\n"
        
        # If no text was extracted, likely a wrong or empty PDF
        if not full_text.strip():
            logging.error("No text could be extracted from the PDF.")
            return "Error: Unable to extract text. The PDF might be blank or incorrectly formatted.", None

        # Check for required keywords
        required_keywords = ['invoice', 'order no', 'invoice date', 'supplier name', 'warehouse id']
        if not any(keyword in full_text.lower() for keyword in required_keywords):
            logging.warning("Necessary keywords not found in the PDF.")
            return "Error: The uploaded PDF does not contain necessary fields like 'Invoice', 'Order No', etc. Please upload a correct invoice.", None
        
        # Save extracted data to CSV
        csv_data = save_to_csv(full_text)
        csv_path = "output.csv"  # Specify the output CSV file name
        with open(csv_path, "w") as f:
            f.write(csv_data)  # Write the CSV data to the file

        logging.info("Processing completed successfully.")
        return "Processing Complete", csv_path  # Return the file path

    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
        return f"Error: An unexpected error occurred: {str(e)}", None

def extract_fields(text):
    logging.debug("Extracting fields from text.")
    invoice_no = re.search(r'Invoice No:\s*([A-Z0-9-]+)', text)
    order_no = re.search(r'HM Order No:\s*(\d+)', text)
    invoice_date = re.search(r'Invoice Date:\s*([\d-]+)', text)
    supplier_name = re.search(r'Supplier Name:\s*([A-Z\s]+)', text)
    warehouse_id = re.search(r'\bINW\d{3}\b', text)
    
    description_of_goods = None
    if warehouse_id:
        warehouse_line_start = text.find(warehouse_id.group(0))
        description_lines = []
        lines = text[warehouse_line_start:].split('\n')
        for line in lines:
            if "Container No:" in line:
                break
            description_lines.append(line.strip())
        description = ' '.join(description_lines).replace(' Cartons', '')
        words = description.split()
        filtered_words = [
            word for word in words
            if not re.match(r'^\d+(\.\d+)?$', word) and word.lower() not in ['usd', 'pieces'] and '=' not in word
        ]
        description_of_goods = ' '.join(filtered_words)

    logging.debug(f"Fields extracted: Invoice No: {invoice_no}, Order No: {order_no}, Invoice Date: {invoice_date}, Supplier Name: {supplier_name}, Warehouse ID: {warehouse_id}")
    return {
        'Invoice No': invoice_no.group(1) if invoice_no else None,
        'HM Order No': order_no.group(1) if order_no else None,
        'Invoice Date': invoice_date.group(1) if invoice_date else None,
        'Supplier Name': supplier_name.group(1).strip() if supplier_name else None,
        'Warehouse ID': warehouse_id.group(0) if warehouse_id else None,
        'Description of Goods': description_of_goods
    }

def save_to_csv(full_text):
    logging.info("Saving extracted fields to CSV.")
    invoice_texts = full_text.split('INVOICE')
    data = []

    for i in range(1, len(invoice_texts)):
        invoice_text = "INVOICE" + invoice_texts[i]
        fields = extract_fields(invoice_text)
        if fields['Invoice No']:
            data.append(fields)
    
    df_invoices = pd.DataFrame(data)
    
    # Function to fix hanging words and capitalize text
    def fix_description(description):
        words = description.split()
        fixed_words = []
        for i in range(len(words) - 1):
            if len(words[i + 1]) <= 2:
                fixed_words.append(words[i] + words[i + 1])
            else:
                if len(words[i]) > 2:
                    fixed_words.append(words[i])
        if len(words[-1]) > 1:
            fixed_words.append(words[-1])

        return " ".join(fixed_words).upper()

    df_invoices['Description of Goods'] = df_invoices['Description of Goods'].apply(fix_description)

    csv_data = df_invoices.to_csv(index=False)
    logging.info("CSV file created successfully.")
    return csv_data

iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(file_types=['.pdf']),
    outputs=[gr.Textbox(), gr.File(label="Download CSV")]
)

iface.launch()


2024-08-16 22:18:20,368:INFO - HTTP Request: GET http://127.0.0.1:7867/startup-events "HTTP/1.1 200 OK"


Running on local URL:  http://127.0.0.1:7867


2024-08-16 22:18:20,772:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:20,930:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:21,094:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:21,246:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:21,338:INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
2024-08-16 22:18:21,405:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:21,553:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:21,706:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-1

KeyboardInterrupt: 

2024-08-16 22:18:25,972:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:26,141:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:26,292:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:26,448:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:26,609:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:26,752:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:26,909:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:18:27,063:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1

Failed to log to Supabase: 
{'level': 'INFO', 'message': 'HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"', 'file_name': '/home/abinaya/.local/lib/python3.10/site-packages/httpx/_client.py'}
HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
1723826919.7782776
{'level': 'INFO', 'message': 'HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"', 'file_name': '/home/abinaya/.local/lib/python3.10/site-packages/httpx/_client.py'}
HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
1723826919.617122
{'level': 'INFO', 'message': 'HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"', 'file_name': '/home/abinaya/.local/lib/python3.10/site-packages/httpx/_client.py'}
HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
1723826919.206

In [2]:
pip install supabase


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
import re
import os
import logging
from supabase import create_client, Client

# Initialize Supabase client
url = "https://qumfinzjetrdhzmbuwcz.supabase.co" #"your_supabase_url"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InF1bWZpbnpqZXRyZGh6bWJ1d2N6Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjM3ODU2NTUsImV4cCI6MjAzOTM2MTY1NX0.e8SwMVAOEILIgWk4QaJu0STayCur7uyJEnMXBNUxadg" #"your_supabase_key"
supabase: Client = create_client(url, key)

# Define the Supabase logging handler
class SupabaseHandler(logging.Handler):
    def emit(self, record):
        log_entry = self.format(record)
        try:
            # Insert log entry into Supabase
            data = {
                "level": record.levelname,
                "message": log_entry,
                "file_name": record.pathname  # Add more fields if needed
            }
            result=supabase.table("logs").insert(data).execute()

        # Check if the insertion was successful
            if hasattr(result, 'data') and len(result.data) > 0:
                print(f"Log entry successfully added to Supabase: {result.data[0]['id']}")
            else:
                print(f"Failed to add log entry to Supabase: {result}")
        except Exception as e:
            # Use sys.stderr.write() for error output
            import sys
            sys.stderr.write(f"Failed to log to Supabase: {str(e)}\n")

# Configure logging to use both file and Supabase
supabase_handler = SupabaseHandler()
logging.basicConfig(
    level=logging.DEBUG, 
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('process_log.log'),
        supabase_handler
    ]
)

# # Explicitly add the SupabaseHandler to the root logger
# logger = logging.getLogger()  # Root logger
# logger.addHandler(supabase_handler)

def process_pdf(file):
    try:
        logging.info(f"Processing file: {file.name}")

        # Check if the file is a PDF
        if not file.name.lower().endswith('.pdf'):
            logging.warning("Uploaded file is not a PDF.")
            return "Error: Uploaded file is not a PDF. Please upload a PDF file.", None
        
        # Convert PDF to images
        images = convert_from_path(file.name)
        logging.info(f"PDF converted to {len(images)} image(s).")

        full_text = ""
        for i, img in enumerate(images):
            text = pytesseract.image_to_string(img)
            logging.debug(f"Text extracted from image {i+1}: {text[:100]}...")
            full_text += text + "\n"
        
        # If no text was extracted, likely a wrong or empty PDF
        if not full_text.strip():
            logging.error("No text could be extracted from the PDF.")
            return "Error: Unable to extract text. The PDF might be blank or incorrectly formatted.", None

        # Check for required keywords
        required_keywords = ['invoice', 'order no', 'invoice date', 'supplier name', 'warehouse id']
        if not any(keyword in full_text.lower() for keyword in required_keywords):
            logging.warning("Necessary keywords not found in the PDF.")
            return "Error: The uploaded PDF does not contain necessary fields like 'Invoice', 'Order No', etc. Please upload a correct invoice.", None
        
        # Save extracted data to CSV
        csv_data = save_to_csv(full_text)
        csv_path = "output.csv"  # Specify the output CSV file name
        with open(csv_path, "w") as f:
            f.write(csv_data)  # Write the CSV data to the file

        logging.info("Processing completed successfully.")
        return "Processing Complete", csv_path  # Return the file path

    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
        return f"Error: An unexpected error occurred: {str(e)}", None

def extract_fields(text):
    logging.debug("Extracting fields from text.")
    invoice_no = re.search(r'Invoice No:\s*([A-Z0-9-]+)', text)
    order_no = re.search(r'HM Order No:\s*(\d+)', text)
    invoice_date = re.search(r'Invoice Date:\s*([\d-]+)', text)
    supplier_name = re.search(r'Supplier Name:\s*([A-Z\s]+)', text)
    warehouse_id = re.search(r'\bINW\d{3}\b', text)
    
    description_of_goods = None
    if warehouse_id:
        warehouse_line_start = text.find(warehouse_id.group(0))
        description_lines = []
        lines = text[warehouse_line_start:].split('\n')
        for line in lines:
            if "Container No:" in line:
                break
            description_lines.append(line.strip())
        description = ' '.join(description_lines).replace(' Cartons', '')
        words = description.split()
        # Filter out the warehouse ID explicitly from the description
        filtered_words = [
            word for word in words
            if not re.match(r'^\d+(\.\d+)?$', word) and word.lower() not in ['usd', 'pieces'] and '=' not in word and word != warehouse_id.group(0)
        ]

        description_of_goods = ' '.join(filtered_words)

    logging.debug(f"Fields extracted: Invoice No: {invoice_no}, Order No: {order_no}, Invoice Date: {invoice_date}, Supplier Name: {supplier_name}, Warehouse ID: {warehouse_id}")
    return {
        'Invoice No': invoice_no.group(1) if invoice_no else None,
        'HM Order No': order_no.group(1) if order_no else None,
        'Invoice Date': invoice_date.group(1) if invoice_date else None,
        'Supplier Name': supplier_name.group(1).strip() if supplier_name else None,
        'Warehouse ID': warehouse_id.group(0) if warehouse_id else None,
        'Description of Goods': description_of_goods
    }

def save_to_csv(full_text):
    logging.info("Saving extracted fields to CSV.")
    invoice_texts = full_text.split('INVOICE')
    data = []

    for i in range(1, len(invoice_texts)):
        invoice_text = "INVOICE" + invoice_texts[i]
        fields = extract_fields(invoice_text)
        if fields['Invoice No']:
            data.append(fields)
    
    df_invoices = pd.DataFrame(data)
    
    # Function to fix hanging words and capitalize text
    def fix_description(description):
        words = description.split()
        fixed_words = []
        for i in range(len(words) - 1):
            if len(words[i + 1]) <= 2:
                fixed_words.append(words[i] + words[i + 1])
            else:
                if len(words[i]) > 2:
                    fixed_words.append(words[i])
        if len(words[-1]) > 1:
            fixed_words.append(words[-1])

        return " ".join(fixed_words).upper()

    df_invoices['Description of Goods'] = df_invoices['Description of Goods'].apply(fix_description)

    csv_data = df_invoices.to_csv(index=False)
    logging.info("CSV file created successfully.")
    return csv_data

iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(file_types=['.pdf']),
    outputs=[gr.Textbox(), gr.File(label="Download CSV")]
)

iface.launch()


2024-08-16 22:11:41,144:INFO - HTTP Request: GET http://127.0.0.1:7864/startup-events "HTTP/1.1 200 OK"


Running on local URL:  http://127.0.0.1:7864


2024-08-16 22:11:41,574:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:41,822:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:41,977:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:42,089:INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
2024-08-16 22:11:42,140:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:42,337:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:42,498:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:42,653:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-1

KeyboardInterrupt: 

2024-08-16 22:11:47,061:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:47,218:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:47,371:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:47,528:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:47,679:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:47,857:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:48,025:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
2024-08-16 22:11:48,189:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1

Failed to log to Supabase: 
{'level': 'INFO', 'message': 'HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"', 'file_name': '/home/abinaya/.local/lib/python3.10/site-packages/httpx/_client.py'}
HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
1723826517.5365763
{'level': 'INFO', 'message': 'HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"', 'file_name': '/home/abinaya/.local/lib/python3.10/site-packages/httpx/_client.py'}
HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
1723826517.3858442
{'level': 'INFO', 'message': 'HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"', 'file_name': '/home/abinaya/.local/lib/python3.10/site-packages/httpx/_client.py'}
HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"
1723826517.24

In [3]:
from supabase import create_client, Client

url = "https://qumfinzjetrdhzmbuwcz.supabase.co" #"your_supabase_url"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InF1bWZpbnpqZXRyZGh6bWJ1d2N6Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjM3ODU2NTUsImV4cCI6MjAzOTM2MTY1NX0.e8SwMVAOEILIgWk4QaJu0STayCur7uyJEnMXBNUxadg" #"your_supabase_key"
supabase: Client = create_client(url, key)

data = {"level": "INFO", "message": "Test log entry", "file_name": "test.py"}
result = supabase.table("logs").insert(data).execute()
print(result)


2024-08-16 21:48:17,052:INFO - HTTP Request: POST https://qumfinzjetrdhzmbuwcz.supabase.co/rest/v1/logs "HTTP/1.1 201 Created"


data=[{'id': 1, 'timestamp': '2024-08-16T16:18:17.002576', 'level': 'INFO', 'message': 'Test log entry', 'file_name': 'test.py'}] count=None
