In [7]:
import os
import io
import textwrap
import re
import requests
from pypdf import PdfReader
from pdfminer.high_level import extract_text

In [99]:
class PO_num_extracter:
    """Pull a five-digit invoice number out of a PDF given by local path or URL.

    The PDF text is extracted twice (with pypdf and with pdfminer) and each
    extraction is searched for a five-digit number; the first hit wins.
    """

    # Seconds to wait on the HTTP server before giving up; without a timeout
    # ``requests.get`` can block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self, pdf_path_or_url: str):
        """Remember where the PDF lives.

        Args:
            pdf_path_or_url: Local file path, or a URL starting with "http".
        """
        self.pdf_path_or_url = pdf_path_or_url

    def log(self, message: str, success_flag=True):
        """Print ``message`` inside a banner: '#' for success, '!' for failure."""
        if success_flag:
            print(f"\n\n###################   {message}   ###################")
        else:
            print(f"!!!!!!!!!!!!!!!!!!   {message}   !!!!!!!!!!!!!!!!!!!!")

    def format_text(self, raw_text: str):
        """Return ``raw_text`` with whitespace collapsed and punctuation blanked.

        Whitespace runs are collapsed to one space first, then every character
        that is neither alphanumeric nor whitespace is replaced by a space.
        The result is split on triple spaces and re-joined with single spaces,
        so a run of exactly two spaces left by one removed character is kept.
        """
        formatted_text = ' '.join(raw_text.split())
        formatted_text = ''.join(
            char if char.isalnum() or char.isspace() else ' '
            for char in formatted_text
        )
        sections = formatted_text.split('   ')
        formatted_text = ' '.join(
            section.strip() for section in sections if section.strip()
        )
        return formatted_text.strip()

    def download_pdf(self):
        """Return the raw bytes of the PDF.

        Values starting with "http" are fetched over the network; anything
        else is opened as a local file in binary mode.

        Raises:
            ValueError: The HTTP response status was not 200.
        """
        if self.pdf_path_or_url.startswith("http"):
            response = requests.get(
                self.pdf_path_or_url, timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code == 200:
                return response.content
            raise ValueError(f"Failed to download PDF from {self.pdf_path_or_url}")

        with open(self.pdf_path_or_url, 'rb') as f:
            return f.read()

    def extract_data(self):
        """Extract the PDF text with both pypdf and pdfminer.

        Side effects:
            Sets ``self.wrapped_text`` (pypdf text wrapped to 120 columns)
            and ``self.text`` (pdfminer text).

        Returns:
            ``[self.wrapped_text, self.text]``.
        """
        pdf_data = self.download_pdf()

        reader = PdfReader(io.BytesIO(pdf_data))
        # ``or ''`` guards against a page that yields no text at all.
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        self.wrapped_text = textwrap.fill(text, width=120)

        # Feed pdfminer the bytes already in memory rather than the original
        # path/URL: pdfminer cannot open a URL, so URL inputs used to fail here.
        self.text = extract_text(io.BytesIO(pdf_data))

        return [self.wrapped_text, self.text]

    def extract_invoice_number(self, text: str):
        """Find candidate five-digit invoice numbers in ``text``.

        Stand-alone five-digit tokens are preferred. If there are none, fall
        back to five digits directly following an "invoice" label (optionally
        followed by no/no./no:/number/num and a colon), which also catches
        digits glued to surrounding letters.

        Returns:
            A non-empty list of digit strings, or ``None`` when nothing matches.
        """
        standalone = re.findall(r'\b\d{5}\b', text)
        if standalone:
            return standalone

        pattern = r'(?:invoice\s*(?:no(?:\.|:)?|number|num)?\s*:?)(\d{5})'
        labelled = re.search(pattern, text, re.IGNORECASE)
        if labelled:
            # group(1) is just the digits; group() would include the label,
            # and callers index the result expecting a list of numbers.
            return [labelled.group(1)]
        return None

    def main(self):
        """Return the first invoice number found in the PDF, or ``None``."""
        for text in self.extract_data():
            candidates = self.extract_invoice_number(text)
            if candidates:
                return candidates[0]
        return None

In [100]:
import time

# Sample invoice used for this run; point this at the PDF on your machine.
PDF_PATH = r"E:\Projects\SA - R&D\invoice-sample-2.pdf"

# perf_counter is the clock intended for measuring short elapsed intervals.
start_time = time.perf_counter()

obj = PO_num_extracter(PDF_PATH)
print("Invoice number :", obj.main())

end_time = time.perf_counter()

# Fixed-point formatting: slicing str(runtime) breaks for tiny values, whose
# repr switches to scientific notation (e.g. 8.2e-05 would print as "8.2e-").
runtime = end_time - start_time
print("Runtime:", f"{runtime:.3f}", "seconds")

Invoice number : 12345
Runtime: 0.082 seconds
