In [None]:
import base64
import datetime
import os
import tempfile

import pdfkit
from lxml import etree

class XMLInvoiceProcessor:
    """Render UBL invoice XML files to PDF via XSLT and wkhtmltopdf.

    Every ``.xml`` file found in ``xml_dir`` is transformed to HTML, first
    with an XSLT stylesheet embedded in the invoice itself and otherwise with
    the stylesheet at ``default_xslt_path``. The HTML is then converted to
    ``<output_dir>/<YYYY-MM-DD>/<invoice number>.pdf`` by pdfkit.
    """

    # Namespace prefixes used by the element lookups in this class.
    _NAMESPACES = {
        'cac': 'urn:oasis:names:specification:ubl:schema:xsd:CommonAggregateComponents-2',
        'cbc': 'urn:oasis:names:specification:ubl:schema:xsd:CommonBasicComponents-2',
    }

    def __init__(self, xml_dir, output_dir, wkhtmltopdf_path, default_xslt_path):
        """Store the locations the processor works with.

        Args:
            xml_dir: Directory that is scanned for ``.xml`` invoice files.
            output_dir: Directory under which a dated sub-directory with the
                generated PDFs is created.
            wkhtmltopdf_path: Path of the wkhtmltopdf executable handed to
                pdfkit.
            default_xslt_path: Stylesheet used when an invoice carries no
                usable embedded XSLT.
        """
        self.xml_dir = xml_dir
        self.output_dir = output_dir
        self.wkhtmltopdf_path = wkhtmltopdf_path
        self.default_xslt_path = default_xslt_path

    def xml_to_pdf_and_tally(self):
        """Convert every ``.xml`` file in ``xml_dir`` to a PDF.

        PDFs are written to a sub-directory of ``output_dir`` named after the
        current date (``YYYY-MM-DD``). Files that cannot be parsed, or that
        have no invoice number, are reported and skipped so that one bad file
        does not abort the whole batch.
        """
        config = pdfkit.configuration(wkhtmltopdf=self.wkhtmltopdf_path)
        dated_output_dir = os.path.join(
            self.output_dir, datetime.datetime.now().strftime("%Y-%m-%d")
        )
        os.makedirs(dated_output_dir, exist_ok=True)

        for file_name in os.listdir(self.xml_dir):
            if not file_name.endswith('.xml'):
                continue

            xml_path = os.path.join(self.xml_dir, file_name)
            try:
                # The parser is told to treat the input as UTF-8.
                xml_parser = etree.XMLParser(encoding='utf-8')
                xml_doc = etree.parse(xml_path, parser=xml_parser)
            except (etree.XMLSyntaxError, OSError) as e:
                print(f"Skipping {file_name}: could not parse XML: {e}")
                continue

            invoice_number = self.extract_invoice_number(xml_doc)
            if invoice_number is None:
                print(f"Skipping {file_name}: no invoice number found")
                continue

            # process_xml_and_create_pdf reports its own errors and falls
            # back to the default stylesheet, so no extra handling is needed.
            self.process_xml_and_create_pdf(
                xml_doc, invoice_number, dated_output_dir, config
            )

    def extract_invoice_number(self, xml_doc):
        """Return the text of the first ``cbc:ID`` element in the document.

        Surrounding whitespace is stripped because the value is used as the
        PDF file name.

        Returns:
            The invoice number, or ``None`` when there is no ``cbc:ID``
            element or its text is empty.
        """
        id_element = xml_doc.find('.//cbc:ID', namespaces=self._NAMESPACES)
        if id_element is None or id_element.text is None:
            return None
        invoice_number = id_element.text.strip()
        return invoice_number or None

    def process_xml_and_create_pdf(self, xml_doc, invoice_number, output_dir, config):
        """Create the PDF using an XSLT embedded in the invoice, if possible.

        Each ``cac:Attachment/cbc:EmbeddedDocumentBinaryObject`` whose
        ``mimeCode`` is ``application/xml`` is base64-decoded and tried as a
        stylesheet; the first one that yields a PDF wins. An attachment that
        is not well-formed XML is skipped. Any other failure, or the absence
        of a usable attachment, is reported and the default stylesheet is
        used instead.

        Args:
            xml_doc: Parsed invoice document.
            invoice_number: Used as the PDF file name (without extension).
            output_dir: Directory the PDF is written to.
            config: pdfkit configuration object.
        """
        try:
            embedded_objects = xml_doc.findall(
                './/cac:Attachment/cbc:EmbeddedDocumentBinaryObject',
                namespaces=self._NAMESPACES,
            )
            for embedded_object in embedded_objects:
                if embedded_object.get('mimeCode') != 'application/xml':
                    continue

                decoded_xslt = base64.b64decode(embedded_object.text)
                try:
                    xslt_doc = etree.fromstring(decoded_xslt)
                    transform = etree.XSLT(xslt_doc)
                    html_content = str(transform(xml_doc))
                    self._write_pdf(html_content, invoice_number, output_dir, config)
                    return  # Stop after successful processing of one XSLT.
                except etree.XMLSyntaxError as e:
                    print(f"XML Syntax Error in embedded XSLT for invoice {invoice_number}: {str(e)}")
                    continue  # Try the next embedded XSLT if there is a syntax error

            raise ValueError("No suitable embedded XSLT found for invoice.")
        except Exception as e:
            print(f"Error processing invoice {invoice_number}: {e}")
            self.use_default_xslt(xml_doc, invoice_number, output_dir, config)

    def use_default_xslt(self, xml_doc, invoice_number, output_dir, config):
        """Create the PDF using the stylesheet at ``default_xslt_path``.

        Failures are reported on stdout and not re-raised, so a batch run
        continues with the next invoice.

        Args:
            xml_doc: Parsed invoice document.
            invoice_number: Used as the PDF file name (without extension).
            output_dir: Directory the PDF is written to.
            config: pdfkit configuration object.
        """
        try:
            with open(self.default_xslt_path, 'rb') as default_xslt_file:
                xslt_doc = etree.parse(default_xslt_file)
            transform = etree.XSLT(xslt_doc)
            html_content = str(transform(xml_doc))
            self._write_pdf(html_content, invoice_number, output_dir, config)
        except Exception as e:
            print(f"Failed to process invoice {invoice_number} with default XSLT due to: {e}")

    def _write_pdf(self, html_content, invoice_number, output_dir, config):
        """Write ``html_content`` to a temporary file and convert it to PDF.

        The temporary HTML file is removed even when the conversion fails.
        """
        temp_html_file = tempfile.NamedTemporaryFile(
            mode='w+', delete=False, suffix='.html', encoding='utf-8'
        )
        temp_html_path = temp_html_file.name
        try:
            # Close the file before the conversion reads it from its path.
            with temp_html_file:
                temp_html_file.write(html_content)
            pdf_path = os.path.join(output_dir, f"{invoice_number}.pdf")
            pdfkit.from_file(temp_html_path, pdf_path, configuration=config)
        finally:
            os.remove(temp_html_path)


In [None]:
# Machine-specific locations for the conversion run.
xml_source_dir = "C:\\Users\\user name\\Desktop\\Corp\\Extracted_XMLs"
pdf_target_dir = "C:\\Users\\user name\\Desktop\\Corp\\Output_PDFs"
wkhtmltopdf_exe = "C:\\Users\\user name\\Desktop\\Corp\\Developer\\wkhtmltopdf\\bin\\wkhtmltopdf.exe"
fallback_xslt = "C:\\Users\\user name\\Desktop\\Corp\\Developer\\FaturaFormat.XSLT"

# Build the processor and convert every invoice found in the source directory.
processor = XMLInvoiceProcessor(
    xml_dir=xml_source_dir,
    output_dir=pdf_target_dir,
    wkhtmltopdf_path=wkhtmltopdf_exe,
    default_xslt_path=fallback_xslt,
)

processor.xml_to_pdf_and_tally()