# Financial Document Generator

This notebook generates synthetic financial documents (invoices, statements, and receipts), using the Faker library to produce realistic data. Each document includes a company logo and a signature drawn from an existing image dataset, plus a QR code for document tracking. Every document is saved in both JSON and PDF formats.

In [5]:
# Mount Google Drive into the Colab runtime at /content/drive. The dataset
# directories configured in FinancialGenerator.setup_directories live under
# /content/drive/MyDrive, so this cell has to run before the generator cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
pip install reportlab

Collecting reportlab
  Downloading reportlab-4.3.0-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.3.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m1.3/1.9 MB[0m [31m37.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.3.0


In [9]:
!pip install reportlab faker Pillow



In [15]:
!pip install qrcode

Collecting qrcode
  Downloading qrcode-8.0-py3-none-any.whl.metadata (17 kB)
Downloading qrcode-8.0-py3-none-any.whl (45 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: qrcode
Successfully installed qrcode-8.0


In [22]:
import qrcode
from PIL import Image
import io
from faker import Faker
import random
import os
import json
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image as RLImage
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch

def get_random_file(directory):
    """Pick a random image file name from ``directory``.

    Only regular files whose extension (case-insensitive) is ``.png``,
    ``.jpg`` or ``.jpeg`` are eligible. Sub-directories that merely look like
    images (e.g. a folder called ``x.png``) are skipped.

    Args:
        directory: Path of the directory to scan.

    Returns:
        The bare file name (not the full path) of one randomly chosen image,
        or ``None`` if the directory cannot be listed or holds no images.
    """
    valid_formats = ('.png', '.jpg', '.jpeg')
    try:
        # Sort the candidates: os.listdir() returns entries in arbitrary
        # order, which would make the pick irreproducible even when the
        # caller seeds ``random``.
        files = sorted(
            name for name in os.listdir(directory)
            if os.path.splitext(name)[1].lower() in valid_formats
            and os.path.isfile(os.path.join(directory, name))
        )
    except (OSError, TypeError, ValueError):
        # Missing/unreadable directory or an unusable path value: treat it as
        # "no image available" so document generation can carry on.
        return None
    return random.choice(files) if files else None

class FinancialGenerator:
    """Generate synthetic financial documents as JSON records and PDF files.

    Each document is a dict of Faker-generated company, customer and
    transaction data. The PDF rendering adds an optional logo and signature
    (picked from the configured image directories) and a QR code carrying the
    document id and total amount.
    """

    def __init__(self):
        """Create the Faker instance, prepare directories and load PDF styles."""
        self.fake = Faker(['en_GB', 'fr_FR', 'de_DE'])
        self.setup_directories()
        self.styles = getSampleStyleSheet()

    def setup_directories(self):
        """Define the working directories and create the output ones.

        The logo and signature directories are inputs and are not created
        here; only the JSON, PDF and temp directories are.
        """
        self.dirs = {
            'json': "/content/drive/MyDrive/Omdena PII/docs_dataset/json_files",
            'logos': "/content/drive/MyDrive/Omdena PII/docs_dataset/logos",
            'signatures': "/content/drive/MyDrive/Omdena PII/docs_dataset/signatures",
            'pdfs': "/content/drive/MyDrive/Omdena PII/docs_dataset/generated_pdfs",
            'temp': "temp"
        }
        # Create the output directories if they don't exist
        for key in ('json', 'pdfs', 'temp'):
            os.makedirs(self.dirs[key], exist_ok=True)

    def generate_qr(self, data):
        """Render ``data`` as a QR code.

        Args:
            data: Text to encode.

        Returns:
            An ``io.BytesIO`` positioned at offset 0 that holds the PNG image,
            or ``None`` if generation failed (the error is printed).
        """
        try:
            qr = qrcode.QRCode(
                version=1,
                error_correction=qrcode.constants.ERROR_CORRECT_L,
                box_size=10,
                border=4,
            )
            qr.add_data(data)
            qr.make(fit=True)
            qr_image = qr.make_image(fill_color="black", back_color="white")

            img_byte_arr = io.BytesIO()
            qr_image.save(img_byte_arr, format='PNG')
            img_byte_arr.seek(0)

            return img_byte_arr
        except Exception as e:
            print(f"Error generating QR code: {e}")
            return None

    @staticmethod
    def _flatten_to_rgb(img):
        """Return ``img`` as an 'RGB' or 'L' image, compositing alpha onto white."""
        if img.mode in ('RGB', 'L'):
            return img
        has_alpha = img.mode in ('RGBA', 'LA') or (
            img.mode == 'P' and 'transparency' in img.info
        )
        if not has_alpha:
            return img.convert('RGB')
        # A plain convert('RGB') simply drops the alpha channel; paste onto a
        # white canvas using alpha as the mask so transparent areas come out
        # white.
        rgba = img.convert('RGBA')
        background = Image.new('RGB', rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[3])
        return background

    def save_temp_image(self, img, prefix):
        """Save ``img`` as a PNG in the temp directory.

        Images that are not already 'RGB' or 'L' are converted to 'RGB' first,
        with any transparency flattened onto a white background.

        Args:
            img: PIL image to save.
            prefix: File-name prefix; a random 4-digit suffix is appended.

        Returns:
            Path of the written file, or ``None`` on failure (error printed).
        """
        try:
            img = self._flatten_to_rgb(img)
            temp_path = os.path.join(self.dirs['temp'], f'{prefix}_{random.randint(1000, 9999)}.png')
            img.save(temp_path)
            return temp_path
        except Exception as e:
            print(f"Error saving temporary image: {e}")
            return None

    def generate_document(self, index):
        """Build the data record for one synthetic document.

        Args:
            index: Position of the document in the batch. Currently unused;
                kept so existing callers keep working.

        Returns:
            A JSON-serialisable dict with document metadata, company and
            customer details, 3-7 transactions and their ``total_amount``.
        """
        transactions = [
            {
                "date": self.fake.date_this_year().strftime("%Y-%m-%d"),
                "description": self.fake.bs(),
                "amount": round(random.uniform(100, 5000), 2)
            } for _ in range(random.randint(3, 7))
        ]

        doc = {
            "document_id": str(self.fake.uuid4()),
            "generated_date": self.fake.date_this_year().strftime("%Y-%m-%d"),
            "type": random.choice(["Invoice", "Statement", "Receipt"]),
            "layout": "modern",
            "logo_file": get_random_file(self.dirs['logos']),
            "signature_file": get_random_file(self.dirs['signatures']),
            "company_info": {
                "name": self.fake.company(),
                "address": self.fake.address(),
                # fix_len=True: without it the number may have fewer than 10
                # digits, giving VAT ids of varying length.
                "vat_id": f"EU{self.fake.random_number(digits=10, fix_len=True)}"
            },
            "customer_info": {
                "full_name": self.fake.name(),
                "address": self.fake.address(),
                "email": self.fake.email(),
                "account_number": self.fake.bban()
            },
            "transactions": transactions
        }

        # Round the sum: adding 2-decimal floats can yield values such as
        # 0.30000000000000004, which would be written verbatim to the JSON.
        doc["total_amount"] = round(sum(t["amount"] for t in transactions), 2)
        return doc

    @staticmethod
    def _markup_safe(text):
        """Escape ``text`` for use in a Paragraph, keeping its line breaks."""
        from xml.sax.saxutils import escape

        # Paragraph text is parsed as XML-like markup, so &, < and > must be
        # escaped; newlines are turned into explicit <br/> tags.
        return escape(str(text)).replace('\n', '<br/>')

    def _image_flowable(self, dir_key, file_name, prefix, width, height, temp_files):
        """Return an RLImage for ``self.dirs[dir_key]/file_name``, or ``None``.

        The source image is re-saved through ``save_temp_image``; the temp
        path is appended to ``temp_files`` so the caller can delete it once
        the PDF has been built.
        """
        if not file_name:
            return None
        try:
            source_path = os.path.join(self.dirs[dir_key], file_name)
            if not os.path.exists(source_path):
                return None
            # Context manager so the source file handle is closed promptly.
            with Image.open(source_path) as img:
                temp_path = self.save_temp_image(img, prefix)
            if not temp_path:
                return None
            # Register for cleanup before building the flowable, so the file
            # is removed even if RLImage() raises.
            temp_files.append(temp_path)
            return RLImage(temp_path, width=width, height=height)
        except Exception as e:
            print(f"Error processing {prefix}: {e}")
            return None

    def _build_header(self, doc_data, temp_files):
        """Build the header table: logo (left), spacer (middle), QR code (right)."""
        logo = self._image_flowable(
            'logos', doc_data.get('logo_file'), 'logo', 2*inch, 1*inch, temp_files
        )

        qr_data = f"Document ID: {doc_data['document_id']}\nTotal: €{doc_data['total_amount']:.2f}"
        qr_stream = self.generate_qr(qr_data)
        qr_cell = RLImage(qr_stream, width=1*inch, height=1*inch) if qr_stream else ''

        # Always emit exactly three cells so the row matches the three column
        # widths below, even when the logo is missing or failed to load.
        header_data = [[logo if logo is not None else '', '', qr_cell]]

        header_table = Table(header_data, colWidths=[2.5*inch, 2*inch, 2.5*inch])
        header_table.setStyle(TableStyle([
            ('ALIGN', (0, 0), (0, 0), 'LEFT'),
            ('ALIGN', (-1, 0), (-1, 0), 'RIGHT'),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ]))
        return header_table

    def _build_customer_table(self, doc_data):
        """Build the single-column customer information table."""
        customer = doc_data['customer_info']
        customer_data = [
            ['Customer Information'],
            [customer['full_name']],
            [customer['address']],
            [f"Account: {customer['account_number']}"]
        ]

        customer_table = Table(customer_data, colWidths=[400])
        customer_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        return customer_table

    def _build_transactions_table(self, doc_data):
        """Build the transactions table with a header row and a total row."""
        trans_data = [['Date', 'Description', 'Amount']]
        for t in doc_data['transactions']:
            trans_data.append([
                t['date'],
                t['description'],
                f"€{t['amount']:.2f}"
            ])
        trans_data.append(['', 'Total:', f"€{doc_data['total_amount']:.2f}"])

        trans_table = Table(trans_data, colWidths=[100, 300, 100])
        trans_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('ALIGN', (-1, 1), (-1, -1), 'RIGHT'),
        ]))
        return trans_table

    def create_pdf(self, doc_data, output_path):
        """Render one document record to a PDF file.

        Args:
            doc_data: Record as produced by ``generate_document``.
            output_path: Destination path of the PDF.

        Returns:
            ``True`` if the PDF was built, ``False`` otherwise (the error is
            printed). Temporary image files are removed in both cases.
        """
        temp_files = []
        try:
            doc = SimpleDocTemplate(output_path, pagesize=A4,
                                  rightMargin=72, leftMargin=72,
                                  topMargin=72, bottomMargin=72)

            story = [self._build_header(doc_data, temp_files), Spacer(1, 20)]

            # Document content
            company = doc_data['company_info']
            story.append(Paragraph(self._markup_safe(company['name']), self.styles['Heading1']))
            story.append(Paragraph(self._markup_safe(company['address']), self.styles['Normal']))
            story.append(Spacer(1, 12))

            # Customer info
            story.append(self._build_customer_table(doc_data))
            story.append(Spacer(1, 20))

            # Transactions
            story.append(self._build_transactions_table(doc_data))

            # Add signature (with its leading gap) only when an image is available
            signature = self._image_flowable(
                'signatures', doc_data.get('signature_file'), 'signature',
                2*inch, 0.75*inch, temp_files
            )
            if signature is not None:
                story.append(Spacer(1, 30))
                story.append(signature)

            # Build PDF
            doc.build(story)
            return True
        except Exception as e:
            print(f"Error creating PDF: {e}")
            return False
        finally:
            # Cleanup temp files. This runs after the build attempt (the PDF
            # images point at these paths) and also when the build failed, so
            # the temp directory does not fill up with leftovers.
            for temp_file in temp_files:
                try:
                    os.remove(temp_file)
                except OSError:
                    pass

    def generate_all(self, num_documents=100):
        """Generate ``num_documents`` documents, writing a JSON and a PDF for each.

        Files are named ``document_<n>.json`` / ``document_<n>.pdf`` with
        ``n`` starting at 1. A failure on one document is printed and does not
        stop the batch.

        Args:
            num_documents: Number of documents to generate.
        """
        for i in range(num_documents):
            try:
                doc = self.generate_document(i)

                json_path = os.path.join(self.dirs['json'], f'document_{i+1}.json')
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(doc, f, indent=2)

                pdf_path = os.path.join(self.dirs['pdfs'], f'document_{i+1}.pdf')
                if self.create_pdf(doc, pdf_path):
                    print(f"Successfully generated document {i+1}/{num_documents}")
                else:
                    print(f"Failed to generate PDF for document {i+1}/{num_documents}")
            except Exception as e:
                print(f"Error generating document {i+1}: {e}")

if __name__ == "__main__":
    # Entry point: constructing the generator prepares the output
    # directories; the call below then writes the batch of 100 documents
    # (one JSON and one PDF each).
    generator = FinancialGenerator()
    generator.generate_all(num_documents=100)

Successfully generated document 1/100
Successfully generated document 2/100
Successfully generated document 3/100
Successfully generated document 4/100
Successfully generated document 5/100
Successfully generated document 6/100
Successfully generated document 7/100
Successfully generated document 8/100
Successfully generated document 9/100
Successfully generated document 10/100
Successfully generated document 11/100
Successfully generated document 12/100
Successfully generated document 13/100
Successfully generated document 14/100
Successfully generated document 15/100
Successfully generated document 16/100
Successfully generated document 17/100
Successfully generated document 18/100
Successfully generated document 19/100
Successfully generated document 20/100
Successfully generated document 21/100
Successfully generated document 22/100
Successfully generated document 23/100
Successfully generated document 24/100
Successfully generated document 25/100
Successfully generated document 26