In [1]:
!pip install PyMuPDF pylatex
# For LaTeX engine


Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pylatex
  Downloading PyLaTeX-1.4.2.tar.gz (59 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting ordered-set (from pylatex)
  Downloading ordered_set-4.1.0-py3-none-any.whl.metadata (5.3 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m412.1 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:03[0m:05[0m
[?25hDownloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)
Building wheels for collected packages: pylatex
  Building wheel for pylatex (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pylatex: filename=pylatex-1.4.2-py3-none-any.whl size=43653 sha256=a301e1d1bd5

# Run the following in a terminal (installs the LaTeX engine)

In [None]:
# sudo apt-get install texlive-full

# Generation: build the enhanced standard PDF

In [15]:
import re
import fitz  # PyMuPDF
from pylatex import Document, Section, Subsection, Command
from pylatex.utils import NoEscape
from typing import List, Dict

# STEP 1: Parse clause numbers using regex
def parse_clauses_by_number(text: str) -> Dict[str, str]:
    """Split plain text into clauses keyed by their clause number.

    A clause starts on a line that opens with a multi-part number whose parts
    are separated by ``/`` or ``.`` (for example ``2/3/1``, ``1.1.1`` or
    ``2.4/3``) followed by whitespace. Its body runs up to the next line that
    begins with a digit, or to the end of the text.

    Args:
        text: The text to split, with lines separated by ``\\n``.

    Returns:
        A dict mapping each clause number to its body, with surrounding
        whitespace stripped and embedded newlines replaced by single spaces.
        If the same number occurs more than once, the last occurrence wins.
        An empty dict is returned when no clause is found.
    """
    pattern = re.compile(
        # The number must open a line. ``\A`` covers a clause that is the very
        # first thing in the text, which a bare ``(?<=\n)`` would skip.
        r"(?:\A|(?<=\n))"
        # Clause number: at least two numeric parts joined by "/" or ".".
        r"(\d+(?:[/\.]\d+)+)"
        # Whitespace, then the body (lazy) up to the next digit-led line or the end.
        r"\s+(.*?)(?=\n\d|$)",
        re.DOTALL,
    )
    clauses = {}
    for match in pattern.finditer(text):
        clause_id = match.group(1).strip()
        # Flatten the body onto one line so wrapped PDF lines read as a paragraph.
        clause_text = match.group(2).strip().replace('\n', ' ')
        clauses[clause_id] = clause_text
    return clauses
import os

def _clause_sort_key(clause_id: str) -> List[int]:
    """Return the numeric parts of a clause id so that 2/10 sorts after 2/2."""
    return [int(part) for part in re.findall(r"\d+", clause_id)]


def generate_updated_standard_pdf(original_pdf_path: str, enhancements: List[Dict], output_path: str):
    """Rebuild a standard as a PDF with selected clauses replaced.

    The text of ``original_pdf_path`` is extracted, split into numbered
    clauses with ``parse_clauses_by_number``, the clauses named in
    ``enhancements`` are overwritten, and the result is typeset through
    PyLaTeX with one subsection per clause.

    Args:
        original_pdf_path: Path of the PDF to read.
        enhancements: Dicts with the keys ``'clause_id'`` (the clause number
            to replace) and ``'proposed_text'`` (the replacement). Entries
            whose clause number is not present in the parsed text are skipped
            with a warning. Text is LaTeX-escaped on output; wrap a value in
            ``NoEscape`` to pass raw LaTeX through deliberately.
        output_path: Destination path. The ``.pdf`` suffix is appended by
            PyLaTeX, so it may be omitted; a trailing ``.pdf`` is tolerated.
            Missing parent directories are created.

    Note:
        ``clean_tex=False`` keeps the generated ``.tex`` file next to the PDF.
        Compiling requires a LaTeX toolchain to be installed on the system.
    """
    # PyLaTeX adds ".pdf" itself; strip one the caller may have supplied so the
    # result is not written to "<name>.pdf.pdf".
    if output_path.lower().endswith(".pdf"):
        output_path = output_path[:-len(".pdf")]

    # Ensure output directory exists
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Start LaTeX document
    doc = Document("AAOIFI_Enhanced_Standard")

    # Step 1: Extract text from PDF
    with fitz.open(original_pdf_path) as pdf:
        full_text = "".join(page.get_text() for page in pdf)

    # Step 2: Parse by clauses
    clauses = parse_clauses_by_number(full_text)

    # Step 3: Apply enhancements
    for enhancement in enhancements:
        clause_id = enhancement['clause_id']
        if clause_id in clauses:
            print(f"Enhancing clause: {clause_id}")
            clauses[clause_id] = enhancement['proposed_text']
        else:
            # Surface ids that did not match instead of dropping them silently.
            print(f"Warning: clause {clause_id} not found in source text; enhancement skipped")

    # Step 4: Write structured content
    with doc.create(Section("AAOIFI Standard (Enhanced)")):
        # Numeric ordering: a plain string sort would place "10/1" before "2/1".
        for clause_id in sorted(clauses, key=_clause_sort_key):
            with doc.create(Subsection(f"Clause {clause_id}")):
                # Append as a plain string so PyLaTeX escapes characters such
                # as %, &, _, # and $ that occur in extracted PDF text and
                # would otherwise break or truncate the LaTeX output.
                doc.append(clauses[clause_id])

    # Step 5: Generate PDF
    doc.generate_pdf(output_path, clean_tex=False)
    print(f"\n✅ Enhanced standard PDF saved to: {output_path}.pdf")


In [17]:
# Replacement wording for individual clauses: each entry names the clause
# number to overwrite and the text to put in its place.
enhancements = [
    dict(
        clause_id="2/4/1",
        proposed_text="Profits or losses in respect of Musharaka transactions should be recognized immediately upon agreement, not just at liquidation. This enhances transparency and compliance with Shariah accountability.",
    ),
    dict(
        clause_id="2/5/2",
        proposed_text="The disclosure requirements should explicitly include Shariah review board comments and compliance notes in the financial statements.",
    ),
]

# Rebuild the standard with the replacements applied. The output path is given
# without the ".pdf" suffix, which is appended when the file is generated.
generate_updated_standard_pdf(
    output_path='output/FAS4_Enhanced2',
    original_pdf_path='data/standards/FI5F55_1_Musharaka Financing(4).PDF',
    enhancements=enhancements,
)


Enhancing clause: 2/4/1
Enhancing clause: 2/5/2

✅ Enhanced standard PDF saved to: output/FAS4_Enhanced2.pdf
