In [4]:
import pymupdf  # PyMuPDF
import re
import csv

In [5]:
doc = pymupdf.open('_pdf/pp21-23.pdf')

In [13]:
def extract_articles_with_pages(doc):
    articles = []
    current_article = None

    for page_number in range(len(doc)):
        page = doc[page_number]
        text = page.get_text()
        lines = text.split('\n')

        for i, line in enumerate(lines):
            article_match = re.match(r'Article\s+(\d+)', line)
            if article_match:
                if current_article:
                    articles.append(current_article)
                current_article = {
                    'article_number': f"Article {article_match.group(1)}",
                    'title': lines[i + 1].strip() if i + 1 < len(lines) else '',
                    'body': '',
                    'start_page': page_number + 1
                }
            elif current_article:
                current_article['body'] += line + '\n'

    if current_article:
        articles.append(current_article)

    return articles


In [59]:
def extract_requirements_from_article(article):
    requirements = []
    body_lines = article['body'].splitlines()
    current_parent_id = None
    current_parent_text = ""

    # Pattern to detect numbered and lettered requirement IDs
    numbered_pattern = re.compile(r'^(\d)+\.\s*(.*)')
    lettered_pattern = re.compile(r'^\(([a-z])\)\s*(.*)')

    for line in body_lines:
        line = line.strip()
        if not line:
            continue
        
        numbered_match = numbered_pattern.match(line)
        lettered_match = lettered_pattern.match(line)

        if numbered_match:
            print(line)
            if current_parent_id and current_parent_text:
                references = re.findall(r'\b(Article\s+\d+|Annex\s+[A-Z]+)\b', current_parent_text)
                requirements.append({
                    'Article': article['article_number'],
                    'Title': article['title'],
                    'Requirement_ID': current_parent_id,
                    'Parent': '',
                    'Requirement Text': current_parent_text,
                    'References': '; '.join(references),
                    'Page': article['start_page']
                })
            current_parent_id = numbered_match.group(1) + '.'
            current_parent_text = numbered_match.group(2).strip()
        elif lettered_match and current_parent_id:
            print(line)
            sub_id = f"({lettered_match.group(1)})"
            text = lettered_match.group(2).strip()
            references = re.findall(r'\b(Article\s+\d+|Annex\s+[A-Z]+)\b', text)
            requirements.append({
                'Article': article['article_number'],
                'Title': article['title'],
                'Requirement_ID': sub_id,
                'Parent': current_parent_id,
                'Requirement Text': text,
                'References': '; '.join(references),
                'Page': article['start_page']
            })
        else:
            if current_parent_id:
                current_parent_text += ' ' + line

    if current_parent_id and current_parent_text:
        references = re.findall(r'\b(Article\s+\d+|Annex\s+[A-Z]+)\b', current_parent_text)
        requirements.append({
            'Article': article['article_number'],
            'Title': article['title'],
            'Requirement_ID': current_parent_id,
            'Parent': '',
            'Requirement Text': current_parent_text,
            'References': '; '.join(references),
            'Page': article['start_page']
        })
    print(requirements)
    return requirements


In [60]:
def extract_requirements(pdf_path, output_csv='requirements.csv'):
    doc = pymupdf.open(pdf_path)
    articles = extract_articles_with_pages(doc)
#    print(articles[0])

    all_requirements = []
    for article in articles[:1]: #
        all_requirements.extend(extract_requirements_from_article(article))
    print(len(all_requirements))

    with open(output_csv, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'Article', 'Title', 'Requirement_ID', 'Parent', 'Requirement Text', 'References', 'Page'
        ])
        writer.writeheader()
        for row in all_requirements:
            writer.writerow(row)

    return f"{output_csv}", len(all_requirements)


In [61]:
inp_path = '_pdf/pp21-23.pdf'
out_path = '_csv/pp21-23.csv'

In [62]:
extract_requirements(inp_path, out_path)

1.
2.
3.
4.
5.
(a)  the devices are not transferred to another legal entity,
(b)  manufacture and use of the devices occur under appropriate quality management systems,
(c)  the health institution justifies in its documentation that the target patient group's specific needs cannot be met, or
(d)  the health institution provides information upon request on the use of such devices to its competent authority,
(e)  the health institution draws up a declaration which it shall make publicly available, including:
(i)  the name and address of the manufacturing health institution;
(f)  the health institution draws up documentation that makes it possible to have an understanding of the manufacturing
(g)  the health institution takes all necessary measures to ensure that all devices are manufactured in accordance with the
(h)  the health institution reviews experience gained from clinical use of the devices and takes all necessary corrective
6.
5.5.2017
[{'Article': 'Article 5', 'Title': 'Placing

('_csv/pp21-23.csv', 16)