# Data Preparation for obligation annotations

## Find and copy relevant subset of documents

In [1]:
# Target EUR-Lex documents for obligation extraction.
# Every entry refers to a consolidated version, identified by its CELEX number.

# Each document URL is this prefix followed by the document's CELEX number.
_EUR_LEX_URL_PREFIX = "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:"

# category -> rows of (title, celex, type); insertion order defines the list order
_documents_by_category = {
    # GDPR & Data Protection
    "data_protection": [
        ("General Data Protection Regulation", "02016R0679-20160504", "regulation"),
        ("ePrivacy Directive", "02002L0058-20091219", "directive"),
        (
            "EU Institutions Data Protection Regulation",
            "02018R1725-20181212",
            "regulation",
        ),
        (
            "Police Directive (Data Protection in Criminal Law)",
            "02016L0680-20160504",
            "directive",
        ),
    ],
    # Environmental Law
    "environmental": [
        ("EU Deforestation Regulation", "02023R1115-20241226", "regulation"),
        ("Industrial Emissions Directive", "02010L0075-20240804", "directive"),
        ("REACH Regulation (Chemicals)", "02006R1907-20250623", "regulation"),
        ("Waste Framework Directive", "02008L0098-20180705", "directive"),
        (
            "EU Taxonomy Regulation (Sustainable Finance)",
            "02020R0852-20230101",
            "regulation",
        ),
    ],
    # Financial Services
    "financial_services": [
        (
            "MiFID II (Markets in Financial Instruments)",
            "02014L0065-20250117",
            "directive",
        ),
        ("Capital Requirements Regulation (CRR)", "02013R0575-20250629", "regulation"),
        ("Capital Requirements Directive (CRD IV)", "02013L0036-20250117", "directive"),
        ("Payment Services Directive 2 (PSD2)", "02015L2366-20250117", "directive"),
        ("Benchmark Regulation", "02016R1011-20220101", "regulation"),
    ],
}

# Flatten into one metadata dict per document (keys: title, celex, category, type, url)
eur_lex_documents = [
    {
        "title": title,
        "celex": celex,
        "category": category,
        "type": doc_type,
        "url": _EUR_LEX_URL_PREFIX + celex,
    }
    for category, rows in _documents_by_category.items()
    for title, celex, doc_type in rows
]

# Report the overall size of the selection and the size of each category
_categories = [entry["category"] for entry in eur_lex_documents]

print(f"Total documents: {len(eur_lex_documents)}")
for _label, _key in (
    ("Data Protection", "data_protection"),
    ("Environmental", "environmental"),
    ("Financial Services", "financial_services"),
):
    print(f"{_label}: {_categories.count(_key)}")

Total documents: 14
Data Protection: 4
Environmental: 5
Financial Services: 5


In [2]:
import shutil
from pathlib import Path

# Locations, relative to the notebook's working directory:
#   corpus_folder - JSON documents to copy from
#   target_folder - working subset used by the rest of this notebook
corpus_folder = Path("../../corpus-crawler/documents/eng")
target_folder = Path("../data/eng/subset")

# Make sure the destination (including any missing parents) exists
target_folder.mkdir(parents=True, exist_ok=True)

# Echo both locations first, then whether each one is present on disk
_folders = (("Source", corpus_folder), ("Target", target_folder))
for _label, _folder in _folders:
    print(f"{_label} folder: {_folder}")
for _label, _folder in _folders:
    print(f"{_label} folder exists: {_folder.exists()}")

Source folder: ../../corpus-crawler/documents/eng
Target folder: ../data/eng/subset
Source folder exists: True
Target folder exists: True


In [3]:
# Mirror each target document from the corpus folder into the subset folder,
# recording which CELEX numbers were copied and which have no file on disk.
copied_files = []
missing_files = []

for entry in eur_lex_documents:
    celex = entry["celex"]
    filename = f"{celex}_eng.json"
    source_path = corpus_folder / filename

    # No file for this document in the corpus folder: record it and move on
    if not source_path.exists():
        missing_files.append(celex)
        print(f"✗ Missing: {filename}")
        continue

    # Same file name on both sides; copy2 also tries to preserve file metadata
    shutil.copy2(source_path, target_folder / filename)
    copied_files.append(celex)
    print(f"✓ Copied: {filename}")

print("\nSummary:")
print(f"Copied files: {len(copied_files)}")
print(f"Missing files: {len(missing_files)}")

✓ Copied: 02016R0679-20160504_eng.json
✓ Copied: 02002L0058-20091219_eng.json
✗ Missing: 02018R1725-20181212_eng.json
✓ Copied: 02016L0680-20160504_eng.json
✓ Copied: 02023R1115-20241226_eng.json
✓ Copied: 02010L0075-20240804_eng.json
✓ Copied: 02006R1907-20250623_eng.json
✓ Copied: 02008L0098-20180705_eng.json
✗ Missing: 02020R0852-20230101_eng.json
✓ Copied: 02014L0065-20250117_eng.json
✓ Copied: 02013R0575-20250629_eng.json
✓ Copied: 02013L0036-20250117_eng.json
✓ Copied: 02015L2366-20250117_eng.json
✓ Copied: 02016R1011-20220101_eng.json

Summary:
Copied files: 12
Missing files: 2


In [4]:
# Print the full metadata record of every document whose file was not found
for record in eur_lex_documents:
    if record["celex"] not in missing_files:
        continue
    print(record)

{'title': 'EU Institutions Data Protection Regulation', 'celex': '02018R1725-20181212', 'category': 'data_protection', 'type': 'regulation', 'url': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02018R1725-20181212'}
{'title': 'EU Taxonomy Regulation (Sustainable Finance)', 'celex': '02020R0852-20230101', 'category': 'environmental', 'type': 'regulation', 'url': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02020R0852-20230101'}


## Create question/answer prompts and annotations

In [9]:
from pathlib import Path

# Subset folder holding the copied documents (same location as used for the copy step)
target_folder = Path("..", "data", "eng", "subset")

In [12]:
import json

from finetune_llms.annotation import QuestionAnswerGenerator

# Build one question/answer prompt per subset document and store it as a text
# file under <subset>/prompts, named after the source document.
prompts_folder = target_folder / "prompts"
prompts_folder.mkdir(exist_ok=True)
input_files = target_folder.glob("*_eng.json")

for file_path in input_files:
    print(f"Generating prompt for {file_path}")

    # Only the "content" field of the stored document is passed to the generator
    document = json.loads(file_path.read_text(encoding="utf-8"))
    generated = QuestionAnswerGenerator.generate_prompts(
        document.get("content"), use_chunking=False
    )

    # Same file name as the source document, with .json swapped for .txt
    prompt_path = prompts_folder / file_path.name.replace(".json", ".txt")

    # The generator result is indexed as [0][0]; only that element is written out
    prompt_path.write_text(generated[0][0], encoding="utf-8")

Generating prompt for ../data/eng/subset/02002L0058-20091219_eng.json
Generating prompt for ../data/eng/subset/02015L2366-20250117_eng.json
Generating prompt for ../data/eng/subset/02006R1907-20250623_eng.json
Generating prompt for ../data/eng/subset/02016R0679-20160504_eng.json
Generating prompt for ../data/eng/subset/02014L0065-20250117_eng.json
Generating prompt for ../data/eng/subset/02016L0680-20160504_eng.json
Generating prompt for ../data/eng/subset/02013L0036-20250117_eng.json
Generating prompt for ../data/eng/subset/02016R1011-20220101_eng.json
Generating prompt for ../data/eng/subset/02010L0075-20240804_eng.json
Generating prompt for ../data/eng/subset/02013R0575-20250629_eng.json
Generating prompt for ../data/eng/subset/02008L0098-20180705_eng.json
Generating prompt for ../data/eng/subset/02023R1115-20241226_eng.json


In [None]:
from finetune_llms.ollama_client import OllamaClient

# Generate question/answer annotations for every document in the subset with a
# model served by a local Ollama instance, and store the result as one JSON
# file per document under <subset>/annotations.
ollama_client = OllamaClient(base_url="http://localhost:11434", model="gemma3:12b")

annotations_folder = target_folder / "annotations"
annotations_folder.mkdir(exist_ok=True)
input_files = target_folder.glob("*_eng.json")

for file_path in input_files:
    print(f"Generating annotations for {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        doc_data = json.load(f)

    # The document text and its id are the only fields handed to the generator
    content = doc_data.get("content")
    annotations = QuestionAnswerGenerator.generate_annotations(
        ollama_client, content, doc_data.get("id")
    )

    # NOTE(review): this replacement drops the underscore before "eng", giving
    # "<celex>eng_qa_annotations.json" - confirm whether "<celex>_eng_qa_..."
    # was intended before changing it, since readers of these files may rely on it.
    annotations_filename = file_path.name.replace(
        "_eng.json", "eng_qa_annotations.json"
    )
    annotations_path = annotations_folder / annotations_filename

    # Write to this document's own annotations file. `prompt_path` from the
    # prompt-generation cell must not be used here: it points at a single
    # prompt .txt file, which every iteration would overwrite with JSON.
    with open(annotations_path, "w", encoding="utf-8") as f:
        json.dump(annotations, f, indent=2)

## Create all annotations

In [None]:
from finetune_llms.annotation import (
    QuestionAnswerGenerator,
    ObligationAnnotationGenerator,
)
from finetune_llms.ollama_client import OllamaClient

# Generate every annotation type for every document in the subset with a model
# served by a local Ollama instance. Each (document, annotation type) pair is
# written to its own JSON file under <subset>/annotations.
ollama_client = OllamaClient(base_url="http://localhost:11434", model="gemma3:12b")

annotations_folder = target_folder / "annotations"
annotations_folder.mkdir(exist_ok=True)

# Path.glob yields its matches only once, but the files are iterated once per
# annotation type below. Materialise them into a list (sorted for a
# reproducible processing order) so that every annotation type sees all files.
input_files = sorted(target_folder.glob("*_eng.json"))

# Output file suffix -> generator class producing that kind of annotation
annotations_to_create = {
    "qa_annotations": QuestionAnswerGenerator,
    "obligation_annotations": ObligationAnnotationGenerator,
}

for annotations_name, annotations_class in annotations_to_create.items():
    for file_path in input_files:
        print(f"Generating annotations for {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            doc_data = json.load(f)

        # The document text and its id are the only fields handed to the generator
        content = doc_data.get("content")
        annotations = annotations_class.generate_annotations(
            ollama_client, content, doc_data.get("id")
        )

        # NOTE(review): this replacement drops the underscore before "eng",
        # giving "<celex>eng_<annotations_name>.json" - confirm this is intended.
        annotations_filename = file_path.name.replace(
            "_eng.json", f"eng_{annotations_name}.json"
        )
        annotations_path = annotations_folder / annotations_filename

        with open(annotations_path, "w", encoding="utf-8") as f:
            json.dump(annotations, f, indent=2)