# Data Preparation for obligation annotations

## Find and copy relevant subset of documents

In [2]:
# EUR-Lex Target Documents for Obligation Extraction
# Consolidated versions with CELEX numbers

eur_lex_documents = [
    # GDPR & Data Protection
    {
        "title": "General Data Protection Regulation",
        "celex": "02016R0679-20160504",
        "category": "data_protection",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02016R0679-20160504"
    },
    {
        "title": "ePrivacy Directive",
        "celex": "02002L0058-20091219",
        "category": "data_protection",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02002L0058-20091219"
    },
    {
        "title": "EU Institutions Data Protection Regulation",
        "celex": "02018R1725-20181212",
        "category": "data_protection",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02018R1725-20181212"
    },
    {
        "title": "Police Directive (Data Protection in Criminal Law)",
        "celex": "02016L0680-20160504",
        "category": "data_protection",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02016L0680-20160504"
    },
    
    # Environmental Law
    {
        "title": "EU Deforestation Regulation",
        "celex": "02023R1115-20241226",
        "category": "environmental",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02023R1115-20241226"
    },
    {
        "title": "Industrial Emissions Directive",
        "celex": "02010L0075-20240804",
        "category": "environmental",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02010L0075-20240804"
    },
    {
        "title": "REACH Regulation (Chemicals)",
        "celex": "02006R1907-20250623",
        "category": "environmental",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02006R1907-20250623"
    },
    {
        "title": "Waste Framework Directive",
        "celex": "02008L0098-20180705",
        "category": "environmental",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02008L0098-20180705"
    },
    {
        "title": "EU Taxonomy Regulation (Sustainable Finance)",
        "celex": "02020R0852-20230101",
        "category": "environmental",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02020R0852-20230101"
    },
    
    # Financial Services
    {
        "title": "MiFID II (Markets in Financial Instruments)",
        "celex": "02014L0065-20250117",
        "category": "financial_services",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02014L0065-20250117"
    },
    {
        "title": "Capital Requirements Regulation (CRR)",
        "celex": "02013R0575-20250629",
        "category": "financial_services",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02013R0575-20250629"
    },
    {
        "title": "Capital Requirements Directive (CRD IV)",
        "celex": "02013L0036-20250117",
        "category": "financial_services",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02013L0036-20250117"
    },
    {
        "title": "Payment Services Directive 2 (PSD2)",
        "celex": "02015L2366-20250117",
        "category": "financial_services",
        "type": "directive",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02015L2366-20250117"
    },
    {
        "title": "Benchmark Regulation",
        "celex": "02016R1011-20220101",
        "category": "financial_services",
        "type": "regulation",
        "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02016R1011-20220101"
    }
]

print(f"Total documents: {len(eur_lex_documents)}")
print(f"Data Protection: {len([d for d in eur_lex_documents if d['category'] == 'data_protection'])}")
print(f"Environmental: {len([d for d in eur_lex_documents if d['category'] == 'environmental'])}")
print(f"Financial Services: {len([d for d in eur_lex_documents if d['category'] == 'financial_services'])}")

Total documents: 14
Data Protection: 4
Environmental: 5
Financial Services: 5


In [3]:
import shutil
from pathlib import Path

# Define paths
corpus_folder = Path("../../corpus-crawler/documents/eng")
target_folder = Path("../data/eng/subset")

# Create target directory if it doesn't exist
target_folder.mkdir(parents=True, exist_ok=True)

print(f"Source folder: {corpus_folder}")
print(f"Target folder: {target_folder}")
print(f"Source folder exists: {corpus_folder.exists()}")
print(f"Target folder exists: {target_folder.exists()}")

Source folder: ../../corpus-crawler/documents/eng
Target folder: ../data/eng/subset
Source folder exists: True
Target folder exists: True


In [4]:
# Copy target documents from corpus to subset folder
copied_files = []
missing_files = []

for doc in eur_lex_documents:
    celex = doc['celex']
    filename = f"{celex}_eng.json"
    source_path = corpus_folder / filename
    target_path = target_folder / filename
    
    if source_path.exists():
        shutil.copy2(source_path, target_path)
        copied_files.append(celex)
        print(f"✓ Copied: {filename}")
    else:
        missing_files.append(celex)
        print(f"✗ Missing: {filename}")

print("\nSummary:")
print(f"Copied files: {len(copied_files)}")
print(f"Missing files: {len(missing_files)}")

✓ Copied: 02016R0679-20160504_eng.json
✓ Copied: 02002L0058-20091219_eng.json
✗ Missing: 02018R1725-20181212_eng.json
✓ Copied: 02016L0680-20160504_eng.json
✓ Copied: 02023R1115-20241226_eng.json
✓ Copied: 02010L0075-20240804_eng.json
✓ Copied: 02006R1907-20250623_eng.json
✓ Copied: 02008L0098-20180705_eng.json
✗ Missing: 02020R0852-20230101_eng.json
✓ Copied: 02014L0065-20250117_eng.json
✓ Copied: 02013R0575-20250629_eng.json
✓ Copied: 02013L0036-20250117_eng.json
✓ Copied: 02015L2366-20250117_eng.json
✓ Copied: 02016R1011-20220101_eng.json

Summary:
Copied files: 12
Missing files: 2


In [5]:
for document in eur_lex_documents:
    if document["celex"] in missing_files:
        print(document)

{'title': 'EU Institutions Data Protection Regulation', 'celex': '02018R1725-20181212', 'category': 'data_protection', 'type': 'regulation', 'url': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02018R1725-20181212'}
{'title': 'EU Taxonomy Regulation (Sustainable Finance)', 'celex': '02020R0852-20230101', 'category': 'environmental', 'type': 'regulation', 'url': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02020R0852-20230101'}


## Create obligation annotations for these documents

In [6]:
from finetune_llms.ollama_client import OllamaClient
import json
import logging
from datetime import datetime

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Ollama client
ollama_client = OllamaClient(
    base_url="http://localhost:11434", 
    model="llama3.1:8b"  # Change model as needed
)

# Test connection
print("Testing Ollama connection...")
if ollama_client.test_connection():
    print("✓ Ollama connection successful")
else:
    print("✗ Ollama connection failed - make sure Ollama is running and the model is available")
    print("Run: ollama pull llama3.1:8b")

INFO:finetune_llms.ollama_client:Ollama connection successful. Model llama3.1:8b is available.


Testing Ollama connection...
✓ Ollama connection successful


In [8]:
# Generate obligation annotations for all copied documents
annotations_folder = target_folder / "annotations"
annotations_folder.mkdir(exist_ok=True)

all_annotations = []
processed_docs = 0
failed_docs = []

print(f"Starting annotation generation for {len(copied_files)} documents...")
print(f"Annotations will be saved to: {annotations_folder}")

for celex in copied_files:
    filename = f"{celex}_eng.json"
    file_path = target_folder / filename
    
    print(f"\n--- Processing {celex} ---")
    print(f"File: {filename}")
    
    # Load document content
    with open(file_path, 'r', encoding='utf-8') as f:
        doc_data = json.load(f)
    
    content = doc_data.get('content')
    
    print(f"Document length: {len(content)} characters")
    
    # Generate annotations using Ollama
    obligations = ollama_client.generate_obligation_annotations(
        text=content,
        document_id=celex
    )
    
    print(f"Found {len(obligations)} obligations")
    
    # Create annotation record
    annotation_record = {
        'celex': celex,
        'annotations': obligations,
        'timestamp': datetime.now().isoformat(),
        'model_used': ollama_client.model,
        'total_obligations': len(obligations)
    }
    
    # Save individual annotation file
    annotation_filename = f"{celex}_annotations.json"
    annotation_path = annotations_folder / annotation_filename
    
    with open(annotation_path, 'w', encoding='utf-8') as f:
        json.dump(annotation_record, f, indent=2, ensure_ascii=False)
    
    all_annotations.extend(obligations)
    processed_docs += 1
    
    print(f"✓ Saved annotations to {annotation_filename}")

print("\n=== ANNOTATION SUMMARY ===")
print(f"Documents processed: {processed_docs}/{len(copied_files)}")
print(f"Failed documents: {len(failed_docs)}")
print(f"Total obligations extracted: {len(all_annotations)}")

if failed_docs:
    print("\nFailed documents:")
    for failed in failed_docs:
        print(f"  - {failed['doc']['title']}: {failed['reason']}")

INFO:finetune_llms.ollama_client:Processing chunk 1/56 for document 02016R0679-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 0 for document 02016R0679-20160504: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 2/56 for document 02016R0679-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 1 for document 02016R0679-20160504: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 3/56 for document 02016R0679-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Interna

Starting annotation generation for 12 documents...
Annotations will be saved to: ../data/eng/subset/annotations

--- Processing 02016R0679-20160504 ---
File: 02016R0679-20160504_eng.json
Document length: 194349 characters


ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 3 for document 02016R0679-20160504: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 5/56 for document 02016R0679-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 4 for document 02016R0679-20160504: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 6/56 for document 02016R0679-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_c

Found 0 obligations
✓ Saved annotations to 02016R0679-20160504_annotations.json

--- Processing 02002L0058-20091219 ---
File: 02002L0058-20091219_eng.json
Document length: 33699 characters


ERROR:finetune_llms.ollama_client:Failed to process chunk 4 for document 02002L0058-20091219: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 6/10 for document 02002L0058-20091219
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 5 for document 02002L0058-20091219: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 7/10 for document 02002L0058-20091219
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 6 for document 02002L0058-20091219: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/gene

Found 0 obligations
✓ Saved annotations to 02002L0058-20091219_annotations.json

--- Processing 02016L0680-20160504 ---
File: 02016L0680-20160504_eng.json
Document length: 89454 characters


ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 4 for document 02016L0680-20160504: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 6/26 for document 02016L0680-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 5 for document 02016L0680-20160504: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 7/26 for document 02016L0680-20160504
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_c

Found 0 obligations
✓ Saved annotations to 02016L0680-20160504_annotations.json

--- Processing 02023R1115-20241226 ---
File: 02023R1115-20241226_eng.json
Document length: 102170 characters


ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 4 for document 02023R1115-20241226: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 6/28 for document 02023R1115-20241226
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 5 for document 02023R1115-20241226: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 7/28 for document 02023R1115-20241226
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_c

Found 0 obligations
✓ Saved annotations to 02023R1115-20241226_annotations.json

--- Processing 02010L0075-20240804 ---
File: 02010L0075-20240804_eng.json
Document length: 312227 characters


ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 4 for document 02010L0075-20240804: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 6/76 for document 02010L0075-20240804
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 5 for document 02010L0075-20240804: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 7/76 for document 02010L0075-20240804
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_c

Found 0 obligations
✓ Saved annotations to 02010L0075-20240804_annotations.json

--- Processing 02006R1907-20250623 ---
File: 02006R1907-20250623_eng.json
Document length: 1171735 characters


ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 4 for document 02006R1907-20250623: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 6/319 for document 02006R1907-20250623
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama_client:Failed to process chunk 5 for document 02006R1907-20250623: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
INFO:finetune_llms.ollama_client:Processing chunk 7/319 for document 02006R1907-20250623
ERROR:finetune_llms.ollama_client:Ollama API request failed: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate
ERROR:finetune_llms.ollama

KeyboardInterrupt: 

In [11]:
# Generate example prompts for each document (without running full annotation)
prompts_folder = target_folder
prompts_folder.mkdir(exist_ok=True)

for celex in copied_files:
  filename = f"{celex}_eng.json"
  file_path = target_folder / filename
  
  print(f"Generating prompt for {celex}")
  
  # Load document content
  with open(file_path, 'r', encoding='utf-8') as f:
      doc_data = json.load(f)
  
  content = doc_data.get('content')
  
  # Create prompt using the same method as OllamaClient
  prompt = f"""You are a legal AI assistant specializing in EU law. Extract legal obligations from the provided EUR-Lex text.

Focus on requirements (what must be done) and prohibitions (what must not be done).

Return your response as valid JSON with the following structure for each obligation found:
{{
  "obligations": [
    {{
      "type": "requirement|prohibition",
      "description": "Clear summary of what must/must not be done",
      "scope_subject": "Who is obligated, who must comply with the rules",
      "scope_affected_parties": "Those who need to be aware because the obligation impacts them, even if they're not directly obligated",
      "context": "When it applies (simplified conditions)"
    }}
  ]
}}

Only return valid JSON. If no clear obligations are found, return {{"obligations": []}}.


TEXT:

{content}

JSON:"""
        
  # Save prompt to text file
  prompt_filename = f"{celex}_eng_prompt.txt"
  prompt_path = prompts_folder / prompt_filename
  
  with open(prompt_path, 'w', encoding='utf-8') as f:
      f.write(prompt)

Generating prompt for 02016R0679-20160504
Generating prompt for 02002L0058-20091219
Generating prompt for 02016L0680-20160504
Generating prompt for 02023R1115-20241226
Generating prompt for 02010L0075-20240804
Generating prompt for 02006R1907-20250623
Generating prompt for 02008L0098-20180705
Generating prompt for 02014L0065-20250117
Generating prompt for 02013R0575-20250629
Generating prompt for 02013L0036-20250117
Generating prompt for 02015L2366-20250117
Generating prompt for 02016R1011-20220101


In [None]:
# Create consolidated dataset and analysis
dataset_path = target_folder / "obligation_dataset.json"
analysis_path = target_folder / "dataset_analysis.json"

# Prepare the consolidated dataset
dataset = {
    'metadata': {
        'created_at': datetime.now().isoformat(),
        'total_documents': processed_docs,
        'total_obligations': len(all_annotations),
        'model_used': ollama_client.model,
        'categories': {
            'data_protection': len([a for a in all_annotations if any(d['celex'] == a['document_id'] and d['category'] == 'data_protection' for d in copied_files)]),
            'environmental': len([a for a in all_annotations if any(d['celex'] == a['document_id'] and d['category'] == 'environmental' for d in copied_files)]),
            'financial_services': len([a for a in all_annotations if any(d['celex'] == a['document_id'] and d['category'] == 'financial_services' for d in copied_files)])
        }
    },
    'annotations': all_annotations
}

# Save consolidated dataset
with open(dataset_path, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

# Generate analysis
obligation_types = {}
subjects = {}
categories_stats = {}

for annotation in all_annotations:
    # Count obligation types
    obligation_type = annotation.get('type', 'unknown')
    obligation_types[obligation_type] = obligation_types.get(obligation_type, 0) + 1
    
    # Count subjects (first 50 chars for grouping)
    subject = annotation.get('scope_subject', 'unknown')[:50]
    subjects[subject] = subjects.get(subject, 0) + 1
    
    # Category analysis
    doc_id = annotation.get('document_id', '')
    for doc in copied_files:
        if doc['celex'] == doc_id:
            cat = doc['category']
            if cat not in categories_stats:
                categories_stats[cat] = {'total': 0, 'requirements': 0, 'prohibitions': 0}
            categories_stats[cat]['total'] += 1
            if obligation_type == 'requirement':
                categories_stats[cat]['requirements'] += 1
            elif obligation_type == 'prohibition':
                categories_stats[cat]['prohibitions'] += 1
            break

analysis = {
    'summary': {
        'total_obligations': len(all_annotations),
        'processed_documents': processed_docs,
        'failed_documents': len(failed_docs)
    },
    'obligation_types': obligation_types,
    'top_subjects': dict(sorted(subjects.items(), key=lambda x: x[1], reverse=True)[:10]),
    'categories_breakdown': categories_stats,
    'avg_obligations_per_doc': len(all_annotations) / processed_docs if processed_docs > 0 else 0
}

# Save analysis
with open(analysis_path, 'w', encoding='utf-8') as f:
    json.dump(analysis, f, indent=2, ensure_ascii=False)

print(f"✓ Dataset saved to: {dataset_path}")
print(f"✓ Analysis saved to: {analysis_path}")
print("\n=== DATASET ANALYSIS ===")
print(f"Total obligations: {analysis['summary']['total_obligations']}")
print(f"Average obligations per document: {analysis['avg_obligations_per_doc']:.1f}")
print("\nObligation types:")
for type_name, count in obligation_types.items():
    print(f"  {type_name}: {count}")
print("\nTop subjects:")
for subject, count in list(analysis['top_subjects'].items())[:5]:
    print(f"  {subject}: {count}")