In [1]:
!pip install fastapi uvicorn requests python-dotenv pydantic pydantic-settings loguru



In [2]:
!pip install httpx



In [3]:
import os
from pathlib import Path

# Load BaFin + Firecrawl secrets from the repo env file into os.environ.
# Expected line format is KEY=VALUE; the value may be wrapped in double quotes.
# A missing file still raises FileNotFoundError so the problem surfaces here
# rather than as an authentication failure in a later cell.
env_path = Path("external_sources/.env")
for line in env_path.read_text(encoding="utf-8").splitlines():
    entry = line.strip()
    # Skip blank / whitespace-only lines, comments, and anything that is not an
    # assignment; previously such lines crashed the tuple-unpack below.
    if not entry or entry.startswith("#") or "=" not in entry:
        continue
    key, value = entry.split("=", 1)
    key = key.strip()  # tolerate "KEY = value" style spacing
    if not key:
        # "=value" has no usable variable name.
        continue
    os.environ[key] = value.strip().strip('"')

# Placeholder values for the environment variables that service.models.Settings
# reads; they are written unconditionally and overwrite anything already set.
service_env_overrides = {
    "SERVICE_PHARIA_KERNEL_URL": "http://localhost",
    "SERVICE_PHARIA_STUDIO_URL": "http://localhost",
    "SERVICE_PHARIA_AUTH_SERVICE_URL": "http://localhost/auth",
    "SERVICE_PHARIA_IAM_ISSUER_URL": "http://localhost/issuer",
    "SERVICE_PHARIA_DATA_URL": "http://localhost/data",
    "SERVICE_PHARIA_DATA_STAGE_NAME": "dev",
    "SERVICE_STORAGE_TYPE": "filesystem",
    "SERVICE_COMPLETION_MODEL_NAME": "dummy",
    "SERVICE_DATABASE_URL": "postgresql://user:pass@localhost:5432/db",
    "MY_PROVIDER_BASE_URL": "http://localhost:8000",
}
os.environ.update(service_env_overrides)

# Make sure Python resolves the service package.
# The path is relative, so it is resolved against the kernel's working directory.
import sys

service_src_dir = "apps/law_monitoring/service/src"
# Guard against duplicates so re-running this cell leaves sys.path unchanged.
if service_src_dir not in sys.path:
    sys.path.append(service_src_dir)


In [None]:
import asyncio
import threading
import uvicorn
import nest_asyncio

# Apply the nest_asyncio patch before the server thread below is created.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop; confirm it is still required given the server runs in its own thread.
nest_asyncio.apply()

from external_sources.http_server.app import app

import time  # only needed here, for the startup wait loop


def _make_server():
    """Build the uvicorn server for ``app`` with this notebook's settings."""
    # NOTE: 0.0.0.0 listens on every network interface, not just loopback.
    config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
    return uvicorn.Server(config)


def run_server(server=None):
    """Serve ``app`` with uvicorn, blocking until the server exits.

    Args:
        server: Optional pre-built ``uvicorn.Server``. When omitted, a server
            with the notebook settings (0.0.0.0:8000, log level "info") is
            created, which matches the original zero-argument behaviour.
    """
    if server is None:
        server = _make_server()
    # Intended to run in a worker thread, where asyncio.run creates a fresh loop.
    asyncio.run(server.serve())


# Re-running this cell must not spawn a second server: the port is still held
# by the thread started earlier in this kernel session.
_previous_thread = globals().get("server_thread")
if _previous_thread is not None and _previous_thread.is_alive():
    print("Server thread is already running; reusing it.")
else:
    server = _make_server()
    server_thread = threading.Thread(target=run_server, args=(server,), daemon=True)
    server_thread.start()

    # Block until uvicorn reports it is accepting connections, so the cells
    # below can issue requests immediately under "Run All".
    _startup_deadline = time.monotonic() + 30
    while not server.started:
        if not server_thread.is_alive():
            raise RuntimeError("uvicorn server thread exited before startup completed")
        if time.monotonic() > _startup_deadline:
            raise TimeoutError("uvicorn server did not start within 30 seconds")
        time.sleep(0.05)


[32m2025-10-08 17:03:06.082[0m | [1mINFO    [0m | [36mservice.dependencies[0m:[36mwith_settings[0m:[36m17[0m - [1mLoaded settings: enable_cors=True enable_partner_button=False pharia_kernel_url=HttpUrl('http://localhost/') pharia_studio_url=HttpUrl('http://localhost/') pharia_auth_service_url='http://localhost/auth' pharia_iam_issuer_url=HttpUrl('http://localhost/issuer') pharia_data_url=HttpUrl('http://localhost/data') pharia_data_stage_name='dev' storage_type='filesystem' completion_model_name='dummy' database_url=SecretPostgresDsn('**********') tenant_id=None[0m


INFO:     Started server process [65269]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
Failed to crawl https://www.bafin.de/SharedDocs/Veroeffentlichungen/DE/Verbrauchermitteilung/unerlaubte/2025/neu/meldung_2025_10_06_geldpro_ai.html: 429 Client Error: Too Many Requests for url: https://api.firecrawl.dev/v1/scrape


INFO:     127.0.0.1:40576 - "GET /legal-acts?refresh=1 HTTP/1.1" 200 OK
INFO:     127.0.0.1:39358 - "GET /legal-acts/bafin-lex-lim-com-bafin-warns-consumer-88bcf78c58f7c8aa HTTP/1.1" 200 OK


In [5]:
from datetime import UTC, datetime, timedelta
from external_sources.my_provider import fetch_legal_acts

# Query window: the seven days leading up to the current UTC instant.
end = datetime.now(UTC)
start = end - timedelta(weeks=1)
# Materialise at most five results into a list so they can be iterated again later.
acts = [*fetch_legal_acts(start, end, limit=5)]
acts


[LegalAct(expression_url='http://localhost:8000/legal-acts/bafin-lex-lim-com-bafin-warns-consumer-88bcf78c58f7c8aa', title='lex-lim(.)com:\xa0BaFin warns consumers about website and suspected identity fraud', pdf_url='', eurovoc_labels=['banking regulation', 'bafin'], document_type='notice', document_type_label=<DocumentTypeLabel.NOTICE: 'Notice'>, oj_series_label=<OfficialJournalSeries.UNKNOWN: 'Unknown'>, publication_date=datetime.datetime(2025, 10, 8, 13, 44, tzinfo=datetime.timezone.utc), document_date=None, effect_date=None, end_validity_date=None, notification_date=None),
 LegalAct(expression_url='http://localhost:8000/legal-acts/bafin-deutsche-sparkassen-leasing-ag-c-a643ffb8a4f00e41', title='Deutsche Sparkassen Leasing AG & Co. KG: BaFin ordnet Sicherstellung der ordnungsgemäßen Geschäftsorganisation an', pdf_url='', eurovoc_labels=['banking regulation', 'bafin'], document_type='notice', document_type_label=<DocumentTypeLabel.NOTICE: 'Notice'>, oj_series_label=<OfficialJournalS

In [6]:
import requests

from pprint import pprint  # stdlib; used to actually show the preview

# (connect, read) timeouts in seconds. The read timeout is generous because
# refresh=1 makes the server re-crawl before it answers.
REQUEST_TIMEOUT = (5, 300)

# 1) JSON listing of legal acts, forcing a refresh on the server side.
json_resp = requests.get(
    "http://localhost:8000/legal-acts",
    params={"refresh": "1"},
    timeout=REQUEST_TIMEOUT,
)
json_resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
legal_acts = json_resp.json()  # parse once and reuse
pprint(legal_acts[:2])  # preview (a bare mid-cell expression is never displayed)

if not legal_acts:
    raise RuntimeError("GET /legal-acts returned an empty list; no act to fetch")

# 2) HTML rendering of the first act, addressed by its expression_url.
first_url = legal_acts[0]["expression_url"]
html_resp = requests.get(first_url, timeout=REQUEST_TIMEOUT)
html_resp.raise_for_status()
html_resp.text[:500]


'<!doctype html>\n            <html lang="en">\n              <head>\n                <meta charset="utf-8" />\n                <title>lex-lim(.)com:\xa0BaFin warns consumers about website and suspected identity fraud</title>\n              </head>\n              <body>\n                <article\n                  data-law-monitoring="legal-act"\n                  data-expression-url="http://localhost:8000/legal-acts/bafin-lex-lim-com-bafin-warns-consumer-88bcf78c58f7c8aa"\n                  data-source-id="b'

In [7]:
from service.models import LegalAct
from pprint import pprint

# Re-validate every fetched act against service.models.LegalAct and collect a
# compact per-row summary for display.
validation_results = []
for index, act in enumerate(acts, 1):
    # Dump to plain data and validate again: raises if the schema rejects it.
    validated = LegalAct.model_validate(act.model_dump())
    summary = dict(
        row=index,
        title=validated.title,
        publication_date=validated.publication_date.isoformat(),
        document_type=validated.document_type,
        # Falsy pdf_url values are shown as a readable placeholder.
        pdf_url=validated.pdf_url if validated.pdf_url else "<none>",
    )
    validation_results.append(summary)

pprint(validation_results)


[{'document_type': 'notice',
  'pdf_url': '<none>',
  'publication_date': '2025-10-08T13:44:00+00:00',
  'row': 1,
  'title': 'lex-lim(.)com:\xa0BaFin warns consumers about website and '
           'suspected identity fraud'},
 {'document_type': 'notice',
  'pdf_url': '<none>',
  'publication_date': '2025-10-08T07:00:00+00:00',
  'row': 2,
  'title': 'Deutsche Sparkassen Leasing AG & Co. KG: BaFin ordnet '
           'Sicherstellung der ordnungsgemäßen Geschäftsorganisation an'},
 {'document_type': 'notice',
  'pdf_url': '<none>',
  'publication_date': '2025-10-07T12:13:00+00:00',
  'row': 3,
  'title': 'H&W Invest: BaFin warns consumers about the website\xa0'
           'hw-invest(.)pro'},
 {'document_type': 'notice',
  'pdf_url': 'https://www.bafin.de/SharedDocs/Downloads/DE/Konsultation/2025/neu/dl_kon_14_2025_stellungnahme_aba.pdf?__blob=publicationFile&v=2',
  'publication_date': '2025-10-07T12:04:07+00:00',
  'row': 4,
  'title': 'Stellungnahme aba'},
 {'document_type': 'notice',