In [None]:
import json
from pathlib import Path

CACHE_FILE = "summarized_cases.json"


In [None]:
def load_cached_urls():
    if Path(CACHE_FILE).exists():
        with open(CACHE_FILE, "r") as f:
            return set(json.load(f))
    return set()

def save_cached_urls(urls):
    with open(CACHE_FILE, "w") as f:
        json.dump(list(urls), f)


In [None]:
cached_urls = load_cached_urls()
new_items = [item for item in items if item['link'] not in cached_urls]


In [2]:
# Run this if you're on Colab or a new environment
!pip install feedparser PyMuPDF openai


Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-win_amd64.whl (18.5 MB)
   ---------------------------------------- 0.0/18.5 MB ? eta -:--:--
   ------- -------------------------------- 3.4/18.5 MB 22.5 MB/s eta 0:00:01
   -------------- ------------------------- 6.6/18.5 MB 16.8 MB/s eta 0:00:01
   -------------------------------- ------- 15.2/18.5 MB 25.2 MB/s eta 0:00:01
   ---------------------------------------- 18.5/18.5 MB 26.6 MB/s eta 0:00:00
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1


In [3]:
import feedparser
import requests
import fitz  # PyMuPDF
import openai
import os
from dotenv import load_dotenv


In [8]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load API key
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-') and len(api_key) > 10:
    print("✅ API key looks valid")
else:
    print("❌ Invalid API key. Please check .env file")

# Initialize OpenAI client
openai = OpenAI(api_key=api_key)

# Model to use
MODEL = 'gpt-4o'


✅ API key looks valid


In [9]:
response = openai.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "user", "content": "Summarize this legal text: A doctor received a complaint after disclosing information without patient consent."}
    ]
)

print(response.choices[0].message.content.strip())


A doctor faced a complaint for sharing patient information without obtaining prior consent from the patient.


In [10]:
## --- fetch and parse the RSS feed
import requests
import xml.etree.ElementTree as ET

def fetch_rss_items(url="https://feeds.tuchtcollege-gezondheidszorg.nl/nieuws.rss"):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"❌ Failed to fetch RSS feed: {response.status_code}")
    
    root = ET.fromstring(response.content)
    items = root.findall(".//item")
    
    results = []
    for item in items:
        title = item.find("title").text
        link = item.find("link").text
        pub_date = item.find("pubDate").text
        results.append({"title": title, "link": link, "date": pub_date})
    
    return results

# Test it
rss_items = fetch_rss_items()
for item in rss_items[:5]:  # show first 5
    print(f"- {item['title']} ({item['date']})\n  {item['link']}\n")


- Plastisch chirurg één jaar geschorst vanwege seksueel grensoverschrijdend gedrag en onvoldoende medische zorg (Wed, 11 Jun 2025 12:00:00 GMT)
  https://www.tuchtcollege-gezondheidszorg.nl/actueel/nieuws/2025/06/11/plastisch-chirurg-een-jaar-geschorst-vanwege-seksueel-grensoverschrijdend-gedrag-en-onvoldoende-medische-zorg

- Berisping psychiater en GZ-psycholoog gehandhaafd voor onvoldoende helder en consistent rapport (Wed, 23 Apr 2025 11:15:00 GMT)
  https://www.tuchtcollege-gezondheidszorg.nl/actueel/nieuws/2025/04/23/berisping-psychiater-en-gz-psycholoog-gehandhaafd-voor-onheldere-formulering-rapportage

- Doorlooptijden tuchtcolleges verkort (Mon, 31 Mar 2025 10:10:00 GMT)
  https://www.tuchtcollege-gezondheidszorg.nl/actueel/nieuws/2025/03/31/jaarverslag-2024-gepubliceerd

- BIG-registratie internist doorgehaald vanwege seksueel wangedrag bij twee jonge patiëntes en arts-assistente (Fri, 07 Mar 2025 12:00:00 GMT)
  https://www.tuchtcollege-gezondheidszorg.nl/actueel/nieuws/2025

In [11]:
###---extract the court decision content --

from bs4 import BeautifulSoup
import time

def get_case_text(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"❌ Failed to fetch: {url}")
        return ""
    
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Most rulings are in <div class="field field--name-body ..."> or similar
    content_div = soup.find("div", class_="field--name-body")
    if content_div:
        return content_div.get_text(separator="\n", strip=True)
    
    # Fallback if structure changes
    return soup.get_text()

# Optional: Preview one
test_url = rss_items[0]["link"]
test_text = get_case_text(test_url)
print(test_text[:1000])  # just preview first 1000 chars











Plastisch chirurg één jaar geschorst vanwege seksueel grensoverschrijdend gedrag en onvoldoende medische zorg | Nieuwsbericht | Tuchtcolleges voor de gezondheidszorg


































Ga direct naar inhoud












U bevindt zich hier:
Home
Actueel
Nieuws
Plastisch chirurg één jaar geschorst vanwege seksueel grensoverschrijdend gedrag en onvoldoende medische zorg



Zoeken binnen Tuchtcolleges voor de gezondheidszorg


      Zoek







      Plastisch chirurg één jaar geschorst vanwege seksueel grensoverschrijdend gedrag en onvoldoende medische zorg

  Nieuwsbericht | 11-06-2025 | 14:00Den Haag - Het Centraal Tuchtcollege voor de Gezondheidszorg legt een plastisch chirurg een onvoorwaardelijke schorsing voor de duur van één jaar op vanwege seksueel grensoverschrijdend gedrag en onvoldoende medische zorg. De eerder bevolen doorhaling van de inschrijving in het BIG-register van de plastisch chirurg wordt vernietigd.
Beroep deels gegrond
De Inspectie voor Gezon

In [12]:
##--summarise the court decisions

def summarize_case(title, text, url):
    prompt = f"""
You are a legal writer summarizing disciplinary rulings for Dutch healthcare professionals. 

Summarize this court decision clearly in English for a medical audience. Include:
- Title (use the title provided)
- Summary (in 3–5 sentences max)
- Keep it clear, neutral, and informative.

Title: {title}
Court Text:
{text[:3500]}  # limit input to ~3500 characters
    """

    response = openai.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4
    )

    summary = response.choices[0].message.content.strip()
    return {
        "title": title,
        "summary": summary,
        "url": url
    }

# Test with one
test_summary = summarize_case(rss_items[0]["title"], test_text, rss_items[0]["link"])
print(test_summary["summary"])


**Title:** Plastic Surgeon Suspended for One Year Due to Sexual Misconduct and Inadequate Medical Care

**Summary:** The Central Disciplinary Board for Healthcare in the Netherlands has imposed a one-year suspension on a plastic surgeon for sexual misconduct and inadequate medical care. Initially, the Regional Disciplinary Board in Zwolle had ordered the removal of the surgeon from the BIG-register, but this decision was overturned on appeal. The incident involved inappropriate behavior during a video consultation, which led to a patient filing a complaint with the Health and Youth Care Inspectorate and the police. The surgeon's practice management, including constant availability via WhatsApp, was deemed medically irresponsible. Despite contesting the severity of the disciplinary action, the Central Board upheld the suspension as the maximum penalty it could impose.


In [14]:
def fetch_court_text(url):
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Typical location of court case body text on that site
    main = soup.find("main")
    if not main:
        raise ValueError("❌ Could not find main content on page")

    paragraphs = main.find_all(["p", "h2"])
    text = "\n".join(p.get_text(strip=True) for p in paragraphs)
    return text


In [15]:
from tqdm.notebook import tqdm

all_summaries = []

for item in tqdm(rss_items[:10]):  # Change 10 to len(rss_items) to process all
    title = item["title"]
    url = item["link"]

    if url in cached_urls:
        print(f"⏭️ Skipping already summarized: {url}")
        continue

    try:
        text = fetch_court_text(url)
        summary = summarize_case(title, text, url)
        all_summaries.append(summary)

        # ✅ Save link to prevent reprocessing
        cached_urls.add(url)
        save_cached_urls(cached_urls)

    except Exception as e:
        print(f"❌ Error for {url}: {e}")
        all_summaries.append({
            "title": title,
            "summary": f"Error processing case: {e}",
            "url": url
        })

# Export to .txt
with open("court_summaries.txt", "w", encoding="utf-8") as f:
    for case in all_summaries:
        f.write(f"Title: {case['title']}\n")
        f.write(f"Link: {case['url']}\n")
        f.write(f"{case['summary']}\n\n")

print(f"✅ Exported {len(all_summaries)} summaries to 'court_summaries.txt'")


  0%|          | 0/10 [00:00<?, ?it/s]

✅ Exported 10 summaries to 'court_summaries.txt'
