In [2]:
import xml.etree.ElementTree as ET
import requests
import os
import json
from datetime import datetime
from pathlib import Path


In [3]:
from pathlib import Path
import requests

# Remote MedlinePlus health-topics XML file and the local path it is cached at.
xml_url = "https://medlineplus.gov/xml/mplus_topics_2025-04-24.xml"
xml_path = Path("E:/MiiHA/app/data/raw/medlineplus_healthtopics.xml")


# Always recreate if file is empty or missing
if not xml_path.exists() or xml_path.stat().st_size == 0:
    print("⚠️ Existing XML file is empty or missing — redownloading...")
    # requests has no default timeout; without one a stalled connection
    # would block this cell forever.
    response = requests.get(xml_url, timeout=60)
    # Fail loudly on HTTP errors instead of caching an error page as XML.
    response.raise_for_status()
    # write_bytes does not create missing directories, so make sure the
    # target folder exists before saving.
    xml_path.parent.mkdir(parents=True, exist_ok=True)
    xml_path.write_bytes(response.content)
    print("✅ XML downloaded and saved:", xml_path)
else:
    print("📁 XML already exists and is non-empty")



📁 XML already exists and is non-empty


In [4]:

import json

xml_path = Path("E:/MiiHA/app/data/raw/medlineplus_healthtopics.xml")
output_path = Path("E:/MiiHA/app/data/processed/medlineplus_health_topics.jsonl")


def _texts(elements):
    """Return the text of each element, skipping elements with no (or blank) text.

    An empty element such as ``<also-called/>`` has ``text is None``; keeping it
    would put ``null`` entries into what are otherwise lists of strings.
    """
    return [el.text for el in elements if el.text and el.text.strip()]


def _parse_site(site):
    """Convert one ``<site>`` element into a plain dict.

    ``findtext`` returns the text of the first matching child only (or None
    when there is no such child), so at most one category and one organization
    are recorded per site.
    """
    return {
        "title": site.attrib.get("title"),
        "url": site.attrib.get("url"),
        "category": site.findtext("information-category"),
        "organization": site.findtext("organization"),
    }


def _parse_topic(topic):
    """Convert one ``<health-topic>`` element into a JSON-serializable dict.

    Scalar fields are None when the attribute or child element is absent;
    list fields contain only non-empty strings.
    """
    descriptors = (m.findtext("descriptor") for m in topic.findall("mesh-heading"))
    return {
        "id": topic.attrib.get("id"),
        "title": topic.attrib.get("title"),
        "url": topic.attrib.get("url"),
        "language": topic.attrib.get("language", "English"),
        "date_created": topic.attrib.get("date-created"),
        "summary": topic.findtext("full-summary"),
        "also_called": _texts(topic.findall("also-called")),
        "groups": _texts(topic.findall("group")),
        # Skip mesh headings that have no <descriptor> child or an empty one.
        "mesh_terms": [d for d in descriptors if d and d.strip()],
        "see_references": _texts(topic.findall("see-reference")),
        "primary_institute": topic.findtext("primary-institute"),
        "source_sites": [_parse_site(site) for site in topic.findall("site")],
    }


tree = ET.parse(xml_path)
root = tree.getroot()

parsed_records = []

for topic in root.findall(".//health-topic"):
    # ONLY ENGLISH TOPICS; a missing language attribute is treated as English.
    if topic.attrib.get("language", "English") != "English":
        continue
    parsed_records.append(_parse_topic(topic))


# Save to JSONL
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    for record in parsed_records:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(record, f, ensure_ascii=False)
        f.write("\n")

print(f" Parsed and saved {len(parsed_records)} topics to {output_path}")


 Parsed and saved 1017 topics to E:\MiiHA\app\data\processed\medlineplus_health_topics.jsonl


✅ Saved processed data to: app\data\processed\medlineplus_health_topics.jsonl


✅ Saved to medlineplus_health_topics.jsonl
