In [None]:
from bs4 import BeautifulSoup, NavigableString
import time
import json 
import requests
from tqdm import tqdm 

# Load the seed list of diseases; the scraping loop iterates over these records.
with open("./diseases.json", encoding="utf-8", mode="r") as seed_file:
    diseases_data = json.load(seed_file)

# Browser-like User-Agent header sent with every page request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

In [None]:
def extract_info_section(info_section):
    """Turn the labelled list items of an info box into a dictionary.

    Every ``<li>`` under ``info_section`` is inspected. Items without a
    ``div.li-wrap`` wrapper, or whose wrapper has no ``<strong>`` label, are
    skipped. The label text (with trailing colons removed) becomes the key.

    * For the ``"Categories"`` label the value is a list: one entry per
      direct-child ``span.text-bluegray`` of the wrapper, each entry being the
      space-joined text of the ``<span>`` elements nested inside it.
    * For any other label the value is a single string: the space-joined text
      of every ``span.text-bluegray`` found anywhere inside the wrapper.

    Args:
        info_section: The parsed container element, or a falsy value
            (e.g. ``None``) when the page has no such section.

    Returns:
        dict: Mapping of label to extracted value; empty when
        ``info_section`` is falsy. A label that appears more than once keeps
        the value of its last occurrence.
    """
    fields = {}
    if not info_section:
        return fields

    def joined_text(tags):
        # Space-separated, whitespace-stripped text of each tag.
        return ' '.join(tag.get_text(strip=True) for tag in tags)

    for item in info_section.find_all('li'):
        wrapper = item.find('div', class_='li-wrap')
        label_tag = wrapper.find('strong') if wrapper else None
        if not label_tag:
            # No wrapper or no label: nothing to record for this item.
            continue

        label = label_tag.get_text(strip=True).rstrip(':')

        if label != "Categories":
            fields[label] = joined_text(
                wrapper.find_all('span', class_='text-bluegray')
            )
            continue

        # Only direct children are category containers; the category name is
        # assembled from the spans nested inside each container.
        fields[label] = [
            joined_text(container.find_all('span'))
            for container in wrapper.find_all(
                'span', class_='text-bluegray', recursive=False
            )
        ]

    return fields

In [None]:
# Scrape each disease page and merge the page details into its seed record.
extended_data = []
# Diseases whose page could not be fetched. Kept so that entries missing from
# `extended_data` can be audited or retried instead of only being printed.
failed_diseases = []

# One Session for the whole run: the underlying connection is reused between
# requests and is closed deterministically when the `with` block exits.
with requests.Session() as session:
    for disease in tqdm(diseases_data, desc="Scraping diseases"):
        disease_name = disease["disease_name"]
        disease_url = disease["disease_url"]
        disease_nickname = disease["other_names"]

        # Only the network call can raise RequestException, so only it is
        # guarded; a failed page is logged, recorded and skipped.
        try:
            response = session.get(disease_url, headers=headers, timeout=10)
            response.encoding = 'utf-8'
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error scraping {disease_name}: {e}")
            failed_diseases.append({
                "disease_name": disease_name,
                "disease_url": disease_url,
                "error": str(e),
            })
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Free-text summary paragraph; "N/A" when the page has none.
        summary_tag = soup.find("div", class_="fs-md-18 mb-5")
        summary = summary_tag.get_text(strip=True) if summary_tag else "N/A"

        # Labelled fields of the info box; labels absent on the page fall
        # back to "N/A".
        info_data = extract_info_section(
            soup.find("div", class_="snippets-wrapper")
        )

        extended_data.append({
            "disease_name": disease_name,
            "disease_url": disease_url,
            "other_names": disease_nickname,
            "summary": summary,
            "population_estimate": info_data.get("Population Estimate", "N/A"),
            "symptoms": info_data.get("Symptoms", "N/A"),
            "cause": info_data.get("Cause", "N/A"),
            "organizations": info_data.get("Organizations", "N/A"),
            "categories": info_data.get("Categories", "N/A"),
        })

In [None]:
# Persist the scraped records as pretty-printed JSON, keeping non-ASCII
# characters as-is rather than escaping them.
with open("./disease_detailed.json", mode="w", encoding="utf-8") as output_file:
    json.dump(extended_data, output_file, ensure_ascii=False, indent=4)