In [1]:
%pip install -qU  pandas requests beautifulsoup4 webdriver-manager selenium


Note: you may need to restart the kernel to use updated packages.


### Scrape diseases from Africa CDC

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import string

def scrape_africa_cdc_diseases(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}. Error: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    disease_links = soup.find_all("div", class_="col-8 col-md-4")

    diseases = []
    for disease in disease_links:
        link = disease.find("a")
        if link:
            name = link.text.strip()
            href = link["href"]
            diseases.append((name, href))
    return diseases

africa_cdc_url = "https://africacdc.org/disease/"
africa_cdc_diseases = scrape_africa_cdc_diseases(africa_cdc_url)
diseases_df = pd.DataFrame(africa_cdc_diseases, columns=["Disease Name", "Link"])
# Print the scraped diseases
for name, link in africa_cdc_diseases:
    print(f"Disease: {name}")
    print(f"Link: {link}")
    print("-" * 50)



Disease: Anthrax
Link: https://africacdc.org/disease/anthrax/
--------------------------------------------------
Disease: Avian Influenza
Link: https://africacdc.org/disease/avian-influenza/
--------------------------------------------------
Disease: Chikungunya
Link: https://africacdc.org/disease/chikungunya/
--------------------------------------------------
Disease: Cholera
Link: https://africacdc.org/disease/cholera/
--------------------------------------------------
Disease: COVID-19
Link: https://africacdc.org/disease/covid-19/
--------------------------------------------------
Disease: Crimean-Congo Haemorrhagic Fever
Link: https://africacdc.org/disease/crimean-congo-haemorrhagic-fever/
--------------------------------------------------
Disease: Dengue Fever
Link: https://africacdc.org/disease/dengue-fever/
--------------------------------------------------
Disease: Ebola Virus Disease
Link: https://africacdc.org/disease/ebola-virus-disease/
-------------------------------------

In [3]:
diseases_df.head()

Unnamed: 0,Disease Name,Link
0,Anthrax,https://africacdc.org/disease/anthrax/
1,Avian Influenza,https://africacdc.org/disease/avian-influenza/
2,Chikungunya,https://africacdc.org/disease/chikungunya/
3,Cholera,https://africacdc.org/disease/cholera/
4,COVID-19,https://africacdc.org/disease/covid-19/


In [4]:
webmd = pd.read_csv("webmd_df.csv")

In [5]:
# Filtering WebMD dataset
filtered_webmd = webmd[webmd["Required"] == 1.0]

# Selecting the disease names from both DataFrames
disease_names_1 = diseases_df[['Disease Name']].rename(columns={'Disease Name': 'disease_name'})  # Standardizing name
disease_names_2 = filtered_webmd[['condition']].rename(columns={'condition': 'disease_name'})  # Standardizing name

# Ensuring both columns contain only string values
disease_names_1['disease_name'] = disease_names_1['disease_name'].astype(str)
disease_names_2['disease_name'] = disease_names_2['disease_name'].astype(str)

# Concatenating both DataFrames vertically (stacking them)
combined_disease_names = pd.concat([disease_names_1, disease_names_2], ignore_index=True)

print(combined_disease_names)  # Prints the DataFrame to the console

              disease_name
0                  Anthrax
1          Avian Influenza
2              Chikungunya
3                  Cholera
4                 COVID-19
..                     ...
57       Tuberculosis (TB)
58  Tuberculous Meningitis
59        Viral Meningitis
60            Yellow Fever
61                    Zika

[62 rows x 1 columns]


### Use Africa CDC Diseases as Reference for WebMd

In [6]:
# Define the keyword dictionary.
# The keys are the column names (for your output), and the values are lists of common words found in those sections.
keyword_dict = {
    "overview": ["overview","what is", "introduction", "background"],
    "symptoms": ["symptoms"],
    "causes": ["causes", "etiology", "risk factors","risk"],
    "diagnosis": ["diagnosis", "lab", "clinical test"],
    "treatment": ["treatment", "therapy"],
    "medication": ["medication", "prescription", "dosage", "administration"],
    "complications": ["complications"],
    "prognosis": ["prognosis", "outlook", "recovery"],
    "prevention": ["prevention", "protection"]
}

In [7]:
# Extract the first letter (in lowercase) for each disease
combined_disease_names['first_letter'] = combined_disease_names["disease_name"].str[0].str.lower()

# Create a dictionary mapping each letter to a list of disease names
disease_dict = combined_disease_names.groupby('first_letter')["disease_name"].apply(list).to_dict()
print("Disease dictionary:", disease_dict)

Disease dictionary: {'a': ['Anthrax', 'Avian Influenza', 'Arbovirus A Chikungunya Type', 'Avian Influenza (Bird Flu)'], 'b': ['Bacterial Meningitis', 'Bacterial Meningococcal Meningitis', 'Bird Flu (Avian Influenza)'], 'c': ['Chikungunya', 'Cholera', 'COVID-19', 'Crimean-Congo Haemorrhagic Fever', 'CHIK', 'Coronavirus (COVID-19)', 'Coronavirus (COVID-19), Complicaitons', 'Coronavirus (COVID-19) and Asthma', 'Coronavirus (COVID-19) and High Blood Pressure', 'Coronavirus (COVID-19) and Pregnancy', 'Coronavirus (COVID-19) in Children, Infants', 'Coronavirus (COVID-19), Recovery', 'Coronavirus (COVID-19), Symptoms', 'Coronavirus (COVID-19), Testing at Home', 'Coronavirus (COVID-19), Testing', 'COVID-19, Antibody Testing', 'Cryptococcic Meningitis'], 'd': ['Dengue Fever'], 'e': ['Ebola Virus Disease', 'Ebola', 'Epidemic Cerebrospinal Meningitis', 'Epidemic Cholera'], 'g': ['Glandular Plague'], 'h': ['Hepatitis B Virus (HBV)', 'Hepatitis C Virus', 'Hepatitis E Virus', 'HIV (Human Immunodefic

In [18]:
disease_dict

['Hepatitis B Virus (HBV)',
 'Hepatitis C Virus',
 'Hepatitis E Virus',
 'HIV (Human Immunodeficiency Virus)',
 'Hepatitis',
 'HIV',
 'Human Immunodeficiency Virus (HIV)']

In [9]:
# ===============================
# STEP 2: FIND DISEASE URLs ON WEBDMD
# ===============================

DEBUG = True

# Dummy user agent and headers
user_agent = "Educational"
headers = {'User-Agent': user_agent}

# Base URL for the WebMD A-to-Z health topics page
base_url = "https://www.webmd.com/a-to-z-guides/health-topics"

# Containers for results:
found_records = []  # List of dictionaries: {"Base Disease": ..., "Disease Name": ..., "URL": ..., "Letter": ...}
not_found = []      # List of disease names (from your dictionary) that were not found on the website

# Loop only through the letters in the disease_dict
for letter, disease_list in disease_dict.items():
    url = f"{base_url}?pg={letter}"
    print(f"\nFetching URL: {url}")
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        print(f"Successfully fetched page for letter '{letter}' (Status Code: {response.status_code})")
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Optional debugging: Print a snippet of the page text
        snippet = soup.get_text(separator=' ', strip=True)[:200]
        print("Page text snippet:", snippet)
        
        # Find all relevant <a> tags (with href and where data-metrics-link is explicitly False)
        relevant_tags = soup.find_all('a', href=True, attrs={'data-metrics-link': False})
        
        # For each disease in the current letter group, search in the relevant links
        for disease in disease_list:
            found_any = False
            for link in relevant_tags:
                link_text = link.get_text(strip=True)
                # Check if the base disease name appears anywhere in the link text (case-insensitive)
                if disease.lower() in link_text.lower():
                    found_records.append({
                        "Base Disease": disease,
                        "Disease Name": link_text,  # the full variant as shown on the page
                        "URL": link['href'],
                        "Letter": letter
                    })
                    print(f"Found disease variant for '{disease}' on page '{letter}': {link_text} - {link['href']}")
                    found_any = True
            if not found_any:
                print(f"Disease '{disease}' not found on page '{letter}'.")
                not_found.append(disease)
    else:
        print(f"Failed to fetch URL for letter '{letter}'. Status code: {response.status_code}")

# Create a DataFrame for the found diseases
found_df = pd.DataFrame(found_records)

print("\n===== Found Diseases =====")
print(found_df)

print("\n===== Diseases Not Found =====")
print(not_found)


Fetching URL: https://www.webmd.com/a-to-z-guides/health-topics?pg=a
Successfully fetched page for letter 'a' (Status Code: 200)
Page text snippet: All Health Topics Skip to main content Home Conditions Back Conditions View All ADD/ADHD Allergies Arthritis Atrial fibrillation Breast Cancer Cancer Crohn's Disease Depression Diabetes DVT Eczema Eye
Disease 'Anthrax' not found on page 'a'.
Found disease variant for 'Avian Influenza' on page 'a': Avian Influenza (Bird Flu) - https://www.webmd.com/cold-and-flu/flu-guide/what-know-about-bird-flu
Found disease variant for 'Arbovirus A Chikungunya Type' on page 'a': Arbovirus A Chikungunya Type - https://www.webmd.com/a-to-z-guides/what-is-chikungunya
Found disease variant for 'Avian Influenza (Bird Flu)' on page 'a': Avian Influenza (Bird Flu) - https://www.webmd.com/cold-and-flu/flu-guide/what-know-about-bird-flu

Fetching URL: https://www.webmd.com/a-to-z-guides/health-topics?pg=b
Successfully fetched page for letter 'b' (Status Code: 200)

In [10]:
found_df

Unnamed: 0,Base Disease,Disease Name,URL,Letter
0,Avian Influenza,Avian Influenza (Bird Flu),https://www.webmd.com/cold-and-flu/flu-guide/w...,a
1,Arbovirus A Chikungunya Type,Arbovirus A Chikungunya Type,https://www.webmd.com/a-to-z-guides/what-is-ch...,a
2,Avian Influenza (Bird Flu),Avian Influenza (Bird Flu),https://www.webmd.com/cold-and-flu/flu-guide/w...,a
3,Bacterial Meningitis,Bacterial Meningitis,https://www.webmd.com/children/understanding-m...,b
4,Bacterial Meningococcal Meningitis,Bacterial Meningococcal Meningitis,https://www.webmd.com/children/meningococcal-m...,b
...,...,...,...,...
59,Tuberculous Meningitis,Tuberculous Meningitis,https://www.webmd.com/children/understanding-m...,t
60,Viral Meningitis,Viral Meningitis,https://www.webmd.com/children/understanding-m...,v
61,Yellow Fever,Yellow Fever,https://www.webmd.com/a-to-z-guides/yellow-fev...,y
62,Yellow Fever,Yellow Fever,https://www.webmd.com/a-to-z-guides/yellow-fev...,y


In [11]:
found_df.columns

Index(['Base Disease', 'Disease Name', 'URL', 'Letter'], dtype='object')

In [17]:
found_df[found_df["Letter"]=="h"]

Unnamed: 0,Base Disease,Disease Name,URL,Letter
44,Hepatitis,Hepatitis,https://www.webmd.com/hepatitis/default.htm,h
45,HIV,HIV,https://www.webmd.com/hiv-aids/default.htm,h
46,HIV,Hives,https://www.webmd.com/skin-problems-and-treatm...,h
47,HIV,Human Immunodeficiency Virus (HIV),https://www.webmd.com/hiv-aids/default.htm,h
48,Human Immunodeficiency Virus (HIV),Human Immunodeficiency Virus (HIV),https://www.webmd.com/hiv-aids/default.htm,h


In [13]:
not_found

['Anthrax',
 'Chikungunya',
 'Cholera',
 'Crimean-Congo Haemorrhagic Fever',
 'Dengue Fever',
 'Ebola Virus Disease',
 'Hepatitis B Virus (HBV)',
 'Hepatitis C Virus',
 'Hepatitis E Virus',
 'HIV (Human Immunodeficiency Virus)',
 'Lassa Fever',
 'Malaria',
 'Marburg Virus Disease (MVD)',
 'Measles',
 'Meningococcal Meningitis',
 'Middle East Respiratory Syndrome',
 'Mpox (Monkeypox)',
 'Plague',
 'Poliomyelitis (Polio)',
 'Rift Valley Fever',
 'Zika Virus']

In [14]:
def extract_section_text(soup, keywords):
    """
    Look for header tags (h1 to h6) that contain any of the keywords in the given list.
    Gather text from the following siblings until the next header.
    Returns a single text string (concatenated content) for each category.
    """
    content_list = []

    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header_text = header.get_text().strip().lower()
        if any(keyword in header_text for keyword in keywords):
            temp_content = []
            
            for sibling in header.find_next_siblings():
                if sibling.name and sibling.name.startswith('h'):
                    break  # Stop at the next section header

                if sibling.name in ['p', 'div', 'ul', 'li', 'span']:
                    text = sibling.get_text(separator=' ', strip=True)
                    if text:
                        temp_content.append(text)

            # Merge extracted text into a single string
            section_text = ' '.join(temp_content)
            if section_text:
                content_list.append(section_text)

    # Return a single merged text block for this section
    return ' '.join(content_list) if content_list else ''

In [15]:
# Example container for detailed results:
disease_data_list = []  # List of dictionaries with unified columns for each disease
invalid_links = []      # List of disease names with invalid or error pages

# Process each disease from found_df (assuming found_df has been created from your Step 2 code)
for idx, row in found_df.iterrows():
# for idx, row in tuber.iterrows():
    disease_name = row["Disease Name"]
    url = row["URL"]
    print(f"\nProcessing '{disease_name}' with URL: {url}")
    
    try:
        response = requests.get(url, headers=headers)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        invalid_links.append(disease_name)
        continue

    if response.status_code != 200:
        print(f"Link for '{disease_name}' returned status code {response.status_code}.")
        invalid_links.append(disease_name)
        continue
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # For debugging: print a snippet of the page text
    page_text = soup.get_text(separator=' ', strip=True).lower()
    print(f"Page text for '{disease_name}' (first 500 characters):\n{page_text[:500]}\n")
    
    # Initialize a record for this disease
    disease_record = {"Disease Name": disease_name, "URL": url}

    # Extract sections for each category
    for column, keywords in keyword_dict.items():
        extracted_text = extract_section_text(soup, keywords)
        
        # Store only one combined text block per section
        disease_record[column] = extracted_text

    # Add to the data list
    disease_data_list.append(disease_record)
    print(f"Finished processing '{disease_name}'.")

# Create a unified DataFrame with one row per disease
disease_details_df = pd.DataFrame(disease_data_list)

print("\n===== Extracted Disease Details =====")
print(disease_details_df.head())

print("\n===== Diseases with Invalid or Non-Working Links =====")
print(invalid_links)


Processing 'Avian Influenza (Bird Flu)' with URL: https://www.webmd.com/cold-and-flu/flu-guide/what-know-about-bird-flu
Page text for 'Avian Influenza (Bird Flu)' (first 500 characters):
bird flu (avian influenza): causes, symptoms, and treatment skip to main content home conditions back conditions view all add/adhd allergies arthritis atrial fibrillation breast cancer cancer crohn's disease depression diabetes dvt eczema eye health heart disease hiv & aids lung disease lupus mental health multiple sclerosis migraine pain management psoriasis psoriatic arthritis rheumatoid arthritis sexual conditions skin problems sleep disorders ulcerative colitis view all drugs & supplements b

Finished processing 'Avian Influenza (Bird Flu)'.

Processing 'Arbovirus A Chikungunya Type' with URL: https://www.webmd.com/a-to-z-guides/what-is-chikungunya
Page text for 'Arbovirus A Chikungunya Type' (first 500 characters):
chikungunya: transmission, treatment, and prevention skip to main content home con

In [16]:
# ===============================
# OUTPUT RESULTS
# ===============================

print("\n===== Extracted Disease Details =====")
print(disease_details_df.head())

print("\n===== Diseases with Invalid or Non-Working Links =====")
print(invalid_links)

# Save results to CSV and JSON formats
disease_details_df.to_csv("disease_details.csv", index=False)
# Remove lines=True so that the JSON output is a list enclosed in square brackets
disease_details_df.to_json("disease_details.json", orient="records", indent=4)

print("\nData saved to 'disease_details.csv' and 'disease_details.json'.")


===== Extracted Disease Details =====
                         Disease Name  \
0          Avian Influenza (Bird Flu)   
1        Arbovirus A Chikungunya Type   
2          Avian Influenza (Bird Flu)   
3                Bacterial Meningitis   
4  Bacterial Meningococcal Meningitis   

                                                 URL  \
0  https://www.webmd.com/cold-and-flu/flu-guide/w...   
1  https://www.webmd.com/a-to-z-guides/what-is-ch...   
2  https://www.webmd.com/cold-and-flu/flu-guide/w...   
3  https://www.webmd.com/children/understanding-m...   
4  https://www.webmd.com/children/meningococcal-m...   

                                            overview  \
0  Bird flu, also called avian influenza, is a vi...   
1                                                      
2  Bird flu, also called avian influenza, is a vi...   
3  Meningitis is a rare infection that affects th...   
4                                                      

                                        