In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import re

In [2]:
# Fetch Disease-symptoms knowledge database
url = 'https://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html'

# Request page content
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'MsoTableWeb3'})

    if len(tables) > 0:
        table = tables[0]
        rows = table.find_all('tr')

        # Extract header from the first row
        headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]

        # Extract all subsequent rows into a list of lists
        data = []
        for row in rows[1:]:
            cols = row.find_all('td')
            cols = [col.get_text(strip=True) for col in cols]
            if cols:
                data.append(cols)

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        # Print preview
        print("✅ Table scraped successfully:")


        # Now df holds your table as a DataFrame and you can continue using it
    else:
        print("❌ No table with class 'MsoTableWeb3' found.")
else:
    print(f"❌ Failed to fetch the page. Status code: {response.status_code}")


✅ Table scraped successfully:


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1866 entries, 0 to 1865
Data columns (total 3 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Disease                        1866 non-null   object
 1   Count of Disease
  Occurrence  1866 non-null   object
 2   Symptom                        1866 non-null   object
dtypes: object(3)
memory usage: 43.9+ KB


In [4]:
df.head()

Unnamed: 0,Disease,Count of Disease\n Occurrence,Symptom
0,UMLS:C0020538_hypertensive\n disease,3363.0,UMLS:C0008031_pain\n chest
1,,,UMLS:C0392680_shortness\n of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [5]:
import pandas as pd
import re

# Check if 'Disease' column exists
if 'Disease' in df.columns:
    # Step 1: Filter out rows containing actual diseases
    disease_rows = df[df['Disease'].notna() & df['Disease'].str.startswith('UMLS:')]

    # Step 2: Extract diseaseID and Dname from valid entries
    disease_set = set()

    for row in disease_rows['Disease']:
        entries = str(row).split('^')  # Split in case of multiple diseases

        for entry in entries:
            entry = entry.strip()
            match = re.match(r'UMLS:(C\d{6})_(.+)', entry)
            if match:
                disease_id = match.group(1)
                dname = match.group(2).replace('_', ' ').strip().lower()
                disease_set.add((disease_id, dname))
            else:
                # Log unmatched entries for debugging
                print(f"Unmatched entry: {entry}")

    # Step 3: Convert to DataFrame
    if disease_set:
        disease_records = [{'diseaseID': did, 'Dname': dname} for did, dname in sorted(disease_set)]
        diseases_df = pd.DataFrame(disease_records)
        
        # Final Output
        print(f"✅ diseases_df created with {len(diseases_df)} records:")
        print(diseases_df.head())
    else:
        print("No valid disease entries found.")
else:
    print("The 'Disease' column is missing!")


Unmatched entry: UMLS:C0020538_hypertensive
  disease
Unmatched entry: UMLS:C0011847_diabetes
Unmatched entry: UMLS:C0011570_depression
  mental
Unmatched entry: UMLS:C0011581_depressive disorder
Unmatched entry: UMLS:C0010054_coronary
  arteriosclerosis
Unmatched entry: UMLS:C0010068_coronary heart disease
Unmatched entry: UMLS:C0032285_pneumonia
Unmatched entry: UMLS:C0018802_failure
  heart congestive
Unmatched entry: UMLS:C0038454_accidentcerebrovascular
Unmatched entry: UMLS:C0004096_asthma
Unmatched entry: UMLS:C0027051_myocardial
  infarction
Unmatched entry: UMLS:C0020443_hypercholesterolemia
Unmatched entry: UMLS:C0021311_infection
Unmatched entry: UMLS:C0042029_infection
  urinary tract
Unmatched entry: UMLS:C0002871_anemia
Unmatched entry: UMLS:C0024117_chronic
  obstructive airway disease
Unmatched entry: UMLS:C0497327_dementia
Unmatched entry: UMLS:C1565489_insufficiency
  renal
Unmatched entry: UMLS:C0009676_confusion
Unmatched entry: UMLS:C0029408_degenerativepolyarthrit