In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import re
from pprint import pprint

In [46]:
# Fetch Disease-symptoms knowledge database
url = 'https://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html'

# Request page content
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table', {'class': 'MsoTableWeb3'})

    if len(tables) > 0:
        table = tables[0]
        rows = table.find_all('tr')

        # Extract header from the first row
        headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]

        # Extract all subsequent rows into a list of lists
        data = []
        for row in rows[1:]:
            cols = row.find_all('td')
            cols = [col.get_text(strip=True) for col in cols]
            if cols:
                data.append(cols)

        # Create DataFrame
        df = pd.DataFrame(data, columns=headers)

        # Print preview
        print("Table scraped successfully")


        # Now df holds your table as a DataFrame and you can continue using it
    else:
        print("No table with class 'MsoTableWeb3' found.")
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")
df

Table scraped successfully


Unnamed: 0,Disease,Count of Disease\n Occurrence,Symptom
0,UMLS:C0020538_hypertensive\n disease,3363,UMLS:C0008031_pain\n chest
1,,,UMLS:C0392680_shortness\n of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus\n ulcer,42,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


In [47]:
# Step 1: Add a column to count the occurrences of "UMLS" in each row
df['UMLS_Count'] = df['Disease'].fillna('').str.count('UMLS')

# Step 2: Calculate the total occurrences of "UMLS" in the 'Disease' column
disease_umls_count = df['UMLS_Count'].sum()

# Print the total count of "UMLS"
print(f"Total occurrences of 'UMLS' in 'Disease' column: {disease_umls_count}")

# Step 3: Identify rows where a cell contains more than one occurrence of "UMLS"
rows_with_multiple_umls = df[df['UMLS_Count'] > 1]

# Print these rows
print("\nRows with more than one 'UMLS':")
rows_with_multiple_umls


Total occurrences of 'UMLS' in 'Disease' column: 149

Rows with more than one 'UMLS':


Unnamed: 0,Disease,Count of Disease\n Occurrence,Symptom,UMLS_Count
26,UMLS:C0011570_depression\n mental^UMLS:C00115...,1337,UMLS:C0424000_feeling\n suicidal,2
47,UMLS:C0010054_coronary\n arteriosclerosis^UML...,1284,UMLS:C0008031_pain\n chest,2
295,UMLS:C0006826_malignant\n neoplasms^UMLS:C130...,354,UMLS:C0030193_pain,2
305,UMLS:C0001175_acquiredimmuno-deficiency\n syn...,350,UMLS:C0015967_fever,3
350,UMLS:C0036690_septicemia^UMLS:C0243026_systemi...,311,UMLS:C0015967_fever,3
819,UMLS:C0376358_malignant\n neoplasm of prostat...,163,UMLS:C0018965_hematuria,2
885,UMLS:C0006142_malignant\n neoplasm of breast^...,152,UMLS:C0024103_mass in breast,2
1294,UMLS:C0006840_candidiasis^UMLS:C0006849_oralca...,99,UMLS:C0011991_diarrhea,2
1357,UMLS:C0007102_malignant\n tumor of colon^UMLS...,94,UMLS:C0221198_lesion,2
1387,UMLS:C0014549_tonic-clonic\n epilepsy^UMLS:C0...,92,UMLS:C0013144_drowsiness,2


In [None]:
# Step 1: Clean and normalize the 'Disease' column
df['Disease'] = df['Disease'].str.replace('\n', ' ', regex=False).str.strip()

# Step 2: Filter rows containing valid 'UMLS:' entries
disease_rows = df[df['Disease'].notna() & df['Disease'].str.contains('UMLS:')]

# Step 3: Extract the first valid diseaseID and Dname from each cell
disease_list = []  # Use a list instead of a set
seen_diseases = set()  # Track unique entries to avoid duplicates

for row in disease_rows['Disease']:
    # Extract the first occurrence of a valid "UMLS:" entry
    entry = next((e.strip() for e in row.split('^') if e.startswith("UMLS:")), None)
    if entry and '_' in entry:  # Ensure it's a valid entry with an underscore
        parts = entry.split('_', 1)  # Split at the first underscore
        if len(parts) == 2:
            disease_id = parts[0].replace("UMLS:", "").strip()  # Remove 'UMLS:'
            # Replace underscores with spaces, then normalize spaces in Dname
            dname = ' '.join(parts[1].replace('_', ' ').split()).lower()
            if (disease_id, dname) not in seen_diseases:  # Avoid duplicates
                disease_list.append({'diseaseID': disease_id, 'Dname': dname})
                seen_diseases.add((disease_id, dname))
    else:
        # Log unmatched or invalid entries for debugging
        print(f"Unmatched or invalid entry: '{row}'")

# Step 4: Convert to DataFrame
if disease_list:
    diseases_df = pd.DataFrame(disease_list)

    # Step 5: Check for duplicate diseaseID values
    duplicates = diseases_df[diseases_df['diseaseID'].duplicated(keep=False)]  # Find all duplicate rows
    print("\nDuplicate diseaseIDs:")
    display(duplicates)

    # Step 6: Drop duplicate rows and keep the first occurrence
    diseases_df = diseases_df.drop_duplicates(subset='diseaseID', keep='first')
    print("\nDuplicates dropped, DataFrame Updated.")

    # Final Output
    print(f"\nFinal diseases_df created with {len(diseases_df)} records.")
else:
    print("No valid disease entries found.")

# Display the resulting DataFrame
diseases_df



Duplicate diseaseIDs:


Unnamed: 0,diseaseID,Dname
20,C0006826,malignant neoplasms
99,C0006826,malignantneoplasms



Duplicates dropped, DataFrame Updated.

Final diseases_df created with 133 records.


Unnamed: 0,diseaseID,Dname
0,C0020538,hypertensive disease
1,C0011847,diabetes
2,C0011570,depression mental
3,C0010054,coronary arteriosclerosis
4,C0032285,pneumonia
...,...,...
129,C1258215,ileus
130,C0001511,adhesion
131,C0011253,delusion
132,C0233472,affect labile


In [None]:
# Extract Dnames into a dictionary with index as key
if 'Dname' in diseases_df.columns:
    dnames_dict = diseases_df['Dname'].to_dict()  
    print("Disease Names:")
    pprint(dnames_dict)  
else:
    print("The 'Dname' column is missing!")

Disease Names:
{0: 'hypertensive disease',
 1: 'diabetes',
 2: 'depression mental',
 3: 'coronary arteriosclerosis',
 4: 'pneumonia',
 5: 'failure heart congestive',
 6: 'accidentcerebrovascular',
 7: 'asthma',
 8: 'myocardial infarction',
 9: 'hypercholesterolemia',
 10: 'infection',
 11: 'infection urinary tract',
 12: 'anemia',
 13: 'chronic obstructive airway disease',
 14: 'dementia',
 15: 'insufficiency renal',
 16: 'confusion',
 17: 'degenerativepolyarthritis',
 18: 'hypothyroidism',
 19: 'anxiety state',
 20: 'malignant neoplasms',
 21: 'acquiredimmuno-deficiency syndrome',
 22: 'cellulitis',
 23: 'gastroesophageal reflux disease',
 24: 'septicemia',
 25: 'deep vein thrombosis',
 26: 'dehydration',
 27: 'neoplasm',
 28: 'embolism pulmonary',
 29: 'epilepsy',
 30: 'cardiomyopathy',
 31: 'chronic kidney failure',
 32: 'carcinoma',
 33: 'hepatitis c',
 34: 'peripheral vascular disease',
 35: 'psychotic disorder',
 36: 'hyperlipidemia',
 37: 'bipolar disorder',
 38: 'obesity',
 39:

In [28]:
import pandas as pd
import requests

# Replace the URL below with the raw file URL for your text file
github_url = "https://raw.githubusercontent.com/Saurabh-Lakhanpal/symptoms-analyzer/main/resources/disease_description.txt"

# Fetch the file content from GitHub
response = requests.get(github_url)
content = response.text

# Parse the content to create a mapping of Dname to description
dname_to_description = {}
for line in content.splitlines():
    if ": " in line:
        parts = line.split(": ", 1)
        dname_key = parts[0].split(". ", 1)[1].strip().lower()
        description_value = parts[1].strip()
        dname_to_description[dname_key] = description_value

# Map descriptions to the DataFrame
diseases_df['Description'] = diseases_df['Dname'].str.lower().map(dname_to_description)

# Output the updated DataFrame
diseases_df.head()


Unnamed: 0,diseaseID,Dname,Description
0,C0020538,Acquired Immuno-deficiency Syndrome,"A chronic, life-threatening condition caused b..."
1,C0011847,Diabetes,Chronic condition affecting the body's ability...
2,C0011570,Depression Mental,Persistent feelings of sadness or hopelessness...
3,C0011581,Depressive Disorder,A mood disorder characterized by prolonged per...
4,C0010054,Coronary Arteriosclerosis,Hardening and narrowing of coronary arteries d...


In [29]:
diseases_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   diseaseID    149 non-null    object
 1   Dname        149 non-null    object
 2   Description  143 non-null    object
dtypes: object(3)
memory usage: 3.6+ KB


In [32]:
# Step 1: Check for duplicates in the 'diseaseID' column
duplicates_in_diseaseID = diseases_df[diseases_df['diseaseID'].duplicated(keep=False)]  # Find all duplicate rows
print("\nDuplicates in 'diseaseID' column (detailed):")
print(duplicates_in_diseaseID)

# Count the number of unique and duplicate entries
total_duplicates_count = duplicates_in_diseaseID.shape[0]
print(f"\nTotal duplicate entries in 'diseaseID' column: {total_duplicates_count}")

# Step 2: Trim leading and trailing whitespaces from all string columns
diseases_df = diseases_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
print("\nWhitespace trimming complete.")

# Step 3: Check for null values and display rows with NaN values
null_values_per_column = diseases_df.isnull().sum()
print("\nNull values in the DataFrame (per column):")
print(null_values_per_column)

# Show rows with NaN values
rows_with_nan = diseases_df[diseases_df.isnull().any(axis=1)]  # Select rows with any NaN value
print("\nRows with NaN values:")
print(rows_with_nan)



Duplicates in 'diseaseID' column (detailed):
    diseaseID                Dname  \
22   C0006826  Malignant Neoplasms   
111  C0006826   Malignantneoplasms   

                                           Description  
22   General term for cancers characterized by unco...  
111                                                NaN  

Total duplicate entries in 'diseaseID' column: 2

Whitespace trimming complete.

Null values in the DataFrame (per column):
diseaseID      0
Dname          0
Description    6
dtype: int64

Rows with NaN values:
    diseaseID                      Dname Description
8    C0038454    Accidentcerebrovascular         NaN
19   C0029408  Degenerativepolyarthritis         NaN
61   C0013405          Paroxysmaldyspnea         NaN
100  C0006849            Oralcandidiasis         NaN
111  C0006826         Malignantneoplasms         NaN
138  C0019291               Herniahiatal         NaN


In [None]:
# Remove duplicates based on the 'diseaseID' column
diseases_df = diseases_df.drop_duplicates(subset='diseaseID', keep='first')

# Confirm duplicates are removed
print("Duplicates removed. Current shape of diseases_df:")
diseases_df

In [None]:
import pandas as pd

# Check if 'Symptom' column exists
if 'Symptom' in df.columns:
    # Step 1: Clean and normalize the 'Symptom' column
    df['Symptom'] = df['Symptom'].str.replace('\n', ' ', regex=False).str.strip()

    # Step 2: Filter rows containing valid 'UMLS:' entries
    symptom_rows = df[df['Symptom'].notna() & df['Symptom'].str.startswith('UMLS:')]

    # Step 3: Extract symptomID and Sname from valid entries
    symptom_set = set()

    for row in symptom_rows['Symptom']:
        entries = str(row).split('^')

        for entry in entries:
            entry = ' '.join(entry.split())
            if entry.startswith("UMLS:") and '_' in entry:
                parts = entry.split('_', 1)
                if len(parts) == 2:
                    symptom_id = parts[0].replace("UMLS:", "").strip()
                    sname = parts[1].replace('_', ' ').strip().lower()
                    symptom_set.add((symptom_id, sname))
            else:
                # Log unmatched entries for debugging
                print(f"Unmatched entry: '{entry}'")

    # Step 4: Convert to DataFrame
    if symptom_set:
        symptom_records = [{'symptomID': sid, 'Sname': sname.title()} for sid, sname in sorted(symptom_set)]
        symptoms_df = pd.DataFrame(symptom_records)

        # Final Output
        print(f"symptoms_df created with {len(symptoms_df)} records.")
        print(symptoms_df.head())
    else:
        print("No valid symptom entries found.")
else:
    print("The 'Symptom' column is missing!")
