In [None]:
import pandas as pd
import numpy as np

In [None]:
# Scrape the disease-symptom knowledge base page. read_html returns a list with
# one DataFrame per table matching `attrs`; only the first match is used below.
tables = pd.read_html(
    'https://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/',
    attrs={'class': 'MsoTableWeb3'},
    encoding='ISO-8859-1',
)
# Skip the first parsed row (presumably the table's own header row - confirm
# against the page) and take an explicit copy, so the column assignments below
# act on an independent frame instead of a slice of tables[0].
df = tables[0].iloc[1:].copy()
# rename columns to Disease, Count of Disease, Symptom
df.columns = ['Disease', 'Count of Disease', 'Symptom']
# Rows with a missing Disease inherit the closest non-missing value above them
df['Disease'] = df['Disease'].ffill()

df

In [None]:
# Initialize lists to store reformatted data (one entry per disease)
diseases = []
counts = []
symptoms = []

# State of the disease currently being accumulated. current_count starts as
# None like its siblings so it is defined even if the loop below never runs.
current_disease = None
current_count = None
current_symptoms = []

def format_symptom(symptom):
    """Strip the code prefix from a raw symptom cell.

    A raw cell looks like ``UMLS:C0008031_pain chest``; several terms may be
    chained with ``^``. Each term keeps only the text after its first ``_``,
    the terms are joined with ``/`` and double spaces are collapsed.

    Args:
        symptom: Raw cell value. Non-string values (e.g. NaN) are passed
            through untouched.

    Returns:
        The cleaned name for string input, otherwise ``symptom`` itself.
    """
    if not isinstance(symptom, str):
        return symptom
    # split('_', 1)[-1] keeps underscores that belong to the name and leaves a
    # term without any prefix unchanged instead of raising IndexError, so the
    # function is safe to apply to values that were already cleaned.
    names = [term.split('_', 1)[-1] for term in symptom.split('^')]
    return '/'.join(names).replace('  ', ' ')

# Replace every raw 'Symptom' cell with its cleaned name (overwrites the column on df)
df['Symptom'] = df['Symptom'].apply(format_symptom)


# Group consecutive rows that share a 'Disease' value: each run becomes one
# (disease, count, [symptoms]) record. The count is read from the first row of a run.
for _, row in df.iterrows():
    if row['Disease'] != current_disease:
        # A new run starts: flush the record collected for the previous disease
        if current_disease is not None:
            diseases.append(current_disease)
            counts.append(current_count)
            symptoms.append(current_symptoms)
        # Start accumulating data for the new disease
        current_disease = row['Disease']
        current_count = row['Count of Disease']
        current_symptoms = [row['Symptom']]
    else:
        # Same disease as the previous row: keep collecting its symptoms
        current_symptoms.append(row['Symptom'])

# Flush the final run; the loop only flushes when the *next* disease appears
diseases.append(current_disease)
counts.append(current_count)
symptoms.append(current_symptoms)

# Build the per-disease frame. dropna() is chained (rather than inplace=True) so
# the result is a fresh frame; it removes records with a missing disease or count.
reformatted_df = pd.DataFrame({
    'Disease': diseases,
    'Count of Disease Occurrence': counts,
    'Symptoms': symptoms,
}).dropna()
# A disease cell looks like: UMLS:C0020538_hypertensive disease
# and may chain several names with '^', e.g.
#   UMLS:C0011570_depression mental^UMLS:C0011581_depressive disorder
# Keep the text after the first '_' of every name and join the names with '/'.
# split('_', 1)[-1] keeps underscores inside a name and tolerates a name that
# has no code prefix (the previous split('_')[1] raised IndexError there).
reformatted_df['Disease'] = reformatted_df['Disease'].apply(
    lambda raw: '/'.join(name.split('_', 1)[-1] for name in raw.split('^'))
)

# Display the reformatted DataFrame
reformatted_df

In [None]:
# Peek at the first disease record as a raw array
reformatted_df.iloc[:1].values

In [None]:
# Symptom list stored for the first disease record
reformatted_df['Symptoms'].iloc[:1].values[0]

In [None]:
# One-hot encode the diseases: one row per disease, one 0/1 column per known
# symptom, plus a trailing 'Disease' label column, e.g.
# Symptom1 | Symptom2 | Symptom3 | ... | Disease
# These vectors are later compared against an encoded list of reported symptoms.

# Vocabulary: every distinct, non-missing symptom name, in order of first appearance
symptoms_list = df['Symptom'].dropna().unique()

# Collect one record per disease and build the frame in a single constructor
# call; growing a DataFrame row by row with .loc is quadratic and leaves every
# column with object dtype.
one_hot_records = []
for _, row in reformatted_df.iterrows():
    # Missing symptom cells (NaN) carry no information, so they are skipped
    present = {symptom for symptom in row['Symptoms'] if pd.notna(symptom)}

    # 1 if the disease has the symptom, else 0
    record = {symptom: int(symptom in present) for symptom in symptoms_list}
    record['Disease'] = row['Disease']
    one_hot_records.append(record)

# Reuse reformatted_df's index so rows stay aligned with it, and keep 'Disease'
# as the last column.
symptoms_df = pd.DataFrame(
    one_hot_records,
    index=reformatted_df.index,
    columns=[*symptoms_list, 'Disease'],
)

symptoms_df

In [None]:
# test illness
test_illness = ['pain chest', 'asthenia', 'pressure chest']

# A symptom that is not in the vocabulary cannot be encoded. Report it instead
# of letting it vanish silently, so that typos are noticed.
known_symptoms = set(symptoms_list)
unknown_symptoms = [symptom for symptom in test_illness if symptom not in known_symptoms]
if unknown_symptoms:
    print(f'Ignoring symptoms missing from symptoms_list: {unknown_symptoms}')

# generate one hot encoded features for the test illness
reported_symptoms = set(test_illness)
test_illness_dict = {symptom: int(symptom in reported_symptoms) for symptom in symptoms_list}

# create a single-row dataframe (row label 0) from the dict
test_illness_df = pd.DataFrame([test_illness_dict], columns=symptoms_list)

test_illness_df


In [None]:
# Print the whole symptom vocabulary as a plain list
print([*symptoms_list])

In [None]:
# Turn every disease and the test illness into a 0/1 vector over the same
# symptom columns, so the distance between them can be computed.

# Select the symptom columns by label (everything except 'Disease') instead of
# relying on 'Disease' being the last column.
symptom_columns = symptoms_df.columns.drop('Disease')

# One row per disease; cast to a numeric dtype so the array is not object-typed
bit_vectors = symptoms_df[symptom_columns].to_numpy(dtype=int)

# test bit vector, using the same column order as bit_vectors
test_illness_bit_vector = test_illness_df[symptom_columns].to_numpy(dtype=int)
# Display bit vectors
print(bit_vectors)

In [None]:
# Score how close the test illness is to each disease using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# X holds the test illness vector, Y the per-disease vectors
cosine_similarities = cosine_similarity(X=test_illness_bit_vector, Y=bit_vectors)
cosine_similarities



In [None]:
# Similarity scores of the test illness (first row of the similarity matrix)
similarity_scores = cosine_similarities[0]

# The three largest scores, highest first
sorted_sims = sorted(similarity_scores, reverse=True)[:3]

# Positions of those scores: argsort is ascending, so reverse it and keep the first three
top_3_indices = np.argsort(similarity_scores)[::-1][:3]
top_3_indices

In [None]:
# Print every disease name in row order
print(reformatted_df['Disease'].tolist())

In [None]:
# Look up the disease names at the top-3 positions (positional, not label-based)
diseases = reformatted_df['Disease'].iloc[top_3_indices].values
diseases


In [None]:
# Load data/symptoms.txt and split it into a list with one entry per line
with open('data/symptoms.txt', mode='r') as symptoms_file:
    symptoms_text = symptoms_file.read()
t = symptoms_text.splitlines()

print(t)