In [66]:
import pandas as pd

def _explode_symptoms(frame):
    """Return `frame` with one row per entry of its ', '-separated 'symptoms' column.

    Each entry is placed, stripped of surrounding whitespace, in a 'symptom'
    column. The original 'symptoms' column is kept so the caller can decide
    whether to drop it or select around it.
    """
    per_row_lists = frame['symptoms'].str.split(', ')
    exploded = frame.assign(symptom=per_row_lists).explode('symptom')
    exploded['symptom'] = exploded['symptom'].str.strip()
    return exploded


# First dataset: keep every column except the original comma-separated list.
symptoms_separated1 = _explode_symptoms(
    pd.read_csv("../data/disease_symptom_list.csv")
).drop(columns=['symptoms'])
symptoms_separated1.to_csv("../data/symptoms_separated.csv", index=False)

# Second dataset: keep only the disease / symptom pair.
symptoms_separated2 = _explode_symptoms(
    pd.read_csv("../data/diseases_with_symptoms.csv")
)[['disease', 'symptom']]
symptoms_separated2.to_csv("../data/symptoms_separated2.csv", index=False)

In [67]:
# How often each individual symptom occurs in the first exploded dataset.
symptom_frequencies = symptoms_separated1['symptom'].value_counts()
symptom_frequencies

symptom
sharp abdominal pain          135
headache                      116
sharp chest pain              104
shortness of breath           101
cough                         100
                             ... 
wrist lump or mass              1
hip stiffness or tightness      1
incontinence of stool           1
skin pain                       1
bedwetting                      1
Name: count, Length: 333, dtype: int64

In [68]:
# Quick look at the first five exploded rows of the second dataset.
first_rows = symptoms_separated2.head(5)
print(first_rows)

          disease                           symptom
0  panic disorder           anxiety and nervousness
0  panic disorder               shortness of breath
0  panic disorder  depressive or psychotic symptoms
0  panic disorder                   chest tightness
0  panic disorder                      palpitations


In [69]:
# Reload the two exploded symptom tables from the CSV files written earlier.
separated_symptoms1, separated_symptoms2 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/symptoms_separated.csv", "../data/symptoms_separated2.csv")
)

# Stack both tables under a fresh index, then discard exact duplicate rows.
all_separated_symptoms = (
    pd.concat([separated_symptoms1, separated_symptoms2], ignore_index=True)
    .drop_duplicates()
)
all_separated_symptoms.to_csv("../data/clean/all_separated_symptoms.csv", index=False)
print(all_separated_symptoms)

                          disease                           symptom
0                  panic disorder           anxiety and nervousness
1                  panic disorder                        depression
2                  panic disorder               shortness of breath
3                  panic disorder  depressive or psychotic symptoms
4                  panic disorder                         dizziness
...                           ...                               ...
5462  conjunctivitis due to virus                       swollen eye
5463       open wound of the nose                diminished hearing
5464       open wound of the nose                          headache
5465       open wound of the nose                       facial pain
5466       open wound of the nose                         nosebleed

[5467 rows x 2 columns]


In [70]:
# Distinct disease names present in the combined symptom table.
disease_column = all_separated_symptoms['disease']
diseases = disease_column.unique()
print(diseases)

['panic disorder' 'vocal cord polyp' 'turner syndrome' 'cryptorchidism'
 'poisoning due to ethylene glycol' 'atrophic vaginitis'
 'fracture of the hand' 'cellulitis or abscess of mouth'
 'eye alignment disorder' 'headache after lumbar puncture'
 'pyloric stenosis' 'salivary gland disorder' 'osteochondrosis'
 'injury to the knee' 'metabolic disorder' 'vaginitis'
 'sick sinus syndrome' 'tinnitus of unknown cause' 'glaucoma'
 'eating disorder' 'transient ischemic attack' 'pyelonephritis'
 'rotator cuff injury' 'chronic pain disorder' 'problem during pregnancy'
 'liver cancer' 'atelectasis' 'injury to the hand' 'choledocholithiasis'
 'injury to the hip' 'cirrhosis' 'thoracic aortic aneurysm'
 'subdural hemorrhage' 'diabetic retinopathy' 'fibromyalgia'
 'ischemia of the bowel' 'fetal alcohol syndrome' 'peritonitis'
 'injury to the abdomen' 'acute pancreatitis' 'thrombophlebitis' 'asthma'
 'foreign body in the vagina' 'restless leg syndrome' 'emphysema'
 'cysticercosis' 'induced abortion' 't

In [71]:
# Load the patient reports and reshape them to the (disease, symptom, text)
# layout: the 'label' column becomes 'disease' and 'symptom' is left empty.
patients_reports = (
    pd.read_csv("../data/patient_reports.csv")
    .rename(columns={'label': 'disease'})
    .assign(symptom="")
    [['disease', 'symptom', 'text']]
)

# Give the symptom table an empty 'text' column so both frames share a schema.
# NOTE(review): this adds the column to all_separated_symptoms itself.
all_separated_symptoms['text'] = ""

# Stack both sources, normalise the disease names (lower-case, trimmed) and
# order the rows by disease under a fresh index.
merged_symptoms_text = (
    pd.concat([all_separated_symptoms, patients_reports], ignore_index=True)
    .assign(disease=lambda frame: frame['disease'].str.lower().str.strip())
    .sort_values(by='disease')
    .reset_index(drop=True)
)

merged_symptoms_text.to_csv("../data/clean/merged_symptoms_text.csv", index=False)
print(merged_symptoms_text.head(10))


                     disease                 symptom text
0  abdominal aortic aneurysm  burning abdominal pain     
1  abdominal aortic aneurysm               back pain     
2  abdominal aortic aneurysm            arm swelling     
3  abdominal aortic aneurysm    sharp abdominal pain     
4  abdominal aortic aneurysm            palpitations     
5  abdominal aortic aneurysm     shortness of breath     
6           abdominal hernia  irregular belly button     
7           abdominal hernia         regurgitation.1     
8           abdominal hernia         swollen abdomen     
9           abdominal hernia    upper abdominal pain     


In [72]:
# Bring the generated-sentence dataset into the shared
# (disease, symptom, text, source_url) layout and append it to the merged table.
train01 = pd.read_csv("../data/train-00000-of-00001.csv")

# Map this file's headers onto the column names used by the other datasets.
train01 = train01.rename(columns={
    'Source_URL' : 'source_url',
    'Disease_Name' : 'disease',
    'Symptom_List' : 'symptom',
    'Generated_Sentence_From_symptoms' : 'text'
})

# Drop every column whose header starts with "Unnamed"
# (presumably a saved index column -- confirm against the CSV).
train01 = train01.loc[:, ~train01.columns.str.contains('^Unnamed')]
train01['disease'] = train01['disease'].str.lower().str.strip()

# One row per symptom: the list is delimited by " | ". The pattern is written
# as a raw string because "\|" is not a valid escape sequence in a normal
# string literal (SyntaxWarning on Python 3.12+); the pattern value is unchanged.
train01 = train01.assign(symptom=train01['symptom'].str.split(r' \| ')).explode('symptom')
train01['symptom'] = train01['symptom'].str.strip()

# The previously merged table has no source information; add an empty column
# so both frames can be aligned on the same four columns.
if 'source_url' not in merged_symptoms_text.columns:
    merged_symptoms_text['source_url'] = None

train01 = train01[['disease', 'symptom', 'text', 'source_url']]
merged_symptoms_text = merged_symptoms_text[['disease', 'symptom', 'text', 'source_url']]

final = pd.concat([merged_symptoms_text, train01], ignore_index=True)

# Normalise disease names again after the concat, then order rows by disease.
final['disease'] = final['disease'].str.lower().str.strip()
final = final.sort_values(by='disease').reset_index(drop=True)

final.to_csv("../data/clean/final.csv", index=False)

print(final.head(10))


            disease                                            symptom  \
0  aarskog syndrome             Mildly sunken chest (pectus excavatum)   
1  aarskog syndrome  Downward palpebral slant to eyes (palpebral sl...   
2  aarskog syndrome                            Delayed sexual maturity   
3  aarskog syndrome                     Hairline with a "widow's peak"   
4  aarskog syndrome        Scrotum surrounds the penis (shawl scrotum)   
5  aarskog syndrome                                      Delayed teeth   
6  aarskog syndrome           Short fingers and toes with mild webbing   
7  aarskog syndrome  Small, broad hands and feet with short fingers...   
8  aarskog syndrome  Mild to moderate short height which may not be...   
9  aarskog syndrome                                       Rounded face   

                                                text  \
0  I've been noticing some unusual physical chara...   
1  I've been noticing some unusual physical chara...   
2  I've been noti