In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs

In [29]:
# Origin of the Mayo Clinic site; site-relative paths are appended to it to
# build request URLs.
MC_URL = 'https://www.mayoclinic.org'
# Site-relative path of the symptom-checker page that is fetched first.
SC_URL = '/symptom-checker/select-symptom/itt-20009075'

def extract_mayo_clinic_symptoms():
    """Scrape the adult symptom list from the Mayo Clinic symptom checker.

    Fetches ``MC_URL + SC_URL``, locates the ``<div class="adult">`` element
    and builds one record per link found inside it. Building a record calls
    ``extract_factors`` for that link, which performs one more HTTP request.

    Returns:
        list[dict]: one dict per symptom link, with keys

        * ``'name'``: the link text up to ``'in adults'``, stripped and
          lower-cased;
        * ``'url'``: the link's ``href`` value;
        * ``'factors'``: the result of ``extract_factors`` for that ``href``.

    Raises:
        ValueError: if the page contains no ``<div class="adult">`` element.
        urllib.error.URLError: propagated from ``urlopen`` on network failure.
    """
    req = Request(MC_URL + SC_URL, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(req) as response:
        webpage = response.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines.
    soup = bs(webpage, 'html.parser')
    adult_symptom_list = soup.find('div', {'class': 'adult'})
    if adult_symptom_list is None:
        raise ValueError(
            'No <div class="adult"> found at ' + MC_URL + SC_URL
            + '; the page layout may have changed.'
        )

    # href=True skips anchors without a target, which would otherwise raise
    # KeyError on a['href'].
    return [{
        'name': a.text.split('in adults')[0].strip().lower(),
        'url': a['href'],
        'factors': extract_factors(a['href'])
    } for a in adult_symptom_list.find_all('a', href=True)]

def extract_factors(symptom_url):
    """Scrape the related-factor groups from one symptom page.

    Args:
        symptom_url (str): site-relative path of the symptom page; it is
            appended to ``MC_URL`` to form the request URL.

    Returns:
        list[list]: one ``[legend_text, [label_text, ...]]`` pair per
        ``<fieldset>`` on the page that has a ``<legend>``. Fieldsets without
        a legend are skipped.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on network failure.
    """
    req = Request(MC_URL + symptom_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(req) as response:
        webpage = response.read()

    # Name the parser explicitly so the parse does not depend on which
    # optional parsers are installed.
    soup = bs(webpage, 'html.parser')

    factors = []
    for fieldset in soup.find_all('fieldset'):
        legend = fieldset.find('legend')
        if legend is None:
            # A fieldset with no legend has no prompt text to key on;
            # skip it instead of crashing on None.text.
            continue
        options = [label.text for label in fieldset.find_all('label')]
        # Keep the two-element list shape: callers unpack each entry as
        # (prompt, options).
        factors.append([legend.text, options])
    return factors

In [57]:
# Run the scrape. This performs live HTTP requests: one for the symptom index
# page plus one per symptom link found on it.
symptoms = extract_mayo_clinic_symptoms()

In [62]:
import json
# Cache the scraped records on disk so later cells can reload them without
# repeating the scrape.
with open('mayo_clinic_symptoms.json', mode='w') as symptoms_file:
    json.dump(symptoms, fp=symptoms_file)

In [3]:
import json
# Reload the cached symptom records from disk.
with open('mayo_clinic_symptoms.json') as symptoms_file:
    symptoms = json.loads(symptoms_file.read())

In [139]:
import spacy
from spacy import displacy

def prompt_to_question(prompt, symptom_name=None, pipeline=None):
    """Turn a related-factor prompt into a lower-case question stem.

    The prompt is parsed and a small set of part-of-speech / dependency
    heuristics chooses the leading auxiliary (``is``/``are``/``do``/``does``),
    whether to insert ``your`` and whether to insert the symptom name.
    Examples from this notebook's recorded output for the symptom
    ``'Abdominal pain'``: ``'Pain is'`` -> ``'is your pain'`` and
    ``'Triggered or worsened by'`` ->
    ``'is your abdominal pain triggered or worsened by'``.

    Args:
        prompt (str): the factor-group heading to rewrite.
        symptom_name (str | None): name of the symptom the prompt belongs to.
            When ``None`` the module-level ``symptom['name']`` is used, which
            keeps existing one-argument calls working.
        pipeline (callable | None): callable mapping text to a sequence of
            tokens exposing ``text``, ``pos_``, ``tag_``, ``dep_`` and
            ``lemma_`` (e.g. a loaded spaCy pipeline). When ``None`` the
            module-level ``nlp`` is used.

    Returns:
        str: the question stem, lower-cased with whitespace collapsed. A
        prompt that yields no tokens produces ``'is your <symptom name>'``.
    """
    # Backward-compatible fallbacks to the globals the original relied on.
    if symptom_name is None:
        symptom_name = symptom['name']
    if pipeline is None:
        pipeline = nlp

    doc = pipeline(prompt)
    pos = [token.pos_ for token in doc]
    tag = [token.tag_ for token in doc]
    dep = [token.dep_ for token in doc]
    words = [token.text for token in doc]

    start = 'IS'
    possessive = 'YOUR'
    subject = symptom_name.split('in adults')[0]

    # An empty parse has no ROOT; keep the default "is your <symptom>" form.
    if 'ROOT' in dep:
        root_index = dep.index('ROOT')
        root_pos = pos[root_index]

        # First proper noun if any, else first common noun, else -1.
        if 'PROPN' in pos:
            noun_index = pos.index('PROPN')
        elif 'NOUN' in pos:
            noun_index = pos.index('NOUN')
        else:
            noun_index = -1

        # IS/ARE NOUN...?  The prompt names its own subject, so the symptom
        # name is dropped (except for prompts whose first noun is "use").
        if root_pos == 'NOUN' and doc[noun_index].text != 'use':
            subject = ''
            if tag[noun_index] == 'NNS':
                start = 'ARE'

        # Verb-rooted prompts.
        if root_pos in ('VERB', 'AUX'):
            if noun_index > -1:
                root_text = doc[root_index].text
                if root_text == 'is':
                    start = 'IS'
                    subject = ''
                elif tag[noun_index] == 'NNS':
                    start = 'DO'
                elif not root_text.endswith('ed'):
                    # Do-support ("does blood appear"): the main verb must be
                    # in base form. Lemmatize the ROOT verb itself, not
                    # whatever token happens to be last in the prompt.
                    start = 'DOES'
                    possessive = ''
                    subject = ''
                    words[root_index] = doc[root_index].lemma_
            else:
                # No noun in the prompt: the symptom name is the subject, so
                # agree in number with the symptom name's root token.
                s_doc = pipeline(symptom_name)
                s_tag = [tok.tag_ for tok in s_doc]
                s_dep = [tok.dep_ for tok in s_doc]
                if 'ROOT' in s_dep and s_tag[s_dep.index('ROOT')] == 'NNS':
                    start = 'ARE'

    # A trailing auxiliary ("Pain is") is already expressed by the leading one.
    if pos and pos[-1] == 'AUX':
        words = words[:-1]

    question = f"{start} {possessive} {subject} {' '.join(words)}"
    # Collapse the gaps left by empty components and normalise case.
    return ' '.join(question.split()).lower()

# Small English spaCy model; prompt_to_question looks this up as the
# module-level name `nlp`.
nlp = spacy.load('en_core_web_sm')

# One entry per symptom: its name plus a mapping from generated question to
# the answer options scraped for that prompt.
dialog_tree = []
# The loop variable must stay named `symptom`: prompt_to_question reads that
# module-level name while composing each question.
for symptom in symptoms:
    symptom_dialogs = {
        prompt_to_question(prompt): factors
        for prompt, factors in symptom['factors']
    }
    dialog_tree.append({
        'name': symptom['name'],
        'dialogs': symptom_dialogs,
    })

In [41]:
# Persist the generated question/option tree as JSON.
with open('mayo_clinic_dialog.json', mode='w') as dialog_file:
    json.dump(dialog_tree, fp=dialog_file)

In [140]:
# Eyeball every generated question, one per line, in symptom order.
for symptom_entry in dialog_tree:
    for dialog_question in symptom_entry['dialogs'].keys():
        print(dialog_question)

is your pain
is your pain located in
is your abdominal pain triggered or worsened by
is your abdominal pain relieved by
is your abdominal pain accompanied by
does blood appear
is your blood in stool triggered or worsened by
is your blood in stool relieved by
is your blood in stool accompanied by
is your chest pain pain best described as
is your problem
is your chest pain triggered or worsened by
is your chest pain relieved by
is your chest pain accompanied by
does blood appear
is your problem
is your constipation accompanied by
is your cough
is your problem
is your cough triggered or worsened by
is your cough accompanied by
is your problem
is your diarrhea preceded by
is your diarrhea triggered or worsened by
is your diarrhea relieved by
is your diarrhea accompanied by
is your difficulty swallowing swallowing
is your difficulty swallowing triggered or worsened by
is your difficulty swallowing accompanied by
is your dizziness you feel
do your dizziness symptoms
is your dizziness trigger

In [138]:
# Probe how spaCy labels a single prompt: dependency labels first, then
# coarse part-of-speech tags.
# Alternative probe text: "preceded by use of"
doc = nlp("swallowing")
for token_attr in ("dep_", "pos_"):
    print([getattr(tok, token_attr) for tok in doc])

['ROOT']
['VERB']


In [130]:
# Display the loaded symptom records (name, url and factor groups) for inspection.
symptoms

[{'name': 'Abdominal pain',
  'url': '/symptom-checker/abdominal-pain-in-adults-adult/related-factors/itt-20009075',
  'factors': [['Pain is',
    ['Burning',
     'Crampy',
     'Dull',
     'Gnawing',
     'Intense',
     'Intermittent or episodic',
     'Ongoing (chronic)',
     'Sharp',
     'Steady',
     'Sudden (acute)',
     'Worsening or progressing']],
   ['Pain located in',
    ['Abdomen but radiates to other parts of the body',
     'Lower abdomen',
     'Middle abdomen',
     'One or both sides',
     'Upper abdomen']],
   ['Triggered or worsened by',
    ['Coughing or other jarring movements',
     'Drinking alcohol',
     'Eating certain foods',
     'Menstrual cycle',
     'Stress']],
   ['Relieved by',
    ['Antacids',
     'Avoiding certain foods',
     'Changing position',
     'Drinking more water',
     'Eating certain foods',
     'Eating more fiber']],
   ['Accompanied by',
    ['Abdominal swelling',
     'Black or bloody stools',
     'Constipation',
     'Diarr