In [1]:
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import numpy as np
import spacy
import scispacy
# from spacy import displacy
# from scispacy.abbreviation import AbbreviationDetector
# from scispacy.umls_linking import UmlsEntityLinker

# 'en_core_web_sm'
nlp = spacy.load("en_core_sci_sm")

from bert_serving.client import BertClient
bc = BertClient()

nltk.download('wordnet')
nltk.download('stopwords')

ps = PorterStemmer()
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
CONDITIONS_JSON = 'conditions_clean.json'

with open(CONDITIONS_JSON) as f:
    condition_dict = json.load(f)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import pickle as pkl

def make_sdd(condition_dict):
    symp2cond = {}
    for cond, data in condition_dict.items():
        for symp in data['symptoms']:
            if symp not in symp2cond:
                symp2cond[symp] = []
            symp2cond[symp].append(cond)

    symptoms = list(symp2bert.keys())
    symp2bert = dict(zip(symptoms, bc.encode(symptoms)))
    symptom_data_dict = {s: {'conditions': symp2cond[s], 'bert': symp2bert[s]} for s in symptoms}
    return symptom_data_dict


def make_cdd(condition_dict):
    condition_data_dict = {}
    for cond, data in condition_dict.items():
        all_texts = []
        if 'Symptoms' in data:
            all_texts.extend(data['Symptoms'].split('.'))
        if 'How Common' in data:
            all_texts.extend(data['How Common'].split('.'))
        if 'Overview' in data:
            all_texts.extend(data['Overview'].split('.'))
        if 'Risk Factors' in data:
            all_texts.extend(data['Risk Factors'].split('.'))

        all_texts = [text.strip() for text in all_texts if text.strip() != '']
        if all_texts:
            condition_data_dict[cond] = {
                'text': all_texts,
                'bert': bc.encode(all_texts)
            }
    return condition_data_dict

# with open('condition_data_dict.pkl', 'wb') as f:
#     condition_data_dict = make_cdd(condition_dict)
#     pkl.dump(condition_data_dict, f)

# with open('symptom_data_dict.pkl', 'wb') as f:
#     symptom_data_dict = make_sdd(condition_dict)
#     pkl.dump(symptom_data_dict, f)

with open('condition_data_dict.pkl', 'rb') as f:
    condition_data_dict = pkl.load(f)

with open('symptom_data_dict.pkl', 'rb') as f:
    symptom_data_dict = pkl.load(f)

In [11]:
# Would lemmatization help BERT classification?
def clean_text(s, remove_stopwords=False, return_tokens=False):
    pattern = re.compile('[\W_]+')
    words = pattern.sub(' ', s.lower()).split()
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]
    if return_tokens:
        return words
    return ' '.join([wnl.lemmatize(word) for word in words])

# BERT works pretty well out of the box, but maybe keyword matching can further improve
def nearest_neighbor(query):
    # clean_query = clean_text(query, remove_stopwords=True)
    clean_query = query
    print(f"Searched using \'{clean_query}\':")
    query_bert = bc.encode([clean_query])
    distances = [(symp, np.linalg.norm(query_bert - symp_bert)) for symp, symp_bert in symp2bert.items()]
    return sorted(distances, key=lambda x: x[1])

from fuzzywuzzy import fuzz
# Dealing with synonyms -- word embeddings?
# sorted(match_score('im always nervous').items(), key=lambda x: x[1])[-10:]
def match_score(query):
    # ss: symptom scores
    # cc: candidate condition
    ss = {}
    tokens = clean_text(query, remove_stopwords=True, return_tokens=True)
    print("Search using:", tokens)
    for cond, data in condition_data_dict.items():
        cond_match = {'score': 0, 'text': []}
        for token in tokens:
            for text in data['text']:
                if token in text:
                    cond_match['score'] += 1
                    cond_match['text'].append(text)
                    break

        if cond_match['score'] > 0:
            ss[cond] = cond_match
        
#     for symp_kw in symp2cond:
#         ss[symp_kw] = sum(
#             [max([fuzz.ratio(token, kw) for token in tokens])
#              for kw in symp_kw]) / len(tokens)
    return sorted(ss.items(), key=lambda x: x[1]['score'], reverse=True)

def nearest_neighbor(query):
    # clean_query = clean_text(query, remove_stopwords=True)
    clean_query = query
    print(f"Searched using \'{clean_query}\':")
    query_bert = bc.encode([clean_query])
    distances = []
    for symp, bert in symp2bert.items():
        distances.append((symp, np.linalg.norm(query_bert - bert)))
#     for cond, data in condition_data_dict.items(): 
#         if len(data['text']) > 0:
#             distances.append(min([t for t in zip(data['text'], data['bert'])], key=lambda t: np.linalg.norm(query_bert - t[1])))
    print(distances[0])
    return distances


def match(query, symptom_data_dict):
    scores = {}
    tokens = clean_text(query, remove_stopwords=True, return_tokens=True)
    print(tokens)
    query_bert = bc.encode([query])
    
    for symp, data in symptom_data_dict.items():
        conds = data['conditions']
        bert = data['bert']

        scores[symp] = {}
        scores[symp]['fuzz'] = max([fuzz.partial_ratio(token, symp) for token in tokens])
        scores[symp]['bert'] = np.linalg.norm(query_bert - bert)
    
    return scores

In [5]:
def tok_text(query, token):
    return query[token['start']:token['end']]

def print_tok(query, token):
    if token is not None:
        print(tok_text(query, token).ljust(10), token['id'], token['pos'], token['dep'], token['head'])

def get_deps(root_id, toks):
    if root_id is None:
        return [tok for tok in toks if tok['dep'] == 'ROOT']
    return [tok for tok in toks if tok['id'] is not root_id and tok['head'] == root_id]

def all_deps(root_id, toks):
    deps = get_deps(root_id, toks)
    alldep = {root_id: [all_deps(dep['id'], toks) for dep in deps]}
    return alldep

def all_keys_dep(deps):
    all_keys = []
#     print(deps)
    for key, child_dep_list in deps.items():
        for child_dep in child_dep_list:
            all_keys += all_keys_dep(child_dep)
    return all_keys + list(deps.keys())

for query in queries:
    print(query)
    doc = nlp(query)
    toks = doc.to_json()['tokens']
    
    # verb is m, am, is
    data = []
    for tok in toks:
        if tok['pos'] in ['VERB', 'NOUN', 'ADJ', 'DET']:
            data.append(tok_text(query, tok))
    
    print(data)
    
    for tok in toks:
        print_tok(query, tok)
    print()

    print()

In [12]:
symptom_kws = {}
for symp in symptom_data_dict:
    for tok in nlp(symp).to_json()['tokens']:
        symptom_kws[tok_text(symp, tok)] = []

for cond, data in condition_data_dict.items():
    queries = data['text']
    for query in queries:
        toks = nlp(query).to_json()['tokens']
        for tok in toks:
            tok['query'] = query
            toktext = tok_text(query, tok)
            if toktext in symptom_kws:
                symptom_kws[toktext].append(tok)

In [15]:
# kw_pos_counts = {}
# for kw, toks in symptom_kws.items():
#     pos_count = {}
#     for tok in toks:
#         pos = tok['pos']
#         pos_count[pos] = pos.get(tok['pos'], 0) + 1
#     kw_pos_counts[kw] = pos_count

# kw_pos_counts

In [6]:
queries = [
    'i have a sore throat',
    'my throat is sore',
    
    'i cant sleep',
    'i have trouble falling asleep',
    
    'my stomach feels weird',
    'ive been having some stomach discomfort',
    
    'im tired all day',
    'i am tired all the time',
    
    'i dont feel like eating anything',
    'i have no appetite',
    
    'my head hurts alot',
    'i have really bad headaches',

    'i keep forgetting where things are',
    'i cant remember where i put things',
    
    'my wrist is swollen',
    'i have some swelling around my wrist'
]

import pandas as pd

In [12]:
# Dependency parse symptoms???

# Identify a part of body or an action + specifics

# Find synonyms?
for query in queries:
    print(query)
    scores = match(query, symptom_data_dict)
    for symp, score in scores.items():
        scores[symp]['final'] = score['bert'] - score['fuzz'] / 30

    scores_sorted = sorted(scores.items(), key=lambda x: x[1]['final'], reverse=False)
    df = pd.DataFrame(i[1] for i in scores_sorted)
    df.index = [i[0] for i in scores_sorted]
    display(df[:10])

i have a sore throat
['sore', 'throat']


Unnamed: 0,fuzz,bert,final
Mouth sores,100,8.272607,4.939274
Sore throat,100,8.315001,4.981668
Gum sores,100,9.139616,5.806283
Sore tongue,75,8.77307,6.27307
Excessive mouth watering,67,8.960791,6.727457
Sore or burning eyes,75,9.317777,6.817777
Difficulty breathing through nose,67,9.335332,7.101999
Throat tightness,83,10.145702,7.379036
Headache (worst ever),50,9.216672,7.550005
Nasal congestion,50,9.320139,7.653472


my throat is sore
['throat', 'sore']


Unnamed: 0,fuzz,bert,final
Mouth sores,100,8.252084,4.91875
Sore throat,100,8.528008,5.194674
Gum sores,100,8.941102,5.607769
Throat tightness,83,8.609891,5.843224
Sore tongue,75,8.605405,6.105405
Sore or burning eyes,75,8.713591,6.213591
Excessive mouth watering,67,8.547827,6.314493
Dry eyes,50,8.302854,6.636187
Difficulty breathing through nose,67,9.119211,6.885878
Craving to eat ice,67,9.182036,6.948703


i cant sleep
['cant', 'sleep']


Unnamed: 0,fuzz,bert,final
Difficult to wake from sleep,100,10.682438,7.349105
Difficulty sleeping,100,10.909912,7.576579
Difficulty staying asleep,100,11.163068,7.829734
Restless (tossing and turning) sleep,100,11.417974,8.084641
Fainting,75,10.72289,8.22289
Punching or kicking in sleep,100,11.845094,8.51176
Muscle wasting,60,10.58319,8.58319
Bleeding,60,10.64515,8.64515
Eye crusting with sleep,100,12.005589,8.672256
Poor concentration,75,11.212084,8.712084


i have trouble falling asleep
['trouble', 'falling', 'asleep']


Unnamed: 0,fuzz,bert,final
Difficult to wake from sleep,83,8.338229,5.571563
Difficulty staying asleep,100,8.941857,5.608524
Restless (tossing and turning) sleep,83,9.266179,6.499512
Punching or kicking in sleep,83,9.740559,6.973892
Difficulty sleeping,83,9.881493,7.114826
Eye crusting with sleep,83,9.985624,7.218958
Food getting stuck (swallowing),71,9.652764,7.286098
Episodes of not breathing during sleep,83,10.148045,7.381378
Trouble distinguishing color shades,86,10.368825,7.502158
Food cravings,57,9.421876,7.521876


my stomach feels weird
['stomach', 'feels', 'weird']


Unnamed: 0,fuzz,bert,final
Upset stomach,100,9.647746,6.314413
Feeling faint,60,8.693288,6.693288
Dry eyes,60,9.286067,7.286067
Distended stomach,100,10.721119,7.387786
Cold feet,67,9.678537,7.445204
Watery eyes,60,9.482992,7.482992
Eye irritation,60,9.515036,7.515036
Excessive mouth watering,60,9.580683,7.580683
Headache,43,9.03353,7.600197
Hot flashes,60,9.740488,7.740488


ive been having some stomach discomfort
['ive', 'stomach', 'discomfort']


Unnamed: 0,fuzz,bert,final
Excessive mouth watering,100,9.548293,6.21496
Excessive sweating,100,9.912832,6.579499
Upset stomach,100,10.224703,6.89137
Pain or discomfort (Abdomen (upper)),100,10.422911,7.089577
Excessive body hair growth,100,10.448868,7.115534
Excessive crying,100,10.501156,7.167823
Pain or discomfort (Abdomen (lower)),100,10.525361,7.192028
Pain or discomfort (Neck (front)),100,10.531149,7.197816
Headache (worst ever),67,9.453246,7.219913
Pain or discomfort,100,10.575148,7.241814


im tired all day
['im', 'tired', 'day']


Unnamed: 0,fuzz,bert,final
Difficulty staying awake during day,100,10.321267,6.987934
Headache,67,9.624934,7.391601
Headache (worst ever),67,9.772853,7.53952
Frequent nighttime urination,100,11.042133,7.7088
Cloudy vision,67,9.944112,7.710778
Body aches or pains,67,10.37954,8.146207
Blank stare,67,10.499051,8.265718
Decreased appetite,67,10.532867,8.299534
Blackouts (memory time loss),100,11.693211,8.359877
Muscle wasting,50,10.047208,8.380541


i am tired all the time
['tired', 'time']


Unnamed: 0,fuzz,bert,final
Frequent nighttime urination,100,10.929619,7.596286
Blackouts (memory time loss),100,11.120838,7.787505
Repetitive behaviors,75,10.370609,7.870609
Decreased appetite,75,10.64038,8.14038
Sensitive to noise,75,10.683317,8.183317
Muscle wasting,50,9.892603,8.225936
Headache (worst ever),50,9.928812,8.262145
Urine leaking (incontinence),75,10.800483,8.300483
Excessive sweating,50,10.252695,8.586028
Craving to eat ice,60,10.600286,8.600286


i dont feel like eating anything
['dont', 'feel', 'like', 'eating', 'anything']


Unnamed: 0,fuzz,bert,final
Binge eating,100,10.42962,7.096286
Food getting stuck (swallowing),83,10.023795,7.257128
Excessive sweating,100,10.784671,7.451337
Muscle wasting,83,10.461022,7.694356
Food cravings,67,9.940948,7.707614
Craving to eat ice,67,9.959132,7.725799
Feeling faint,75,10.290369,7.790369
Unable to move joint,75,10.440932,7.940932
Joint pain (Neck (back)),75,10.598098,8.098098
Joint pain,75,10.715399,8.215399


i have no appetite
['appetite']


Unnamed: 0,fuzz,bert,final
Decreased appetite,100,9.045453,5.71212
Craving to eat ice,50,8.975506,7.308839
Repetitive behaviors,75,9.848291,7.348291
Muscle wasting,38,8.68899,7.422323
Increased thirst,50,9.15526,7.488593
Fatigue,46,9.058907,7.525573
Excessive mouth watering,38,8.819288,7.552622
Short attention span,50,9.313856,7.647189
Headache,25,8.519202,7.685869
Binge eating,38,9.040497,7.77383


my head hurts alot
['head', 'hurts', 'alot']


Unnamed: 0,fuzz,bert,final
Headache,75,9.076237,6.576237
Lightheadedness,100,10.297381,6.964048
Headache (worst ever),75,9.574248,7.074248
Ear ache,50,9.113396,7.446729
Craving alcohol,75,9.984896,7.484896
Diarrhea,86,10.513362,7.646695
Food getting stuck (swallowing),75,10.380326,7.880326
Joint pain,50,9.598236,7.931569
Cold hands,75,10.438898,7.938898
Flushed skin,75,10.439764,7.939764


i have really bad headaches
['really', 'bad', 'headaches']


Unnamed: 0,fuzz,bert,final
Headache (worst ever),78,8.011635,5.411635
Headache,88,9.323527,6.390194
Body aches or pains,67,9.360144,7.12681
Excessive body hair growth,67,9.511827,7.278493
Bad breath,55,9.129824,7.29649
Ear ache,62,9.488682,7.422015
Mouth sores,33,8.657742,7.557742
Numbness or tingling (Abdomen (lower)),67,9.82067,7.587337
Joint pain (Neck (back)),67,9.925432,7.692099
Excessive mouth watering,33,8.896712,7.796712


i keep forgetting where things are
['keep', 'forgetting', 'things']


Unnamed: 0,fuzz,bert,final
Difficulty learning new things,100,10.012945,6.679612
Food getting stuck (swallowing),83,9.579149,6.812483
Food cravings,67,9.469358,7.236025
Noisy breathing,91,10.598441,7.565108
Itching or burning,83,10.418102,7.651436
Itching or burning (Eyes),83,10.422279,7.655613
Difficult to wake from sleep,75,10.249969,7.749969
Slow thinking,67,9.991694,7.758361
Forgetfulness,60,9.790062,7.790062
Muscle wasting,73,10.229636,7.796303


i cant remember where i put things
['cant', 'remember', 'put', 'things']


Unnamed: 0,fuzz,bert,final
Difficulty learning new things,100,10.61225,7.278917
Food getting stuck (swallowing),83,10.230475,7.463809
Itching or burning,83,10.465944,7.699278
Itching or burning (Eyes),83,10.612225,7.845558
Difficult to wake from sleep,67,10.131041,7.897707
Itching or burning (Ears),83,10.701429,7.934763
Muscle wasting,73,10.468785,8.035452
Food cravings,67,10.316406,8.083073
Itching or burning (Neck (front)),83,10.902512,8.135845
Noisy breathing,91,11.19159,8.158257


my wrist is swollen
['wrist', 'swollen']


Unnamed: 0,fuzz,bert,final
Headache (worst ever),80,8.58269,5.916024
Swollen lips,86,9.079855,6.213188
Weakness (Wrist),80,8.949395,6.282729
Swollen tongue,86,9.185746,6.31908
Skin irritation,60,8.653419,6.653419
Eye irritation,60,8.707018,6.707018
Enlarged or swollen glands,100,10.063665,6.730332
Enlarged or swollen glands (Neck (front)),100,10.111427,6.778094
Swelling (Ankle),57,8.776287,6.876287
Mouth sores,43,8.321743,6.88841


i have some swelling around my wrist
['swelling', 'around', 'wrist']


Unnamed: 0,fuzz,bert,final
Swelling (Ankle),88,9.386493,6.453159
Swelling (Shin),88,9.422823,6.48949
Swelling (Fingers),88,9.783404,6.850071
Headache (worst ever),80,9.695603,7.028937
Weakness (Wrist),80,9.736958,7.070291
Swelling (Hand (palm)),88,10.067021,7.133688
Sore or burning eyes,50,9.190754,7.524087
Bruising or discoloration (Foot (top)),60,9.700886,7.700886
Bruising or discoloration (Upper arm (bicep)),60,9.712477,7.712477
Tearing in one eye,50,9.389056,7.72239
