In [27]:
import spacy
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm

#### Data preparation

In [9]:
# Read the tab-delimited CONCEPT.csv export into a DataFrame. Rows that cannot
# be parsed are skipped (on_bad_lines='skip'), and pandas' low-memory parsing
# mode is switched off.
df = pd.read_csv(
    '/workspaces/de.uke.iam.automapping/experiments/CONCEPT.csv',
    delimiter="\t",
    on_bad_lines='skip',
    low_memory=False,
)

In [12]:
# Keep only the rows whose vocabulary_id is 'SNOMED'.
is_snomed = df['vocabulary_id'] == 'SNOMED'
df_new = df.loc[is_snomed]

In [16]:
# Narrow the SNOMED rows down to those whose domain_id is 'Condition'.
is_condition = df_new['domain_id'] == 'Condition'
df_condition = df_new.loc[is_condition]

In [18]:
# The concept_name column holds the texts that are tokenized further below.
snomed_description = df_condition.loc[:, 'concept_name']

#### Check existence

In [21]:
# Load the spaCy pipeline 'en_core_web_lg'; get_existence() below reads it as
# the module-level name `nlp`.
nlp = spacy.load('en_core_web_lg')

In [28]:
def get_existence(data):
    """Split the tokens of ``data`` by whether the spaCy model has a vector for them.

    Every text is processed with the module-level ``nlp`` pipeline. Tokens whose
    part of speech is SPACE, CCONJ, PUNCT or NUM, as well as stop words and
    punctuation, are ignored. Each remaining token is recorded once per
    occurrence, so the returned lists contain duplicates.

    Args:
        data: Iterable of strings (for example a pandas Series of descriptions).

    Returns:
        A tuple ``(existing_words, non_existing_words)`` of lists of token
        texts: tokens for which ``token.has_vector`` is true, and tokens for
        which it is not.
    """
    ignored_pos = {'SPACE', 'CCONJ', 'PUNCT', 'NUM'}
    existing_words = []
    non_existing_words = []

    # nlp.pipe() yields Doc objects lazily, so tqdm cannot infer the length by
    # itself; pass it along when the input is sized to keep the progress bar.
    total = len(data) if hasattr(data, '__len__') else None

    # Process the texts as a batched stream instead of one nlp() call per text.
    for doc in tqdm(nlp.pipe(data), total=total):
        for token in doc:
            # Drop punctuation, numbers, spaces, conjunctions and stop words.
            if token.pos_ in ignored_pos or token.is_stop or token.is_punct:
                continue
            if token.has_vector:
                existing_words.append(token.text)
            else:
                non_existing_words.append(token.text)
    return existing_words, non_existing_words


In [29]:
# Tokenize every SNOMED condition name and split the tokens by vector coverage.
existing_words, non_existing_words = get_existence(snomed_description)

100%|██████████| 180133/180133 [24:34<00:00, 122.17it/s]


In [32]:
# Frequency tables of the token texts in each group.
dict_existing_words = Counter(existing_words)
dict_non_existing_words = Counter(non_existing_words)

# Count once and reuse the numbers in the report below.
n_existing = len(existing_words)
n_non_existing = len(non_existing_words)
n_tokens = n_existing + n_non_existing
# Guard against ZeroDivisionError when no tokens were collected at all.
existing_share = n_existing / n_tokens if n_tokens else 0.0

print(f"number_of_tokens: {n_tokens}, where {n_existing} words we found and {n_non_existing} words we didn't find")
print(f"percent of existing words: {existing_share:.1%}")
print(f"Most common words from non existing ones: {dict_non_existing_words.most_common(30)}")
print(f"Most common words from existing ones: {dict_existing_words.most_common(30)}")


number_of_tokens: 725191, where 653001 words we found and 72190 words we didn't find
percent of existing words: 90.0%
Most common words from non existing ones: [('X]Other', 1906), ('Neoplasm', 1438), ('puerperium', 609), ('occurrent', 578), ('/or', 554), ('X]Intentional', 411), ('arthropathy', 366), ('X]Contact', 334), ('nontraffic', 312), ('X]Occupant', 312), ('X]Exposure', 286), ('X]Poisoning', 274), ('X]Unspecified', 251), ('X]Mental', 241), ('X]Assault', 231), ('Arthropathy', 213), ('Infective', 212), ('X]Accidental', 197), ('X]Injury', 168), ('Subacute', 165), ('FIGO', 156), ('malposition', 123), ('Contracture', 119), ('X]Malignant', 114), ('Synovial', 109), ('metatarsophalangeal', 105), ('adnexa', 103), ('D]Abnormal', 101), ('X]Car', 100), ('X]Bus', 100)]
Most common words from existing ones: [('NOS', 12444), ('unspecified', 5622), ('syndrome', 4687), ('left', 4645), ('O', 4642), ('right', 4519), ('E', 4509), ('disease', 4256), ('neoplasm', 4195), ('poisoning', 4121), ('fracture'