In [1]:
import spacy
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm

#### Data preparation

In [2]:
# Load the tab-separated CONCEPT.csv export; rows that cannot be parsed are skipped.
CONCEPT_CSV = '/workspaces/de.uke.iam.automapping/experiments/CONCEPT.csv'
df = pd.read_csv(CONCEPT_CSV, sep="\t", on_bad_lines='skip', low_memory=False)

In [3]:
# Keep only the rows whose vocabulary_id is SNOMED.
is_snomed = df['vocabulary_id'].eq('SNOMED')
df_new = df.loc[is_snomed]

In [4]:
# Narrow the SNOMED rows down to those whose domain_id is Condition.
is_condition = df_new['domain_id'].eq('Condition')
df_condition = df_new.loc[is_condition]

In [24]:
# Spot-check on one known concept name: the regex replace should insert a space after "X]".
sample_name = '[X]Occupant of streetcar injured in transport accident'
df_condition.loc[df_condition['concept_name'] == sample_name, 'concept_name'].replace({'X]': 'X] '}, regex=True)

534513    [X] Occupant of streetcar injured in transport...
Name: concept_name, dtype: object

In [22]:
# Scratch check on a plain Python string: str.replace puts a space after the literal "[X]".
pr='[X]Occupant of streetcar injured in transport accident'
pr.replace('[X]', '[X] ')

'[X] Occupant of streetcar injured in transport accident'

In [25]:
# Normalise the condition names into the list of unique descriptions:
#   1. insert the missing space after a literal "[X]" marker, but only when the marker
#      is immediately followed by a non-space character. The lookahead (?=\S) keeps
#      names already written as "[X] ..." from getting a second space, and anchoring on
#      the opening bracket stops an unrelated "X]" from being rewritten;
#   2. lower-case everything;
#   3. drop duplicates.
snomed_description = (
    df_condition['concept_name']
    .replace({r'\[X\](?=\S)': '[X] '}, regex=True)
    .str.lower()
    .unique()
)

In [30]:
# Size of snomed_description, i.e. the number of distinct lower-cased concept names.
len(snomed_description)

149633

#### Check existence

In [26]:
# Load the spaCy pipeline once; get_existence() reads token.has_vector from its output.
nlp = spacy.load('en_core_web_lg')

In [27]:
def get_existence(data, model=None):
    """Split the content tokens of ``data`` by whether the model has a vector for them.

    Every text is run through the pipeline and each token is inspected. Tokens
    tagged as whitespace, coordinating conjunction, punctuation or number, as
    well as stop words and punctuation tokens, are ignored. Each remaining
    token is recorded once per occurrence (duplicates are kept, in order).

    Parameters
    ----------
    data : iterable of str
        Texts to analyse.
    model : callable, optional
        Pipeline invoked as ``model(text)`` that returns an iterable of tokens
        exposing ``pos_``, ``is_stop``, ``is_punct`` and ``has_vector``.
        Defaults to the notebook-level ``nlp`` object.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(existing_words, non_existing_words)``: tokens with a vector and
        tokens without one.
    """
    if model is None:
        # Backward-compatible default: use the pipeline loaded in the notebook.
        model = nlp

    skipped_pos = {'SPACE', 'CCONJ', 'PUNCT', 'NUM'}
    existing_words = []
    non_existing_words = []
    for text in tqdm(data):
        for token in model(text):
            # Drop spaces, conjunctions, punctuation, numbers and stop words.
            if token.pos_ in skipped_pos or token.is_stop or token.is_punct:
                continue
            if token.has_vector:
                existing_words.append(str(token))
            else:
                non_existing_words.append(str(token))
    return existing_words, non_existing_words


In [28]:
# Run the vocabulary check over every unique description.
# The recorded run below took about 19 minutes, so avoid re-executing this cell needlessly.
existing_words, non_existing_words=get_existence(snomed_description)

100%|██████████| 149633/149633 [19:02<00:00, 130.95it/s]


In [29]:
# Frequency tables for the tokens with and without a vector.
dict_existing_words = Counter(existing_words)
dict_non_existing_words = Counter(non_existing_words)

# Compute the counts once instead of repeating len() in every f-string.
n_existing = len(existing_words)
n_missing = len(non_existing_words)
n_tokens = n_existing + n_missing
# Guard the ratio so an empty token list reports 0.0% instead of raising ZeroDivisionError.
existing_share = n_existing / n_tokens if n_tokens else 0.0

print(f"number_of_tokens: {n_tokens}, where {n_existing} words we found and {n_missing} words we didn't find")
print(f"percent of existing words: {existing_share:.1%}")
print(f"Most common words from non existing ones: {dict_non_existing_words.most_common(30)}")
print(f"Most common words from existing ones: {dict_existing_words.most_common(30)}")


number_of_tokens: 611540, where 565796 words we found and 45744 words we didn't find
percent of existing words: 92.5%
Most common words from non existing ones: [('occurrent', 576), ('puerperium', 436), ('arthropathy', 424), ('/or', 386), ('nontraffic', 323), ('thrombophlebitis', 150), ('dermatosis', 148), ('malposition', 124), ('suppurative', 109), ('aplasia', 104), ('synovitis', 103), ('gonococcal', 101), ('metatarsophalangeal', 92), ('ankylosis', 89), ('tenosynovitis', 88), ('lumbosacral', 86), ('subtalar', 85), ('noncollision', 83), ('chondrocalcinosis', 82), ('extradural', 77), ('nonmotor', 76), ('adnexa', 74), ('intraepithelial', 73), ('bullosa', 73), ('trichophyton', 73), ('lymphangitis', 71), ('microdeletion', 70), ('nondependent', 68), ('megaloblastic', 67), ('ectropion', 67)]
Most common words from existing ones: [('x', 6969), ('nos', 6476), ('neoplasm', 4789), ('left', 4745), ('right', 4681), ('poisoning', 4594), ('syndrome', 4210), ('injury', 4092), ('unspecified', 3984), ('