In [150]:
import spacy
import pandas as pd
import numpy as np
from collections import Counter

#### Data preparation

In [182]:
# Read the HCHS variable sheet and report how many 'Langname' entries it holds
# in total versus how many distinct values (a missing value counts as one value).
HCHS_data = pd.read_excel(
    '/workspaces/de.uke.iam.automapping/experiments/VM_Soarian_HCHS_20210422.xlsx'
)
langname_column = HCHS_data['Langname']
print(f"Data length {len(langname_column)} and length of unique data {langname_column.nunique(dropna=False)}")

Data length 1033 and length of unique data 959


In [183]:
# Mark empty and whitespace-only names as missing. The pattern is matched against
# the raw cell text, so values such as '  ' or '\t' are caught as well; an exact
# match on ' ' / '' would let them through and they would become '' once stripped.
HCHS_data['Langname']=HCHS_data['Langname'].replace(r'^\s*$', np.nan, regex=True)
# Strip surrounding whitespace first, then drop missing entries, so that no empty
# string is handed to the spaCy pipeline later on.
HCHS_Langname=HCHS_data['Langname'].str.strip().dropna()

#### Check which words have a vector in the spaCy model

In [185]:
# Load the German spaCy model 'de_core_news_lg'; the cells below call `nlp` to
# tokenize the names and to look up whether each token has a vector.
nlp=spacy.load('de_core_news_lg')

In [186]:
def get_existence(data):
    """Split the tokens of all texts into those with and without a model vector.

    Every text in ``data`` is run through the module-level ``nlp`` pipeline.
    Tokens tagged as SPACE, CCONJ, PUNCT or NUM, stop words and punctuation
    are skipped. Each remaining token is stored as its string form.

    Returns a tuple ``(existing_words, non_existing_words)`` of two lists, in
    the order the tokens were encountered; duplicates are kept.
    """
    ignored_pos = {'SPACE', 'CCONJ', 'PUNCT', 'NUM'}
    with_vector = []
    without_vector = []
    for text in data:
        for tok in nlp(text):
            # Guard clause: discard whitespace, conjunctions, punctuation,
            # numbers and stop words before looking at the vector.
            if tok.pos_ in ignored_pos or tok.is_stop or tok.is_punct:
                continue
            target = with_vector if tok.has_vector is True else without_vector
            target.append(str(tok))
    return with_vector, without_vector



In [187]:
# Run the vector check over all cleaned names; both results are flat token lists
# (duplicates kept), which the next cell counts.
existing_words, non_existing_words=get_existence(HCHS_Langname)

In [188]:
# Frequency tables of the tokens with / without a vector.
dict_existing_words=Counter(existing_words)
dict_non_existing_words=Counter(non_existing_words)

# Count once and reuse, instead of recomputing the lengths in every message.
n_existing=len(existing_words)
n_non_existing=len(non_existing_words)
n_tokens=n_existing+n_non_existing

print(f"number_of_tokens: {n_tokens}, where {n_existing} words we found and {n_non_existing} words we didn't find")
if n_tokens:
    print(f"percent of existing words: {n_existing/n_tokens:.1%}")
else:
    # Nothing survived the token filtering; avoid dividing by zero.
    print("percent of existing words: n/a (no tokens)")
print(f"Most common words from non existing ones: {dict_non_existing_words.most_common(10)}")
print(f"Most common words from existing ones: {dict_existing_words.most_common(10)}")


number_of_tokens: 2937, where 2315 words we found and 622 words we didn't find
percent of existing words: 78.8%
Most common words from non existing ones: [('Echogenität', 22), ('extracraniell', 20), ('intracraniell', 20), ('T1_nativ', 17), ('T1_postKM', 17), ('L-G-E', 16), ('Stenosegrad', 15), ('abhängigem', 15), ('Nebenbefund', 14), ('Diffusionsrestriktion', 12)]
Most common words from existing ones: [('Segment', 96), ('links', 69), ('rechts', 67), ('Stenose', 64), ('Diameter', 55), ('Mapping', 52), ('Zeile', 37), ('PD', 32), ('vorhanden', 31), ('|', 30)]


#### Playing around with spaCy and the data

In [166]:
# nlp.pipe yields the processed documents lazily and can be iterated only once.
# Materialise them in a list so `docs` can be consumed repeatedly (for example
# when a later cell is re-run) instead of being silently empty the second time.
docs=list(nlp.pipe(HCHS_Langname))

In [167]:
def extract_tokens_plus_meta(doc:spacy.tokens.doc.Doc):
    """Collect one tuple of attributes per token of a single spaCy doc.

    Each tuple holds, in this order: text, i, lemma_, ent_type_, tag_, dep_,
    pos_, is_stop, is_alpha, is_digit, is_punct, has_vector.
    Returns a list with one such tuple per token, in document order.
    """
    rows = []
    for token in doc:
        rows.append((
            token.text,
            token.i,
            token.lemma_,
            token.ent_type_,
            token.tag_,
            token.dep_,
            token.pos_,
            token.is_stop,
            token.is_alpha,
            token.is_digit,
            token.is_punct,
            token.has_vector,
        ))
    return rows

In [168]:
def tidy_tokens(docs):
    """Build one token-level DataFrame from an iterable of spaCy docs.

    Parameters
    ----------
    docs : iterable
        Documents to process; each one is passed to
        ``extract_tokens_plus_meta``. Any iterable works (list, generator).

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns listed in ``cols``. ``doc_id`` is
        the position of the document in ``docs``. The index restarts at 0 for
        every document (it is the token's position within its document).
        Documents without tokens contribute no rows, and an empty ``docs``
        yields an empty frame that still has all columns.
    """
    cols = [
        "doc_id", "token", "token_order", "lemma",
        "ent_type", "tag", "dep", "pos", "is_stop",
        "is_alpha", "is_digit", "is_punct", "has_vector"
    ]

    rows = []
    positions = []
    for doc_id, doc in enumerate(docs):
        for position, meta in enumerate(extract_tokens_plus_meta(doc)):
            rows.append((doc_id, *meta))
            positions.append(position)

    # Build the frame once from all rows rather than one frame per document:
    # this copes with token-less documents and with an empty input.
    tokens_df = pd.DataFrame(rows, columns=cols)
    tokens_df.index = positions
    return tokens_df

In [169]:
# Build the token-level table for all documents and preview its first rows.
df=tidy_tokens(docs)
df.head(5)

Unnamed: 0,doc_id,token,token_order,lemma,ent_type,tag,dep,pos,is_stop,is_alpha,is_digit,is_punct,has_vector
0,0,Liegt,0,Liegt,,VVFIN,ROOT,VERB,False,True,False,False,True
1,0,bei,1,bei,,APPR,mo,ADP,True,True,False,False,True
2,0,Ihnen,2,ich,,PPER,nk,PRON,True,True,False,False,True
3,0,eine,3,einen,,ART,nk,DET,True,True,False,False,True
4,0,vom,4,vom,,APPRART,sbp,ADP,True,True,False,False,True


In [189]:
# Preview the first ten tokens for which `has_vector` is False.
df.loc[df['has_vector'].eq(False)].head(10)

Unnamed: 0,doc_id,token,token_order,lemma,ent_type,tag,dep,pos,is_stop,is_alpha,is_digit,is_punct,has_vector
2,13,Medikamentenerfassung,2,Medikamentenerfassung,,NN,sb,NOUN,False,True,False,False,False
0,15,Berichtetes,0,Berichtetes,PER,NE,ag,ADJ,False,True,False,False,False
0,16,Gemessener,0,Gemessener,PER,ADJA,nk,ADJ,False,True,False,False,False
0,17,Gemessener,0,Gemessener,PER,ADJA,nk,ADJ,False,True,False,False,False
0,19,HB-Schnelltest,0,HB-Schnelltest,,NE,ROOT,PROPN,False,False,False,False,False
0,20,HB-Wert,0,HB-Wert,,VVPP,ROOT,VERB,False,False,False,False,False
3,22,1xEDTA,3,1xEDTA,,ADV,ROOT,ADV,False,False,False,False,False
4,22,Monovette,4,Monovette,,NN,ROOT,ADJ,False,True,False,False,False
3,23,1xHeparin,3,1xHeparin,,NN,mo,NOUN,False,False,False,False,False
4,23,Monovette,4,Monovette,PER,NE,pnc,PROPN,False,True,False,False,False
