In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import pickle

from tokenizers import Tokenizer
from tokenizers.models import WordPiece,BPE
from tokenizers.trainers import WordPieceTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [None]:
# The three word-level splits are read as tab-separated files without a header
# row, so pandas labels their columns with integers (0, 1, ...).
train, dev, test = [
    pd.read_csv(split_path, sep='\t', header=None)
    for split_path in (
        '../data/eng.word.train.tsv',
        '../data/eng.word.dev.tsv',
        '../data/eng.word.test.tsv',
    )
]

In [None]:
# Tab-separated, headerless file; only column 0 is read by the cells below.
umls = pd.read_csv(
    '../data/mrconso_eng_strings.csv',
    header=None,
    sep='\t',
)

In [None]:
# Display the loaded frame for a quick visual check.
umls

In [None]:
from tokenizers.pre_tokenizers import Sequence, Whitespace, Punctuation, Split

# Pre-tokenizer pipeline: Whitespace() followed by Punctuation().
whitespace_pretokenizer = Sequence([Whitespace(), Punctuation()])

# token -> number of occurrences across the strings in column 0 of `umls`.
umls_words = {}
# Not populated in this cell.
umls_idf = {}

# Skip missing cells: str(NaN) is 'nan', which the loop would otherwise count
# as if it were a real token.
# NOTE(review): pandas' default NA parsing can turn strings such as "NA" or
# "null" into NaN at load time; those rows are skipped here instead of being
# miscounted as 'nan'. Pass keep_default_na=False to read_csv to keep them.
for phrase in tqdm(umls[0].dropna()):

    # str() guards against non-string cells; counting is case-insensitive.
    phrase = str(phrase).lower()

    # pre_tokenize_str yields (token, offsets) pairs; keep only the token text.
    tokens = [t[0] for t in whitespace_pretokenizer.pre_tokenize_str(phrase)]

    for token in tokens:
        umls_words[token] = umls_words.get(token, 0) + 1

In [None]:
# One row per distinct token: column 0 holds the token, column 1 its count.
umls_df = pd.DataFrame(umls_words.items())

# 'word' is True only for tokens made up entirely of lowercase ASCII letters.
lowercase_only = re.compile('^[a-z]+$')
umls_df['word'] = [lowercase_only.match(token) is not None for token in umls_df[0]]

# 'len' is the token length in characters.
umls_df['len'] = list(map(len, umls_df[0]))

In [None]:
# Keep tokens longer than 4 characters and order them by occurrence count
# (column 1, descending).
umls_df = umls_df[umls_df['len'] > 4].sort_values(1,ascending=False)
# Keep only tokens flagged by the boolean 'word' column. The .copy() detaches
# the result from the frame it was sliced from, so the later cells that add
# columns to umls_df do not raise SettingWithCopyWarning.
umls_df = umls_df[umls_df['word']].copy()

In [None]:
# Display the filtered token table (tokens longer than 4 characters, 'word' is True).
umls_df

In [None]:
# The context manager closes the file handle as soon as the pickle has been
# read; a bare open() call would leave it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/wiki_vocab.p','rb') as wiki_vocab_file:
    wiki_words = pickle.load(wiki_vocab_file)

In [None]:
# The context manager closes the file handle as soon as the pickle has been
# read; a bare open() call would leave it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/words_by_freq.p','rb') as pubmed_vocab_file:
    pubmed_words = pickle.load(pubmed_vocab_file)

In [None]:
# Element 0 of wiki_words is used as the token -> count mapping; tokens that
# are absent from it get a frequency of 0.
umls_df['wiki_freq'] = list(
    map(lambda token: wiki_words[0].get(token, 0), umls_df[0])
)

In [None]:
# pubmed_words is used directly as the token -> count mapping; tokens that are
# absent from it get a frequency of 0.
umls_df['pubmed_freq'] = list(
    map(lambda token: pubmed_words.get(token, 0), umls_df[0])
)

In [None]:
# Express each raw count as a share of its column total, so the two frequency
# columns are on the same scale.
wiki_total = umls_df['wiki_freq'].sum()
pubmed_total = umls_df['pubmed_freq'].sum()
umls_df['norm_wiki_freq'] = umls_df['wiki_freq'].div(wiki_total)
umls_df['norm_pubmed_freq'] = umls_df['pubmed_freq'].div(pubmed_total)

In [None]:
# Difference between the normalised shares: positive when a token takes a
# larger share of the PubMed counts than of the wiki counts.
umls_df['norm_pubmedness'] = umls_df['norm_pubmed_freq'].sub(umls_df['norm_wiki_freq'])

# Raw-count variant; the +1 keeps the denominator non-zero for tokens whose
# pubmed_freq is 0.
raw_count_gap = umls_df['pubmed_freq'].sub(umls_df['wiki_freq'])
umls_df['pubmedness'] = raw_count_gap.div(umls_df['pubmed_freq'].add(1))

In [None]:
# Summary statistics for a sanity check of the frequency and score columns.
umls_df.describe()

In [None]:
# Display the table with the frequency and pubmedness columns added above.
umls_df

In [None]:
# Keep tokens whose norm_pubmed_freq exceeds their norm_wiki_freq. The .copy()
# detaches the result from the frame it was sliced from, so the later
# 'combined_stat' column assignment does not raise SettingWithCopyWarning.
umls_df = umls_df[umls_df.norm_pubmedness > 0].copy()

In [None]:
# Show the 100 rows with the highest norm_pubmedness.
umls_df.sort_values(by='norm_pubmedness', ascending=False).head(100)

In [None]:
# Weight the pubmedness score by the token's occurrence count (column 1).
umls_df['combined_stat'] = umls_df[1].mul(umls_df['pubmedness'])

In [None]:
# Rank tokens from highest to lowest combined_stat.
umls_df = umls_df.sort_values(
    by='combined_stat',
    ascending=False,
)

In [None]:
# Display the table ranked by combined_stat.
umls_df

In [None]:
# Drop tokens of 29 characters or more.
is_short_enough = umls_df['len'].lt(29)
umls_df = umls_df[is_short_enough]

In [None]:
# (rows in train, rows in train whose column-0 value also appears in umls_df)
(
    len(train),
    len(pd.merge(train, umls_df, how='inner', on=0)),
)

In [None]:
# (rows in dev, rows in dev whose column-0 value also appears in umls_df)
(
    len(dev),
    len(pd.merge(dev, umls_df, how='inner', on=0)),
)

In [None]:
# (rows in test, rows in test whose column-0 value also appears in umls_df)
(
    len(test),
    len(pd.merge(test, umls_df, how='inner', on=0)),
)

In [None]:
# Keep only the dev rows whose column-0 value also appears in the filtered
# UMLS token table, carrying the UMLS statistics along.
dev_bio_words = pd.merge(dev, umls_df, how='inner', on=0)

# NOTE(review): to_csv is called with its defaults, so despite the .tsv
# extension the file is comma-separated and includes the index column --
# confirm that downstream readers expect this.
dev_bio_words.to_csv('../data/dev_bio_words.tsv')