In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import pickle

from tokenizers import Tokenizer
from tokenizers.models import WordPiece,BPE
from tokenizers.trainers import WordPieceTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from transformers import AutoModelForTokenClassification, AutoTokenizer
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import os
import pickle
from glob import glob

from data_utils import *

In [None]:
# Tab-separated file without a header row, so columns are labelled 0, 1, ...
umls = pd.read_csv(
    '../data/mrconso_eng_strings.csv',
    sep='\t',
    header=None,
)

In [None]:
from tokenizers.pre_tokenizers import Sequence, Whitespace, Punctuation, Split

# Pre-tokenizer that chains the Whitespace and Punctuation pre-tokenizers.
whitespace_pretokenizer = Sequence([Whitespace(), Punctuation()])

umls_words = {}  # token -> number of occurrences across all UMLS strings
umls_idf = {}    # not populated by any of the cells visible here

# Count every lower-cased token produced by the pre-tokenizer.
for phrase in tqdm(umls[0]):
    phrase = str(phrase).lower()

    # pre_tokenize_str yields (text, offsets) pairs; only the text is kept.
    tokens = [piece for piece, _ in whitespace_pretokenizer.pre_tokenize_str(phrase)]

    for token in tokens:
        umls_words[token] = 1 + umls_words.get(token, 0)

In [None]:
# Column 0 holds the token, column 1 its count in the UMLS strings.
umls_df = pd.DataFrame(umls_words.items())

# 'word' flags tokens made only of lowercase ASCII letters; 'len' is the character count.
umls_df['word'] = umls_df[0].map(lambda tok: re.match('^[a-z]+$', tok) is not None)
umls_df['len'] = umls_df[0].map(len)

In [None]:
# Keep tokens longer than four characters, most frequent first ...
umls_df = umls_df.loc[umls_df['len'] > 4].sort_values(by=1, ascending=False)
# ... and then only the purely alphabetic ones.
umls_df = umls_df.loc[umls_df['word']]

In [None]:
# Load the Wikipedia vocabulary; the context manager closes the file handle right after
# loading instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code on load -- only use with trusted files.
with open('../data/wiki_vocab.p', 'rb') as wiki_vocab_file:
    wiki_words = pickle.load(wiki_vocab_file)

In [None]:
# Load the PubMed word frequencies; the context manager closes the file handle right after
# loading instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code on load -- only use with trusted files.
with open('../data/words_by_freq.p', 'rb') as pubmed_vocab_file:
    pubmed_words = pickle.load(pubmed_vocab_file)

In [None]:
# The word -> count lookup sits at position 0 of the loaded wiki object; unseen words count as 0.
wiki_counts = wiki_words[0]
umls_df['wiki_freq'] = [wiki_counts.get(word, 0) for word in umls_df[0]]

In [None]:
# PubMed count for every remaining word; unseen words count as 0.
umls_df['pubmed_freq'] = [pubmed_words.get(term, 0) for term in umls_df[0].tolist()]

In [None]:
# Express each count as a share of the total count over the words still in the frame.
wiki_total = umls_df['wiki_freq'].sum()
pubmed_total = umls_df['pubmed_freq'].sum()
umls_df['norm_wiki_freq'] = umls_df['wiki_freq'] / wiki_total
umls_df['norm_pubmed_freq'] = umls_df['pubmed_freq'] / pubmed_total

In [None]:
# Difference of the normalised shares: positive when the PubMed share exceeds the wiki share.
umls_df['norm_pubmedness'] = umls_df['norm_pubmed_freq'] - umls_df['norm_wiki_freq']

# Raw-count variant, scaled by the PubMed count plus one
# (the +1 avoids a zero denominator, assuming counts are non-negative).
count_gap = umls_df['pubmed_freq'] - umls_df['wiki_freq']
umls_df['pubmedness'] = count_gap / (umls_df['pubmed_freq'] + 1)

In [None]:
# Keep only words whose PubMed share is larger than their Wikipedia share.
umls_df = umls_df.loc[umls_df['norm_pubmedness'] > 0]

In [None]:
# Ranking score: pubmedness weighted by the word's count in UMLS (column 1).
umls_df['combined_stat'] = umls_df['pubmedness'] * umls_df[1]

In [None]:
# Highest combined score first.
umls_df = umls_df.sort_values(by='combined_stat', ascending=False)

In [None]:
# Drop words of 29 characters or more; bracket access makes it explicit that 'len' is a column.
umls_df = umls_df.loc[umls_df['len'] < 29]

In [None]:
# Scan every experiment folder and remember the one holding the highest overall F1.
best_f1 = 0
best_index = None

# Sorted so ties are broken the same way on every run (glob order is arbitrary).
for folder in sorted(glob('../output/canine_exps/*')):
    # NOTE: pickle can execute arbitrary code on load -- only read trusted experiment output.
    with open('{}/results.p'.format(folder), 'rb') as results_file:
        results = pickle.load(results_file)

    # The first element of each result entry is indexed by 'overall_f1'.
    f1_scores = [r[0]['overall_f1'] for r in results]
    max_f1_ind = np.argmax(f1_scores)
    max_f1 = f1_scores[max_f1_ind]

    # `best_index is None` lets the first folder be selected even if its best F1 is 0,
    # so a run where every score is 0 still yields a model instead of crashing below.
    if best_index is None or best_f1 < max_f1:
        best_f1 = max_f1

        best_index = (folder, max_f1_ind)

# Fail with a clear message instead of "'NoneType' object is not subscriptable".
if best_index is None:
    raise FileNotFoundError("No experiment folders found under '../output/canine_exps/'")

best_model = AutoModelForTokenClassification.from_pretrained('{}/best_model'.format(best_index[0]))

In [None]:
# Run the selected model over every candidate word. `tokenize_long_list` is not defined in this
# notebook (presumably provided by `from data_utils import *`); it is assumed to return one
# whitespace-separated tokenization per input word, in input order -- TODO confirm.
tokenized_words = tokenize_long_list(umls_df[0],best_model)

In [None]:
# Store the model's tokenization next to each word; the following cells compare it with
# column 0 and split it on whitespace.
umls_df['supervised_tok'] = tokenized_words

In [None]:
import json

In [None]:
# Rows whose tokenization differs from the word itself.
split_words = umls_df.loc[umls_df['supervised_tok'] != umls_df[0]]

In [None]:
# subword -> summed UMLS count of every split word that contains it
subword_freq = {}

# Only the tokenization and the UMLS count (column 1) are needed, so iterate over those two
# columns directly rather than building a Series per row with iterrows(); passing `total`
# lets tqdm show a real progress bar, which it cannot do for a bare generator.
for word, freq in tqdm(zip(split_words['supervised_tok'], split_words[1]), total=len(split_words)):

    subwords = word.split()

    for token in subwords:
        subword_freq[token] = subword_freq.get(token,0) + freq

In [None]:
# Column 0 holds the subword, column 1 its accumulated frequency.
subword_freq_df = pd.DataFrame(subword_freq.items())
# Length is measured with any '##' removed.
subword_freq_df['len'] = subword_freq_df[0].map(lambda piece: len(piece.replace('##', '')))

In [None]:
# Display only: most frequent subwords first (the sorted frame is not stored).
subword_freq_df.sort_values(by=1, ascending=False)

In [None]:
# Discard subwords whose accumulated frequency is not above 1.
subword_freq_df = subword_freq_df.loc[subword_freq_df[1] > 1]

In [None]:
# Every surviving subword goes into the vocabulary; the set removes duplicates.
chosen_subwords = subword_freq_df
chosen_vocab = set(chosen_subwords[0].tolist())

In [None]:
# Number of selected subwords, before single characters are added.
len(chosen_vocab)

In [None]:
# Collect every character used by the chosen subwords, both as a stand-alone token and as a
# '##'-prefixed token. The '#' character itself is skipped.
letters = set()

for token in tqdm(chosen_vocab):
    for letter in token:
        if letter == '#':
            continue
        letters.update((letter, '##' + letter))

In [None]:
# Merge the single-character tokens into the vocabulary (builds a new set).
chosen_vocab = chosen_vocab | letters

In [None]:
# Vocabulary size after the single-character tokens were merged in.
len(chosen_vocab)

In [None]:
import os

In [None]:
def _sync_special_token_ids(tokenizer_config, vocab_dict):
    """Re-point token ids stored outside ``model.vocab`` at the rebuilt vocabulary (in place).

    Assumes the usual Hugging Face ``tokenizer.json`` layout, where ``added_tokens`` entries
    carry ``content``/``id`` and ``post_processor.special_tokens`` entries carry
    ``tokens``/``ids``. Any section or token that is missing is left untouched.
    """
    for added in tokenizer_config.get('added_tokens') or []:
        content = added.get('content')
        if content in vocab_dict:
            added['id'] = vocab_dict[content]

    post_processor = tokenizer_config.get('post_processor') or {}
    for spec in (post_processor.get('special_tokens') or {}).values():
        tokens = spec.get('tokens') or []
        if tokens and all(t in vocab_dict for t in tokens):
            spec['ids'] = [vocab_dict[t] for t in tokens]


def build_wordpiece_tokenizer(vocab, tokenizer_save_dir, original_tokenizer='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', add_wiki_tokens=False):
    """Save and reload a tokenizer whose vocabulary is ``vocab`` plus tokens from a base tokenizer.

    Args:
        vocab: Iterable of token strings to include. It is copied, never modified.
        tokenizer_save_dir: Directory that receives the tokenizer files; its ``vocab.txt`` and
            the ``model.vocab`` section of its ``tokenizer.json`` are rewritten.
        original_tokenizer: Name or path of the base tokenizer used when ``add_wiki_tokens``
            is False. Its special tokens and single-character tokens (with or without the
            ``##`` prefix) are added to the vocabulary.
        add_wiki_tokens: When True, ``bert-base-uncased`` is used as the base tokenizer instead
            and its entire vocabulary is added.

    Returns:
        The tokenizer loaded back from ``tokenizer_save_dir`` with ``AutoTokenizer``.

    Token ids are assigned by sorted token order, so the same inputs always produce the same
    vocabulary files.
    """
    # Work on a copy so the caller's list is not extended as a side effect.
    vocab = list(vocab)

    if add_wiki_tokens:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        vocab.extend(tokenizer.vocab)
    else:
        tokenizer = AutoTokenizer.from_pretrained(original_tokenizer)

        for special in tokenizer.special_tokens_map.values():
            # An entry may be a list of tokens; flatten it so the set() below keeps working.
            if isinstance(special, (list, tuple)):
                vocab.extend(special)
            else:
                vocab.append(special)

        # Single characters (bare or '##'-prefixed) from the base vocabulary.
        vocab.extend(token for token in tokenizer.vocab if len(token.replace('##','')) == 1)

    # Deduplicate; sorting makes the id assignment reproducible across runs.
    vocab = sorted(set(vocab))
    vocab_dict = {w:i for i,w in enumerate(vocab)}

    # Write the base tokenizer's files first, then overwrite their vocabulary.
    tokenizer.save_pretrained(tokenizer_save_dir)

    config_path = '{}/tokenizer.json'.format(tokenizer_save_dir)
    with open(config_path, 'r', encoding='utf-8') as f:
        tokenizer_config = json.load(f)

    with open('{}/vocab.txt'.format(tokenizer_save_dir), 'w', encoding='utf-8') as f:
        f.writelines(w+'\n' for w in vocab)

    tokenizer_config['model']['vocab'] = vocab_dict
    # Ids recorded for special tokens would otherwise still refer to the base vocabulary.
    _sync_special_token_ids(tokenizer_config, vocab_dict)

    # Closed (and therefore flushed) before the directory is read back below.
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(tokenizer_config, f)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_dir)

    return tokenizer

In [None]:
tokenizers = {}

# Build the tokenizer from the selected subwords plus the full bert-base-uncased vocabulary.
biovocabbert = build_wordpiece_tokenizer(
    vocab=list(chosen_vocab),
    tokenizer_save_dir='../output/biovocabbert_tokenizer',
    add_wiki_tokens=True,
)