In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification

import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from datasets import ClassLabel, load_dataset

import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from data_utils import *

In [None]:
# Load the train and dev splits. Both files are tab-separated and have no
# header row, so their columns are addressed by integer position (0, 1, ...).
# NOTE(review): with pandas' default NA handling, cells spelled like "nan",
# "null" or "NA" are read as missing values — confirm that is acceptable here.
train = pd.read_csv(
    '../data/eng.word.train.tsv',
    header=None,
    sep='\t',
)
dev = pd.read_csv(
    '../data/eng.word.dev.tsv',
    header=None,
    sep='\t',
)

In [None]:
# Replace each split with the result of the project helper get_bio_tags
# (its implementation is not visible in this notebook).
# NOTE(review): later cells read a `bio` column from `dev`; presumably this
# helper is what adds it — confirm against data_utils.
train = get_bio_tags(train)
dev = get_bio_tags(dev)

In [None]:
# Character length of every cell in column 0. Each cell goes through str()
# first, so non-string cells are measured by their printed form.
train['len'] = list(map(len, map(str, train[0])))
dev['len'] = list(map(len, map(str, dev[0])))

In [None]:
# Store in dev['bio_tokens'] whatever the project helper tokenize_word_from_bio
# returns for dev's column 0 and its `bio` column.
# NOTE(review): the helper's output format is not visible here — confirm.
dev['bio_tokens'] = tokenize_word_from_bio(dev[0], dev.bio)

In [None]:
# Report the split sizes, keep only rows whose 'len' value is below 30,
# then report the sizes again so the effect of the filter is visible.
print(train.shape[0], dev.shape[0])

train = train.loc[train['len'] < 30]
dev = dev.loc[dev['len'] < 30]

print(train.shape[0], dev.shape[0])

In [None]:
# Load the table that is inner-joined with dev further down: the first row is
# used as the header and the first column as the index.
# NOTE(review): the file has a .tsv extension but is read with the default
# comma separator. A later cell reads a column named '0', so this presumably
# matches how the file was written — confirm.
medical_subwords = pd.read_csv('../data/dev_bio_words.tsv',header=0,index_col=0)

In [None]:
# Copy the string-labelled column '0' to the integer label 0 so this frame
# has a column with the same label as dev's column 0 (the key of the merge
# that follows).
medical_subwords[0] = medical_subwords['0']

In [None]:
# Inner join on column 0: keep only the dev rows whose column-0 value also
# appears in medical_subwords.
# NOTE(review): duplicate keys on either side multiply rows — confirm intended.
bio_dev = dev.merge(
    medical_subwords,
    how='inner',
    on=0,
)

In [None]:
def bio_with_tokenizer(dev, tokenizer_name):
    """Segment every word in column 0 of ``dev`` with a pretrained tokenizer.

    Three columns, all keyed by ``tokenizer_name``, are added to ``dev`` in place:

    * ``<tokenizer_name>``: the word's pieces joined by single spaces, with
      ``@@`` marking non-initial pieces;
    * ``<tokenizer_name>_bio``: the result of ``get_bio_tag_col`` for the
      words and that segmentation;
    * ``<tokenizer_name>_len``: the number of space-separated pieces per word.

    Args:
        dev: DataFrame whose column ``0`` holds the words. Modified in place.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            The value ``'t5-base'`` selects a separate post-processing branch.

    Returns:
        A ``(dev, tokenizer)`` tuple: the same DataFrame object that was
        passed in, and the loaded tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    column = '{}'.format(tokenizer_name)

    # Column 0 can hold non-string cells, which the tokenizer cannot accept.
    # Coerce with str(), the same coercion this notebook applies when it
    # measures word length.
    words = [str(word) for word in dev[0]]

    if tokenizer_name == 't5-base':
        segmentations = []
        for word in words:
            # Drop the first character of the joined pieces before splitting
            # again (presumably the tokenizer's leading word-boundary marker
            # — confirm), then prefix every non-initial piece with '@@'.
            pieces = ' '.join(tokenizer.tokenize(word))[1:].split()
            # pieces[:1] instead of pieces[0]: an empty result yields ''
            # rather than raising IndexError.
            marked = pieces[:1] + ['@@' + piece for piece in pieces[1:]]
            segmentations.append(' '.join(marked))
    else:
        # These tokenizers mark non-initial pieces with '##'; rewrite to '@@'.
        segmentations = [
            ' '.join(tokenizer.tokenize(word)).replace('##', '@@')
            for word in words
        ]

    dev[column] = segmentations
    dev['{}_bio'.format(tokenizer_name)] = get_bio_tag_col(dev[0], dev[column])
    dev['{}_len'.format(tokenizer_name)] = [
        len(segmentation.split()) for segmentation in dev[column]
    ]

    return dev, tokenizer

In [None]:
# Identifiers of the sub-word tokenizers to compare; each one adds its own
# set of columns to bio_dev.
toks = [
    'bert-base-uncased',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
    'osunlp/BioVocabBERT',
]

for tok in toks:
    # Only the augmented frame is needed. Index the returned tuple instead of
    # unpacking into `_`, which would rebind IPython's last-output variable.
    bio_dev = bio_with_tokenizer(bio_dev, tok)[0]

In [None]:
# Show bio_dev as the cell output to inspect the columns added by the
# tokenizer loop.
bio_dev

In [None]:
from evaluation.evaluate_mod import *

In [None]:
# Write columns 0 and 1 to gold.dev.tsv once, as the reference file.
bio_dev.loc[:, [0, 1]].to_csv(
    '../output/gold.dev.tsv', sep='\t', header=None, index=None,
)

for tok in toks:
    print(tok)

    # pred.dev.tsv is overwritten on every iteration with column 0 and the
    # current tokenizer's segmentation column.
    bio_dev.loc[:, [0, tok]].to_csv(
        '../output/pred.dev.tsv', sep='\t', header=None, index=None,
    )

    evaluate_inline('../output/gold.dev.tsv', '../output/pred.dev.tsv', False)
    print()

## Character-Based LM Tokenizer Evaluation

In [None]:
import os
import pickle
from glob import glob

# Scan every experiment folder under canine_exps/, find the highest
# 'overall_f1' recorded in its results.p, and load the saved best model of the
# folder that holds the overall maximum.

# Start below any real score so that a run scoring exactly 0 can be selected.
best_f1 = float('-inf')
best_index = None

# sorted() makes tie-breaking independent of the filesystem's listing order.
for folder in sorted(glob('canine_exps/*')):
    # NOTE: unpickling can execute arbitrary code; only load results files
    # produced by your own training runs.
    with open('{}/results.p'.format(folder), 'rb') as results_file:
        results = pickle.load(results_file)

    f1_scores = [r[0]['overall_f1'] for r in results]
    if not f1_scores:
        # Nothing recorded for this run; np.max would raise on an empty list.
        continue

    max_f1 = np.max(f1_scores)
    max_f1_ind = np.argmax(f1_scores)

    if best_f1 < max_f1:
        best_f1 = max_f1
        # (folder, position of the best entry inside that folder's results)
        best_index = (folder, max_f1_ind)

if best_index is None:
    # Fail with an explicit message instead of a TypeError on best_index[0].
    raise FileNotFoundError(
        "no experiment results with an 'overall_f1' score found under 'canine_exps/*'"
    )

best_model = AutoModelForTokenClassification.from_pretrained('{}/best_model'.format(best_index[0]))

In [None]:
# Load the google/canine-c tokenizer.
# NOTE(review): `tokenizer` is not passed to tokenize_phrase in this cell —
# confirm whether the helper needs it.
tokenizer = AutoTokenizer.from_pretrained('google/canine-c')

# Run tokenize_phrase with the selected model on every entry of column 0,
# with a tqdm progress bar, and store the results in bio_dev['canine'].
bio_dev['canine'] = list(
    map(lambda phrase: tokenize_phrase(phrase, best_model), tqdm(bio_dev[0]))
)

In [None]:
# Rewrite every '##' in the CANINE segmentations as '@@', the marker used in
# the other tokenizer columns.
bio_dev['canine'] = list(
    map(lambda phrase: phrase.replace('##', '@@'), bio_dev['canine'])
)

In [None]:
# Re-write the reference file (columns 0 and 1), then score the 'canine'
# column against it with evaluate_inline.
bio_dev.loc[:, [0, 1]].to_csv(
    '../output/gold.dev.tsv', sep='\t', header=None, index=None,
)

for tok in ['canine']:
    print(tok)

    bio_dev.loc[:, [0, tok]].to_csv(
        '../output/pred.dev.tsv', sep='\t', header=None, index=None,
    )

    evaluate_inline('../output/gold.dev.tsv', '../output/pred.dev.tsv', False)
    print()