In [7]:
from itertools import zip_longest

import torch
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

# Echo the installed PyTorch version as the cell's output, to record the environment.
torch.__version__


'1.11.0+cpu'

In [8]:
# Load the Bio_ClinicalBERT tokenizer and encoder.
# AutoTokenizer and AutoModel are already imported in the notebook's import cell,
# so they are not imported a second time here.
# The checkpoint id is named once so the tokenizer and the model cannot drift apart.
MED_CHECKPOINT = "emilyalsentzer/Bio_ClinicalBERT"
med_tokenizer = AutoTokenizer.from_pretrained(MED_CHECKPOINT)
med_model = AutoModel.from_pretrained(MED_CHECKPOINT)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Load the general-domain cased BERT tokenizer and the token-classification
# model from one shared checkpoint id. Casing is preserved (do_lower_case=False).
BERT_CHECKPOINT = "bert-base-cased"
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case=False)
bert_model = BertForTokenClassification.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [11]:
# Sample passage that is fed to both tokenizers for the side-by-side comparison.
# The literal is kept verbatim (including its irregular spacing and punctuation),
# because any change to the text changes the tokens that are produced.
text = """CCCVA, MANOVA, my black hen. Comments on repeated measures. Nikolsky sign page from notable contributors to the knowledge of dermatology.
[Obesity as a concomitant cause in the complex etiology of arteriosclerosis ]. Tropical mixtures of star tree metrics.
We study three metrics that can be realized as a mixture of two-star tree metrics.
We prove that the only trees admitting such a decomposition are the ones coming from a tree with at most one internal edge, and whose weight satisfies certain linear inequalities.
We also characterize the fibers of the corresponding mixture map. In addition, we discuss the general framework of tropical secant varieties and we interpret our results within this setting.
Finally, we show that the set of tree metric ranks of metrics on $ n $ taxa is unbounded. 
Comment: 19 pages, 5 figures. Major revision of the exposition following suggestions by the referee.
To appear in Annals of Combinatoric Pasteurellosis in japanese quail (Coturnix coturnix japonica) caused by Pasteurella multocida multocida A:4. 
NUTRITIONAL WELL-BEING IN THE U.S.A.Counseling professional nurses. Evaluation of transdermal penetration enhancers using a novel skin alternative . 
A novel alternative to animal skin models was developed in order to aid in the screening of transdermal penetration enhancer . 
The skin alternative consists of a dermal layer containing human fibroblasts dispersed in a collagen matrix and an epidermal layer of differentiated and stratified human keratinocytes."""


In [12]:
# Split the sample text into word-piece tokens with both tokenizers:
# the general-domain cased BERT one and the Bio_ClinicalBERT one.
bert_tokens = bert_tokenizer.tokenize(text)
med_tokens = med_tokenizer.tokenize(text)

# Label the columns.
print('{:<12} {:<12}'.format("BERT", "Med_BERT"))
print('{:<12} {:<12}'.format("----", "-------"))

# Display the tokens side by side.
# zip_longest fills whichever list is shorter with "", so no token is dropped
# regardless of which tokenizer yields more pieces (plain zip stops at the
# shorter list). It also leaves both token lists unmodified, rather than
# appending "" placeholders to med_tokens.
for bert_tok, med_tok in zip_longest(bert_tokens, med_tokens, fillvalue=""):
    print('{:<12} {:<12}'.format(bert_tok, med_tok))

BERT         Med_BERT    
----         -------     
CC           cc          
##C          ##c         
##VA         ##va        
,            ,           
MA           man         
##N          ##ova       
##O          ,           
##VA         my          
,            black       
my           he          
black        ##n         
he           .           
##n          comments    
.            on          
Co           repeated    
##mme        measures    
##nts        .           
on           ni          
repeated     ##ko        
measures     ##ls        
.            ##ky        
Nik          sign        
##ols        page        
##ky         from        
sign         notable     
page         contributors
from         to          
notable      the         
contributors knowledge   
to           of          
the          der         
knowledge    ##mat       
of           ##ology     
der          .           
##mat        [           
##ology      o           
.           