<a href="https://colab.research.google.com/github/RxHeatherT/project/blob/main/Capstone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers
!pip install -q simpletransformers

In [3]:
import urllib.request
from pathlib import Path

def download_file(url, output_file):
  Path(output_file).parent.mkdir(parents=True, exist_ok=True)
  urllib.request.urlretrieve (url, output_file)

download_file('https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/dev.txt', '/content/data/dev.txt')
download_file('https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/test.txt', '/content/data/test.txt')
download_file('https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/train.txt', '/content/data/train.txt')

In [4]:
import pandas as pd
def read_conll(filename):
  df = pd.read_csv(filename,
                   sep = '\t', header = None, keep_default_na = False,
                   names = ['words', 'pos', 'chunk', 'labels'],
                   quoting = 3, skip_blank_lines = False)
  df = df[~df['words'].astype(str).str.startswith('-DOCSTART-')] #Remove the -DOCSTART header]
  df['sentence_id'] = (df.words == '').cumsum()
  return df[df.words !='']

In [5]:
train_df = read_conll('/content/data/train.txt')
test_df = read_conll('/content/data/test.txt')
dev_df = read_conll('/content/data/dev.txt')
train_df.head(100)

Unnamed: 0,words,pos,chunk,labels,sentence_id
2,Naloxone,PROPN,O,I-Entity,1
3,reverses,VERB,O,O,1
4,the,DET,O,O,1
5,antihypertensive,ADJ,O,O,1
6,effect,NOUN,O,O,1
...,...,...,...,...,...
102,not,ADV,O,O,6
103,influence,VERB,O,O,6
104,stereoselective,ADJ,O,O,6
105,binding,NOUN,O,O,6


In [6]:
data = [[train_df['sentence_id'].nunique(), test_df['sentence_id'].nunique(), dev_df['sentence_id'].nunique()]]
# Prints out the dataset sizes of train and test sets per label.
pd.DataFrame(data, columns=["Train", "Test", "Dev"])

Unnamed: 0,Train,Test,Dev
0,3942,4139,3949


In [7]:
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'sliding_widow': True,
    'max_seq_length': 64,
    'num_train_epochs': 10,
    'train_batch_size': 32,
    'fp16': True,
    'output_dir': '/outputs/',
    'best_model_dir': '/outputs/best_model/',
    'evaluate_during_training': True,
}

custom_labels = list(train_df['labels'].unique())
print(custom_labels)

['I-Entity', 'O', 'B-Entity']


In [8]:
from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging

logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)

#We use the bio BERT pre-trained model
model = NERModel('bert', 'dmis-lab/biobert-v1.1', labels=custom_labels, args=train_args)

# Train the model
# https://simpletransformers.ai/docs/tips-and-tricks/#using-early-stopping
model.train_model(train_df, eval_data=dev_df)

# Evaluate the model in terms of accuracy score
result, model_ouputs, preds_list = model.eval_model(test_df)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 140073334764496 on /root/.cache/huggingface/transformers/f048b8136bae2b3abe91e9e82949295fb205887c84db3be2775e1cdb0ecfeeb9.d7812d36d3371e4d43299a0c4a938622c5251db0efa17a5d4d9b57037fcec823.lock
INFO:filelock:Lock 140073334764496 acquired on /root/.cache/huggingface/transformers/f048b8136bae2b3abe91e9e82949295fb205887c84db3be2775e1cdb0ecfeeb9.d7812d36d3371e4d43299a0c4a938622c5251db0efa17a5d4d9b57037fcec823.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /dmis-lab/biobert-v1.1/resolve/main/config.json HTTP/1.1" 200 462


Downloading:   0%|          | 0.00/462 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140073334764496 on /root/.cache/huggingface/transformers/f048b8136bae2b3abe91e9e82949295fb205887c84db3be2775e1cdb0ecfeeb9.d7812d36d3371e4d43299a0c4a938622c5251db0efa17a5d4d9b57037fcec823.lock
INFO:filelock:Lock 140073334764496 released on /root/.cache/huggingface/transformers/f048b8136bae2b3abe91e9e82949295fb205887c84db3be2775e1cdb0ecfeeb9.d7812d36d3371e4d43299a0c4a938622c5251db0efa17a5d4d9b57037fcec823.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
DEBUG:filelock:Attempting to acquire lock 140073340117520 on /root/.cache/huggingface/transformers/65231a5792b14eb81b9a6bdccccfffda18575eb3bafbb730c9fa4235e56c3c17.74cc2087932cb523a583bd5e65732ee1aaade59dfc0b62f88101de7567d92e42.lock
INFO:filelock:Lock 140073340117520 acquired on /root/.cache/huggingface/transformers/65231a5792b1

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140073340117520 on /root/.cache/huggingface/transformers/65231a5792b14eb81b9a6bdccccfffda18575eb3bafbb730c9fa4235e56c3c17.74cc2087932cb523a583bd5e65732ee1aaade59dfc0b62f88101de7567d92e42.lock
INFO:filelock:Lock 140073340117520 released on /root/.cache/huggingface/transformers/65231a5792b14eb81b9a6bdccccfffda18575eb3bafbb730c9fa4235e56c3c17.74cc2087932cb523a583bd5e65732ee1aaade59dfc0b62f88101de7567d92e42.lock
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/dmis-lab/biobert-v1.1 HTTP/1.1" 200 584
DEBUG:urllib3.connectionpool:Starting new HTTPS connecti

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140073333141904 on /root/.cache/huggingface/transformers/cda52d3a8283b321708097045e27f11cd70bbf3ad8cdefa2c0a56f187855f5d5.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
INFO:filelock:Lock 140073333141904 released on /root/.cache/huggingface/transformers/cda52d3a8283b321708097045e27f11cd70bbf3ad8cdefa2c0a56f187855f5d5.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/added_tokens.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 140073307757200 on /root/.cache/huggingface/transformers/118da8438a7

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140073307757200 on /root/.cache/huggingface/transformers/118da8438a7854000cfcf052566f83ae4f4159ac25796e49e16c3b18746041b4.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
INFO:filelock:Lock 140073307757200 released on /root/.cache/huggingface/transformers/118da8438a7854000cfcf052566f83ae4f4159ac25796e49e16c3b18746041b4.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 140073307887824 on /root/.cache/huggingface/transformers/2fb6c5805404829e9c10c33b38ae59ae3011225799f3177f769a06a7411fa46c.25d8d06fb0679146a3ed2a3463e3585380bff882fe6e1ebc497196e40dbbd7fa.lock
INFO:filelock:Lock 140073307887824 acquired on /root/.cache/huggingface/transformers/2fb6c580

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140073307887824 on /root/.cache/huggingface/transformers/2fb6c5805404829e9c10c33b38ae59ae3011225799f3177f769a06a7411fa46c.25d8d06fb0679146a3ed2a3463e3585380bff882fe6e1ebc497196e40dbbd7fa.lock
INFO:filelock:Lock 140073307887824 released on /root/.cache/huggingface/transformers/2fb6c5805404829e9c10c33b38ae59ae3011225799f3177f769a06a7411fa46c.25d8d06fb0679146a3ed2a3463e3585380bff882fe6e1ebc497196e40dbbd7fa.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/tokenizer.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /dmis-lab/biobert-v1.1/resolve/main/config.json HTTP/1.1" 200 0
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/124 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/494 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Training of bert model complete. Saved to /outputs/.
INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/518 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.15694018643420118, 'precision': 0.8765394088669951, 'recall': 0.9073621587166685, 'f1_score': 0.8916845017487081}
