In [8]:
import json

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertForTokenClassification, BertConfig, AdamW, AutoTokenizer

In [6]:
# Directory holding the preprocessed DDI-corpus drug-NER splits; the loading
# cell reads 'train.json' and 'dev.json' from it. The path is relative, so it
# resolves against the notebook's working directory.
DATA_PATH = "../data/preprocessed/ddi-corpus/drug_ner/"

In [12]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer, max_length=512):
    """
    Tokenize a pre-split sentence into word pieces, keeping one label per piece.

    Word piece tokenization makes it difficult to match word labels back up
    with individual word pieces. Each word is therefore tokenized on its own,
    and its label is repeated once for every subword it produces.

    Args:
        sentence: Iterable of word strings.
        text_labels: Iterable of labels aligned one-to-one with ``sentence``.
            The two are paired with ``zip``, so surplus items in the longer
            one are ignored.
        tokenizer: Object exposing ``tokenize(word)`` (returning a list of
            subword strings) and a ``pad_token`` attribute.
        max_length: Exact length of both returned lists.

    Returns:
        A ``(tokenized_sentence, labels)`` tuple of two lists, each exactly
        ``max_length`` long. Shorter inputs are right-padded with
        ``tokenizer.pad_token`` / ``'O'``; longer inputs are truncated.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
        # Tokenize one word at a time so the number of subwords is known.
        tokenized_word = tokenizer.tokenize(word)
        tokenized_sentence.extend(tokenized_word)

        # Every subword inherits the label of the word it came from.
        labels.extend([label] * len(tokenized_word))

    # Truncate first: without this, a sentence longer than max_length was
    # returned unpadded AND over-long, breaking the fixed-length contract.
    tokenized_sentence = tokenized_sentence[:max_length]
    labels = labels[:max_length]

    # Right-pad up to max_length (a no-op when the lists are already full).
    n_padding = max_length - len(tokenized_sentence)
    tokenized_sentence += [tokenizer.pad_token] * n_padding
    labels += ['O'] * n_padding

    return tokenized_sentence, labels

In [11]:
# Read the train/dev splits. Each file is parsed as JSON and its top-level
# items are collected into a list (the sample shown below is a dict with
# 'tokens', 'tags' and 'relations' keys).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(DATA_PATH + 'train.json', 'r', encoding='utf-8') as f:
    train_data = list(json.load(f))

with open(DATA_PATH + 'dev.json', 'r', encoding='utf-8') as f:
    dev_data = list(json.load(f))

In [20]:
# Peek at the first training instance: a dict with 'tokens', 'tags' and 'relations' keys.
train_data[0]

{'tokens': ['Probenecid',
  ':',
  'Probenecid',
  'interferes',
  'with',
  'renal',
  'tubular',
  'secretion',
  'of',
  'ciprofloxacin',
  'and',
  'produces',
  'an',
  'increase',
  'in',
  'the',
  'level',
  'of',
  'ciprofloxacin',
  'in',
  'serum',
  '.'],
 'tags': ['B-drug',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O'],
 'relations': [['Probenecid', 'ciprofloxacin', 'mechanism']]}

In [21]:
# Load the uncased BERT tokenizer and run the helper on the first training
# instance; the result is a (word_pieces, labels) tuple.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
new_data = tokenize_and_preserve_labels(
    sentence=train_data[0]['tokens'],
    text_labels=train_data[0]['tags'],
    tokenizer=tokenizer,
)
new_data

(['probe',
  '##ne',
  '##ci',
  '##d',
  ':',
  'probe',
  '##ne',
  '##ci',
  '##d',
  'interfere',
  '##s',
  'with',
  'renal',
  'tubular',
  'secret',
  '##ion',
  'of',
  'ci',
  '##pro',
  '##fl',
  '##ox',
  '##ac',
  '##in',
  'and',
  'produces',
  'an',
  'increase',
  'in',
  'the',
  'level',
  'of',
  'ci',
  '##pro',
  '##fl',
  '##ox',
  '##ac',
  '##in',
  'in',
  'serum',
  '.',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[P

In [23]:
# Map the padded word pieces (first element of new_data) to vocabulary ids.
tokenizer.convert_tokens_to_ids(new_data[0])

[15113,
 2638,
 6895,
 2094,
 1024,
 15113,
 2638,
 6895,
 2094,
 15115,
 2015,
 2007,
 25125,
 25147,
 3595,
 3258,
 1997,
 25022,
 21572,
 10258,
 11636,
 6305,
 2378,
 1998,
 7137,
 2019,
 3623,
 1999,
 1996,
 2504,
 1997,
 25022,
 21572,
 10258,
 11636,
 6305,
 2378,
 1999,
 20194,
 1012,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0

In [19]:
# tokenizer.tokenize() takes a single string. Passing the whole token list raised
# "TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]".
# Tokenize word by word instead, as tokenize_and_preserve_labels does.
[tokenizer.tokenize(word) for word in train_data[0]['tokens']]

In [16]:
# Show the first training instance again (word-level tokens, tags and relations).
train_data[0]

{'tokens': ['Probenecid',
  ':',
  'Probenecid',
  'interferes',
  'with',
  'renal',
  'tubular',
  'secretion',
  'of',
  'ciprofloxacin',
  'and',
  'produces',
  'an',
  'increase',
  'in',
  'the',
  'level',
  'of',
  'ciprofloxacin',
  'in',
  'serum',
  '.'],
 'tags': ['B-drug',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O'],
 'relations': [['Probenecid', 'ciprofloxacin', 'mechanism']]}