In [None]:
!pip install transformers
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import sys
import torch# Loading the pre-trained BERT model
###################################
# Embeddings will be derived from
# the outputs of this model

from google.colab import drive

drive.mount('/content/gdrive/',force_remount=True)

sys.path.append('/content/gdrive/MyDrive/corpora')

model = BertModel.from_pretrained('bert-base-multilingual-cased',
           output_hidden_states = True,)# Setting up the tokenizer
###################################
# This is the same tokenizer that
# was used in the model to generate
# embeddings to ensure consistency
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def bert_text_preparation(text, tokenizer):
    """Turn a raw string into the inputs expected by BERT.

    The text is wrapped in the [CLS] / [SEP] special tokens, split into
    tokens by ``tokenizer``, and mapped to vocabulary ids. Every token is
    assigned segment id 1.

    Args:
        text (str): Text to be converted.
        tokenizer (obj): Object exposing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        list: The tokens, including the [CLS] and [SEP] markers.
        obj: Torch tensor of token ids with a leading batch
            dimension of 1.
        obj: Torch tensor of segment ids (all ones), same shape as
            the token id tensor.
    """
    wrapped = " ".join(("[CLS]", text, "[SEP]"))
    tokens = tokenizer.tokenize(wrapped)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [1] * len(token_ids)

    # Wrapping each list in another list adds the batch dimension.
    ids_batch = torch.tensor([token_ids])
    segments_batch = torch.tensor([segment_ids])
    return tokens, ids_batch, segments_batch

def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Run the embedding model and return its last-layer token vectors.

    Args:
        tokens_tensor (obj): Torch tensor of token ids, as produced by
            bert_text_preparation (leading batch dimension of 1).
        segments_tensors (obj): Torch tensor of segment ids with the
            same shape as ``tokens_tensor``.
        model (obj): Callable embedding model. Element 2 of its output
            must be the sequence of per-layer hidden states.

    Returns:
        list: List of lists of floats, one inner list per token
            ([n_tokens, n_embedding_dimensions] for a batch of 1).
    """
    # NOTE(review): both tensors are passed positionally. For Hugging Face
    # BertModel the second positional parameter of forward() appears to be
    # attention_mask, not token_type_ids, so the segment ids may not be used
    # as segment ids here. Confirm the intended call before relying on them.

    # Gradient calculation is disabled for the forward pass.
    with torch.no_grad():
        model_output = model(tokens_tensor, segments_tensors)
        # Entry 0 of the hidden states is the input state, so it is dropped.
        layer_states = model_output[2][1:]

    # Keep only the final layer and drop the batch dimension when it is 1.
    final_layer = layer_states[-1].squeeze(dim=0)

    # Convert each row tensor into a plain Python list.
    return [row.tolist() for row in final_layer]



Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 5.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 32.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 31.5MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Mounted at /content/gdrive/


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




In [None]:
# Debug check: show the module search path so we can confirm that the
# Drive corpora directory appended during setup is present.
print(sys.path)

['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/gdrive/MyDrive/corpora']


In [None]:
from scripts.anntools import Collection
from pathlib import Path
from our_tagger import get_tag_list
import numpy as np
import string
import torch
import torch.nn as nn

# Annotated training corpus, loaded from the mounted Drive.
c = Collection()
c.load(Path("/content/gdrive/MyDrive/corpora/2021/ref/training/medline.1200.es.txt"))

# Tag inventory (22 entries). A tag's position in this list is its class
# index: training looks tags up with TAGS.index and category_from_output
# maps a predicted index back to a tag, so the order must not change.
# NOTE(review): the prefixes/suffixes look like a positional scheme
# (B/I/L/U/V/O) combined with entity types (C/A/P/R) -- confirm against
# our_tagger.get_tag_list.
TAGS = [
    'B_C', 'I_C', 'L_C',
    'B_A', 'I_A', 'L_A',
    'B_P', 'I_P', 'L_P',
    'B_R', 'I_R', 'L_R',
    'U_C', 'U_A', 'U_P', 'U_R',
    'V_C', 'V_A', 'V_P', 'V_R',
    'O', 'V',
]

class RNN(nn.Module):
    """Minimal recurrent cell built from two linear layers.

    At each step the input vector is concatenated with the previous
    hidden state. One linear layer maps that to the next hidden state,
    the other maps it to log-probabilities over the output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size (int): Length of each input vector.
            hidden_size (int): Length of the hidden state.
            output_size (int): Number of output classes.
        """
        super().__init__()
        combined_size = input_size + hidden_size
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance one step.

        Args:
            input: 1-D tensor of length ``input_size``; a batch
                dimension of 1 is added here.
            hidden: Tensor of shape (1, hidden_size).

        Returns:
            tuple: (log-probabilities of shape (1, output_size),
                next hidden state of shape (1, hidden_size)).
        """
        joined = torch.cat([input.unsqueeze(0), hidden], dim=1)
        next_hidden = self.i2h(joined)
        log_probs = self.softmax(self.i2o(joined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros((1, self.hidden_size))

def category_from_output(output):
    """Map a model output row to its most likely tag.

    Args:
        output: Tensor of per-class scores with a leading batch
            dimension; only the first row is used.

    Returns:
        tuple: (tag string looked up in TAGS, its integer index).
    """
    _, best_indices = output.topk(1)
    category_i = best_indices[0].item()
    return TAGS[category_i], category_i

# Negative log-likelihood loss; pairs with the log-probabilities that the
# network's LogSoftmax layer produces.
criterion = nn.NLLLoss()
learning_rate = 0.005
# Placeholder: must be rebound at module level to a real optimizer before
# train() is called, because train() reads this global.
optimizer = None

def train(rnn, category_tensor, line_tensor):
    """Run one optimisation step on a single sequence.

    Args:
        rnn: Network exposing ``initHidden``, ``zero_grad`` and a call
            signature ``rnn(step_input, hidden) -> (output, hidden)``.
        category_tensor: Target class index tensor passed to the loss.
        line_tensor: Sequence tensor; it is fed to ``rnn`` one entry of
            its first dimension at a time.

    Returns:
        tuple: (output of the last step, loss as a Python float).
    """
    rnn.zero_grad()
    hidden = rnn.initHidden()

    n_steps = line_tensor.shape[0]
    for step in range(n_steps):
        output, hidden = rnn(line_tensor[step], hidden)

    # Only the output of the final step contributes to the loss.
    loss = criterion(output, category_tensor)
    loss.backward()
    optimizer.step()

    return output, loss.item()

# Characters removed by strip_punctutation. The backslash is written as an
# escaped "\\" because a bare "\," is an invalid escape sequence.
_PUNCTUATION_TO_STRIP = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
# Translation table built once: maps each listed character to "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


def strip_punctutation(text):
    """Remove a fixed set of ASCII punctuation characters from ``text``.

    Only the characters listed in ``_PUNCTUATION_TO_STRIP`` are removed;
    other symbols (for example ``+``, ``=``, or ``¿``) are left untouched.

    Args:
        text (str): Input string.

    Returns:
        str: ``text`` with every listed character deleted. An empty
            string is returned unchanged.
    """
    # Single pass over the string instead of one replace() per character.
    return text.translate(_PUNCTUATION_TABLE)

# Number of leading BERT embedding dimensions fed to the tagger. This must
# equal the input size the RNN is constructed with.
_EMBEDDING_DIM = 178


def _word_embeddings(tokenized_text, list_token_embeddings, dim=_EMBEDDING_DIM):
    """Collapse word-piece embeddings into one ``dim``-length vector per word.

    The first token ([CLS]) and the last token ([SEP]) are skipped. A word
    starts at any other token; every following token carrying the "##"
    continuation prefix is summed element-wise into that word's vector.

    Args:
        tokenized_text (list): Tokens including the [CLS]/[SEP] markers.
        list_token_embeddings (list): One list of floats per token.
        dim (int): Number of leading dimensions kept from each embedding.

    Returns:
        list: One list of ``dim`` floats per word, in text order.
    """
    n_tokens = len(tokenized_text)
    word_vectors = []
    i = 1  # position 0 is [CLS]
    while i < n_tokens - 1:  # the final position is [SEP]
        vector = list_token_embeddings[i][:dim]
        i += 1
        while i < n_tokens and tokenized_text[i].startswith('##'):
            piece = list_token_embeddings[i][:dim]
            vector = [a + b for a, b in zip(vector, piece)]
            i += 1
        word_vectors.append(vector)
    return word_vectors


def training(rnn, sentences):
    """Train ``rnn`` for one pass over ``sentences``, one word at a time.

    For each sentence the reference tags are read first, then the
    sentence's ``text`` attribute is overwritten in place with its
    punctuation-stripped form, embedded with BERT, and each word vector is
    passed to ``train`` together with the index of its tag in TAGS.

    Each word is given to ``train`` as a sequence of length 1, so the
    recurrent hidden state is never carried from one word to the next.

    Args:
        rnn: Network to train.
        sentences: Iterable of sentence objects with a ``text`` attribute
            that ``get_tag_list`` accepts.

    Raises:
        IndexError: If a sentence yields more words than tags.
        ValueError: If a tag is not present in TAGS.
    """
    for s in sentences:
        # Tags are taken before the text is modified below.
        tags = get_tag_list(s)
        s.text = strip_punctutation(s.text)
        tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(s.text, tokenizer)
        list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
        word_vectors = _word_embeddings(tokenized_text, list_token_embeddings)
        for j, vector in enumerate(word_vectors):
            category_tensor = torch.tensor([TAGS.index(tags[j])], dtype=torch.long)
            line_tensor = torch.FloatTensor(vector).view(1, -1)
            train(rnn, category_tensor, line_tensor)


def evaluate(rnn, line_tensor):
    """Feed ``line_tensor`` through ``rnn`` and return the last step's output.

    Args:
        rnn: Network exposing ``initHidden`` and a call signature
            ``rnn(step_input, hidden) -> (output, hidden)``.
        line_tensor: Sequence tensor, consumed along its first dimension.

    Returns:
        The output produced at the final step.
    """
    hidden = rnn.initHidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output


def predict(rnn, sentence):
    """Predict one tag per word of ``sentence``.

    The sentence's ``text`` attribute is overwritten in place with its
    punctuation-stripped form before it is embedded.

    Args:
        rnn: Trained network.
        sentence: Sentence object with a ``text`` attribute.

    Returns:
        list: One ``category_from_output`` result per word, in text order.
    """
    with torch.no_grad():
        sentence.text = strip_punctutation(sentence.text)
        tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence.text, tokenizer)
        list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
        outputs = []
        for vector in _word_embeddings(tokenized_text, list_token_embeddings):
            line_tensor = torch.FloatTensor(vector).view(1, -1)
            outputs.append(category_from_output(evaluate(rnn, line_tensor)))
        return outputs

# Driver: train the tagger on all but the last 100 sentences, then score it
# on those 100 held-out sentences.
if __name__=='__main__':
    n_hidden = 128
    # 178 is the number of embedding dimensions that training() and
    # predict() keep from each BERT vector, so it is the RNN input size.
    rnn = RNN(178, n_hidden, len(TAGS))
    # Bound at module level on purpose: train() reads the global `optimizer`.
    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
    training(rnn, c.sentences[:-100])
    test_s=c.sentences[-100:]
    # Reference tags are collected before predict() runs, because predict()
    # overwrites each sentence's text with its punctuation-stripped form.
    test_l=[get_tag_list(s) for s in test_s]
    acc=[]
    for i,s in enumerate(test_s):
        cat_list=predict(rnn, s)
        for j,cat in enumerate(cat_list):
          # cat is the (tag, index) pair returned by category_from_output.
          print(f'Correct {test_l[i][j]}, got {cat[0]}')
          # A token counts as a hit only when the prediction matches the
          # reference AND the predicted tag is not 'O'; correctly predicted
          # 'O' tokens are therefore recorded as misses.
          acc.append((test_l[i][j]==cat[0] and cat[0]!='O'))
    # Fraction of hits over all predicted tokens (raises ZeroDivisionError
    # if nothing was predicted).
    print(sum(acc)/len(acc))

Correct O, got O
Correct B_C, got U_C
Correct L_C, got U_C
Correct U_A, got U_A
Correct O, got O
Correct U_C, got O
Correct O, got O
Correct U_C, got U_C
Correct O, got O
Correct O, got O
Correct U_A, got U_A
Correct O, got O
Correct B_C, got O
Correct O, got O
Correct B_C, got O
Correct V, got U_C
Correct O, got O
Correct U_C, got B_C
Correct O, got I_C
Correct U_C, got U_C
Correct U_C, got U_C
Correct O, got O
Correct O, got O
Correct U_P, got B_C
Correct U_C, got U_C
Correct O, got O
Correct O, got O
Correct B_C, got O
Correct L_C, got B_C
Correct O, got O
Correct O, got O
Correct U_C, got U_C
Correct U_A, got O
Correct O, got O
Correct B_C, got U_C
Correct I_C, got O
Correct L_C, got U_C
Correct O, got O
Correct O, got O
Correct O, got O
Correct U_C, got U_C
Correct O, got O
Correct U_A, got U_A
Correct O, got O
Correct O, got O
Correct U_A, got U_A
Correct O, got O
Correct O, got O
Correct U_A, got U_A
Correct O, got O
Correct U_A, got U_A
Correct O, got O
Correct U_A, got O
Corre