# ConNER: Text Preprocessing

This notebook prepares a pipeline that preprocesses an incoming text (abstract) and updates it to the format expected by ConNER.

## 1. Imports

In [24]:
## Model definition related
import os
import bs4
import numpy as np
import json
import pandas as pd
from pprint import pprint

## 2. Loading the data
- Expects an incoming text with only the fields of "title" and "abstract"

In [30]:
## Loads the processed CSV files from Nomita and Woojae.
## If we want to use data from PubMed database direct, we will have to build a processing pipeline for that.
train_path = "./data/OfficialTrainingSet1.csv"
test_path = "./data/OfficialTestSet1.csv"
val_path = "./data/OfficialValidationSet1.csv"

# Reading the files but only retaining the title and abstract columns
df_train = pd.read_csv(train_path)[['title', 'abstract']]
df_val = pd.read_csv(test_path)[['title', 'abstract']]
df_test = pd.read_csv(val_path)[['title', 'abstract']]

# Forming a new column with the merged texts
df_train['text'] = df_train["title"] + " " + df_train["abstract"]
df_val['text'] = df_val["title"] + " " + df_val["abstract"]
df_test['text'] = df_test["title"] + " " + df_test["abstract"]

# This will be the starting point for further preprocessing.
df_test['text'][0]

'Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant. A newborn with massive tricuspid regurgitation, atrial flutter, congestive heart failure, and a high serum lithium level is described. This is the first patient to initially manifest tricuspid regurgitation and atrial flutter, and the 11th described patient with cardiac disease among infants exposed to lithium compounds in the first trimester of pregnancy. Sixty-three percent of these infants had tricuspid valve involvement. Lithium carbonate may be a factor in the increasing incidence of congenital heart disease when taken during early pregnancy. It also causes neurologic depression, cyanosis, and cardiac arrhythmia when consumed prior to delivery.'

In [47]:
def convert_text_to_ConNER_format(df):
    '''
    Takes in a dataframe and returns an "example" object that can be taken by the
    "load_and_cache_examples" function from the data_utils.py function from the ConNER repo.
    
    Inputs:
    - df: dataframe with a "text" column that contains the paragraph combining the title and abstract of a journal
    
    Output:
    - example object derived from the InputExample function
    
    Other Prerequisites:
    - InputExample functionm, imported from the data_utils.py module
    '''
    from data_utils import InputExample
    
    mode = "doc_dev"  ## just inherited from the ConNER codes, stands for document-based evaluation for the dev set.
    
    texts = df['text']
    guid_index = 1
    examples = []
    
    for text in texts:
        
        words =  text.split()
        labels = [0] * len(words)  ## just set labels to 0 as a dummy as we are doing inference
        
        if "tags_hp" in labels:
            hp_labels = item["tags_hp"]
        else:
            hp_labels = [None]*len(labels)
                
        examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
                                     words=words,
                                     labels=labels,
                                     hp_labels=hp_labels))
        guid_index += 1
        
    return examples

In [48]:
test_set = convert_text_to_ConNER_format(df_test)
test_set

[<data_utils.InputExample at 0x21d08a8ed60>,
 <data_utils.InputExample at 0x21d07e1c370>,
 <data_utils.InputExample at 0x21d07e1c640>,
 <data_utils.InputExample at 0x21d07e1cd00>,
 <data_utils.InputExample at 0x21d08a93ee0>,
 <data_utils.InputExample at 0x21d08a93130>,
 <data_utils.InputExample at 0x21d08a933a0>,
 <data_utils.InputExample at 0x21d07fcdf40>,
 <data_utils.InputExample at 0x21d07fcdca0>,
 <data_utils.InputExample at 0x21d07fcd850>,
 <data_utils.InputExample at 0x21d07fcddf0>,
 <data_utils.InputExample at 0x21d07fcd670>,
 <data_utils.InputExample at 0x21d07fcd970>,
 <data_utils.InputExample at 0x21d07fcdf10>,
 <data_utils.InputExample at 0x21d07fcddc0>,
 <data_utils.InputExample at 0x21d07f83d00>,
 <data_utils.InputExample at 0x21d07f83c70>,
 <data_utils.InputExample at 0x21d07f83e20>,
 <data_utils.InputExample at 0x21d07f83bb0>,
 <data_utils.InputExample at 0x21d07f83d60>,
 <data_utils.InputExample at 0x21d07f83c40>,
 <data_utils.InputExample at 0x21d07f83a30>,
 <data_uti

## 3. Converting the Dataset Format & Loading Model
- Built based on the load_and_cache_examples from the data_utils.py from the ConNER repo.

In [50]:
from transformers import BertPreTrainedModel,BertForTokenClassification, BertModel, RobertaModel, RobertaTokenizer, BertPreTrainedModel, RobertaConfig
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import torch.nn as nn
import torch.nn.functional as F
from  torch.nn.utils.rnn  import pack_padded_sequence

from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, KLDivLoss

from transformers import BertConfig, RobertaConfig

In [70]:
## Eval related
import argparse
import logging
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

#Remember to copy the "data_utils.py" file from ConNER's repo
from data_utils import tag_to_id, get_chunks, get_labels, convert_examples_to_features
from flashtool import Logger
logger = logging.getLogger(__name__)

In [59]:
from ConNER_model_definition import RobertaForTokenClassification_v2

## Loading model
model_path = "./ConNER"

## It appears the checkpoint is a Roberta-based model as loading it using BERT model yields an error.
#test_model  = BERTForTokenClassification_v2.from_pretrained(model_path)

test_model = RobertaForTokenClassification_v2.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [71]:
def load_and_cache_examples(args, df, tokenizer, labels, pad_token_label_id, mode,
                            entity_name='bc5cdr', remove_labels=False):
    
    examples = convert_text_to_ConNER_format(df)
    features = convert_examples_to_features(
        examples,
        labels,
        args.max_seq_length,
        tokenizer,
        cls_token_at_end=bool(args.model_type in ["xlnet"]),
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(args.model_type in ["roberta"]),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(args.model_type in ["xlnet"]),
        # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        pad_token_label_id=pad_token_label_id,
        entity_name=entity_name,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    all_full_label_ids = torch.tensor([f.full_label_ids for f in features], dtype=torch.long)
    all_hp_label_ids = torch.tensor([f.hp_label_ids for f in features], dtype=torch.long)
    all_entity_ids = torch.tensor([f.entity_ids for f in features], dtype=torch.long)
    if remove_labels:
        all_full_label_ids.fill_(pad_token_label_id)
        all_hp_label_ids.fill_(pad_token_label_id)
    all_ids = torch.tensor([f for f in range(len(features))], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_full_label_ids, all_hp_label_ids, all_entity_ids, all_ids)
    
    return dataset

In [76]:
device = torch.device("cuda")
pad_token_label_id = CrossEntropyLoss().ignore_index
labels = ['O', 'B-Chemical', 'B-Disease', 'I-Chemical', 'I-Disease']

parser = argparse.ArgumentParser()
args = parser.parse_args("")

args.model_type = "roberta"
args.model_name_or_path = "./ConNER"
args.max_seq_length = 512   ## modified from 128
args.per_gpu_train_batch_size = 8
args.per_gpu_eval_batch_size = 8
args.n_gpu = 1
args.device = device
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
args.local_rank = -1

args.gradient_accumulation_steps = 1
args.learning_rate = 5e-5
args.weight_decay = 0.0
args.adam_epsilon = 1e-8
args.adam_beta1 = 0.9
args.adam_beta2 = 0.98
args.max_grad_norm = 1.0
args.num_train_epochs = 3.0
args.max_steps = -1
args.warmup_steps = 0
args.logging_steps = 10000
args.save_steps = 10000
args.seed = 1


eval_dataset = load_and_cache_examples(args, df_test, tokenizer, labels, pad_token_label_id, mode="doc_dev")
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

## 4. Testing Out Inferencing
- Things look fine.

In [77]:
test_model.to(device)

test_model.eval()

nb_eval_steps = 0
preds = None
out_label_ids = None

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = tuple(t.to(args.device) for t in batch)

    with torch.no_grad():
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
        if args.model_type != "distilbert":
            inputs["token_type_ids"] = (
                batch[2] if args.model_type in ["bert", "xlnet"] else None
            )  # XLM and RoBERTa don"t use segment_ids
        outputs = test_model(**inputs)
        tmp_eval_loss, logits = outputs[:2]

        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()

    nb_eval_steps += 1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs["labels"].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

preds = np.argmax(preds, axis=2)

Evaluating: 100%|██████████| 63/63 [00:24<00:00,  2.58it/s]


In [78]:
## This is the result from this live run
preds

array([[0, 2, 4, ..., 0, 0, 0],
       [0, 1, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 2, 2, 2],
       [0, 2, 4, ..., 0, 0, 0]], dtype=int64)

In [79]:
preds.shape

(500, 512)

In [80]:
print(f"Shape of the prediction numpy array: {preds[0].shape}")
preds[0]

Shape of the prediction numpy array: (512,)


array([0, 2, 4, 4, 4, 4, 4, 4, 4, 0, 1, 3, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 4, 4, 4, 4, 4, 0, 2, 4, 4, 0, 2, 4, 4, 4, 4, 0,
       0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4,
       4, 4, 4, 4, 0, 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
       0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 4, 0, 2, 4, 0, 0, 2,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## 5. Comparing Against Old Results
- Looks the same, so the current pipeline is the correct implementation

In [62]:
# Compare against the old results inherited from the previous notebook
preds  ## DO NOT RE-RUN THIS CELL.  THIS SIMPLY SHOWS THE ACTUAL RESULTS WHEN FEEDING IN THE DEFAULT DATASET

array([[0, 2, 4, ..., 0, 0, 0],
       [0, 1, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 2, 4, ..., 0, 0, 0]], dtype=int64)

In [63]:
## 500 samples, with each having 512 tokens (max token length):
preds.shape

(500, 512)

In [67]:
## Let's also look at the output for the first sample:
print(f"Shape of the prediction numpy array: {preds[0].shape}")
preds[0]

Shape of the prediction numpy array: (512,)


array([0, 2, 4, 4, 4, 4, 4, 4, 4, 0, 1, 3, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 4, 4, 4, 4, 4, 0, 2, 4, 4, 0, 2, 4, 4, 4, 4, 0,
       0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4,
       4, 4, 4, 4, 0, 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
       0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 4, 0, 2, 4, 0, 0, 2,
       4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,