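# Performance on AI CUP NER Task

Each cell lists the number of matched entities followed by three percentages. The first is the match count divided by the entity count shown in parentheses in the `Type` column, and the third is the F1 score (harmonic mean) of the first two.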


|  Type                 |          complete match          |       right boundary match       |       left boundary match        |
| ------------------- | ---------------------------------- | ---------------------------------- | ---------------------------------- | 
|   Gene    ( 9002) |  7873 (87.46% / 82.88% / 85.11%) |  8236 (91.49% / 86.70% / 89.03%) |  8228 (91.40% / 86.62% / 88.95%) |
| Chemical  ( 3861) |  3144 (81.43% / 78.35% / 79.86%) |  3229 (83.63% / 80.46% / 82.02%) |  3238 (83.86% / 80.69% / 82.25%) |
|  Disease  ( 4224) |  3521 (83.36% / 79.71% / 81.50%) |  3743 (88.61% / 84.74% / 86.63%) |  3659 (86.62% / 82.84% / 84.69%) |
| Partial_G (   53) |    25 (47.17% / 73.53% / 57.47%) |    25 (47.17% / 73.53% / 57.47%) |    25 (47.17% / 73.53% / 57.47%) |
|  [-ALL-]  (17140) | 14563 (84.96% / 81.07% / 82.97%) | 15233 (88.87% / 84.80% / 86.79%) | 15150 (88.39% / 84.34% / 86.32%) |


In [None]:
import torch
import logging
from spacy import displacy

import tokenization
from modeling import BertConfig, BertForNER
from run_bioner import convert_examples_to_features, InputExample, AICupProcessor

# ---- Settings --------------------------------------------------------------
bert_config_file = "pretrained_model/bert_config_bioner.json"
vocab_file = "pretrained_model/vocab.txt"
init_checkpoint = "model_step_2564.pt"
max_seq_length = 400
device = 'cpu'  # 'cuda' or 'cpu'

# Only let warnings (and above) through from the 'run_bioner' logger.
logger = logging.getLogger('run_bioner')
logger.setLevel(logging.WARNING)

# ---- Config, label set and tokenizer ---------------------------------------
bert_config = BertConfig.from_json_file(bert_config_file)
processor = AICupProcessor()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=False,
)

# ---- Model -----------------------------------------------------------------
model = BertForNER(bert_config, len(label_list))

# Overlay the checkpoint on top of the model's own state dict, so any entry
# absent from the checkpoint keeps the value the model was constructed with.
model_params_dict = model.state_dict()
pretrained_dict = torch.load(init_checkpoint, map_location='cpu')
model_params_dict.update(pretrained_dict)
model.load_state_dict(model_params_dict)

model.eval()  # switch the module to evaluation mode
model.to(device)
print("Loaded model")


In [None]:
def get_model_predictions(model, sentence, label_id_to_name, device='cpu'):
    """Tag a whitespace-tokenised sentence and pair each token with a label.

    Besides its arguments, this function reads the module-level names
    ``label_list``, ``max_seq_length`` and ``tokenizer`` when converting the
    sentence into model features.

    Args:
        model: Called as ``model(input_ids, segment_ids, input_mask)``; the
            result is indexed with ``[0]`` and arg-maxed over its last axis.
        sentence: Text to tag. It is split on whitespace.
        label_id_to_name: Object indexable by predicted label id that yields
            the label name (the notebook passes ``label_list``).
        device: Torch device the input tensors are moved to.

    Returns:
        A ``zip`` iterator of ``(token, label)`` pairs. It can be consumed
        only once; wrap it in ``list`` to reuse the result.
    """
    words = sentence.split()
    input_example = InputExample(1, words, label=['O'] * len(words))
    features = convert_examples_to_features(
        [input_example], label_list, max_seq_length, tokenizer
    )

    def _as_long_tensor(rows):
        # Batch of integer rows -> LongTensor on the requested device.
        return torch.tensor(rows, dtype=torch.long).to(device)

    input_ids = _as_long_tensor([f.input_ids for f in features])
    input_mask = _as_long_tensor([f.input_mask for f in features])
    segment_ids = _as_long_tensor([f.segment_ids for f in features])

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)

    # A single example was submitted, so keep only the first entry.
    pred_label_ids = logits[0].argmax(dim=-1)

    # ``.cpu()`` is a no-op for tensors already on the CPU, so no device check
    # is needed (a ``torch.device`` never compares equal to a plain string,
    # which made a ``!= 'cpu'`` guard always true).
    pred_labels = [label_id_to_name[i] for i in pred_label_ids.cpu().tolist()]

    # Drop the first position -- presumably the leading special token added
    # during feature conversion; confirm in convert_examples_to_features.
    pred_labels = pred_labels[1:]

    # NOTE(review): text_a_map is not set here; presumably it is populated by
    # convert_examples_to_features and maps each sub-token to its source word.
    sent_toks = input_example.text_a
    sent_toks_map = input_example.text_a_map

    # Keep one tag per run of equal map entries: the tag at the first position
    # of each run. Predictions beyond the end of the map are ignored.
    restored_tags = [
        tag
        for pos, tag in enumerate(pred_labels[:len(sent_toks_map)])
        if pos == 0 or sent_toks_map[pos - 1] != sent_toks_map[pos]
    ]
    return zip(sent_toks, restored_tags)


In [None]:
# Example input; punctuation is already separated from words by spaces.
sentence = (
    "Lenalidomide induces degradation of the lymphoid transcription factors "
    "Ikaros and Aiolos ( also known as IKZF1 and IKZF3 ) , and casein kinase "
    "1α ( CK1α ) , which contributes to its clinical efficacy in the treatment "
    "of multiple myeloma and 5q-deletion associated myelodysplastic syndrome "
    "( del ( 5q ) MDS ) , respectively ."
)

# Inference only, so gradient tracking is switched off. The returned iterator
# is materialised into a list so the (token, label) pairs can be reused.
with torch.no_grad():
    predictions = list(get_model_predictions(model, sentence, label_list, device))

In [None]:
# Highlight colour per entity type, passed to displaCy through ``options``.
dis_colors = {
    "GENE": "#7aecec",
    "PARTIAL_GENE": "#bfeeb7",
    "DISEASE": "#feca74",
    "CHEMICAL": "#ff9561",
}
dis_options = {"colors": dis_colors}

# Convert per-token B-/I- labels into character spans over ``sentence``.
dis_ents = []
curr_pos = 0          # character offset from which the next token is searched
entity_open = False   # True while the previous token belonged to an entity
for word, ent_label in predictions:
    # Locate the token in the text rather than assuming a single space
    # between tokens: str.split() collapses runs of whitespace, which would
    # otherwise make every later offset drift.
    word_start = sentence.find(word, curr_pos)
    if word_start < 0:
        # Token text not present verbatim; fall back to single-space spacing.
        word_start = curr_pos
    word_end = word_start + len(word)

    if ent_label.startswith("I") and entity_open:
        # Continuation: stretch the current entity to the end of this token.
        dis_ents[-1]["end"] = word_end
    elif ent_label.startswith(("B", "I")):
        # "B-" starts an entity. An "I-" with no open entity is treated the
        # same way instead of raising IndexError on an empty list or
        # extending an unrelated earlier entity.
        dis_ents.append({"start": word_start, "end": word_end, "label": ent_label[2:]})
        entity_open = True
    else:
        entity_open = False

    curr_pos = word_end + 1

dis_sentence = {
    "text": sentence,
    "ents": dis_ents,
}

In [None]:
# Draw the hand-built entity spans inline in the notebook; manual=True tells
# displaCy that the input is a plain dict rather than a spaCy Doc.
displacy.render(
    dis_sentence,
    style="ent",
    manual=True,
    jupyter=True,
    options=dis_options,
)