In [77]:
import os
import re

import spacy
from tqdm import tqdm
# Entity labels, spelled as they are searched for in the .ann files.
tag_names = [
  "Sign_symptom",
  "Diagnostic_procedure",
  "Biological_structure",
  "Disease_disorder",
  "Medication",
]

# TODO: Change these paths
parent_path = "/MACCROBAT2018/"  # directory holding the .ann/.txt pairs and all_files.txt
ner_model_path = "/ner_model"  # location handed to spacy.load

# Get Annotations


In [78]:
# Read an annotation (.ann) file and its text (.txt) file and collect the annotations

def get_annotations(ann_file_path : str , txt_file_path:str, tag_names:list):
  """
  Read an annotation file and its text file and collect the wanted entities.

  Only lines that start with "T" and whose entity-type field (the second
  whitespace-separated field) is exactly one of ``tag_names`` are kept.
  Lines whose start or end offset is not a plain non-negative integer
  (e.g. "15;20") and lines with too few fields are skipped.

  Args:
    ann_file_path: path of the annotation file.
    txt_file_path: path of the text file the annotations refer to.
    tag_names: entity types to keep, spelled as in the annotation file.
      An empty list keeps nothing.

  Returns:
    [text_file_content, (entity_label, start, end, term), ...] where
    entity_label is the upper-cased entity type, start/end are ints and
    term is the remaining fields joined with single spaces.

  Raises:
    OSError: if either file cannot be opened or read.
  """
  # Context managers guarantee both handles are closed even if a read fails.
  with open(ann_file_path, "r") as ann_fhand:
    ann_file_content = ann_fhand.read()
  with open(txt_file_path, "r") as text_fhand:
    text_file_content = text_fhand.read()

  annotation_group = [text_file_content]
  wanted_types = set(tag_names)

  # Processing into annotations.
  # Split on "\n" only, so other control characters inside a term do not
  # break a line apart.
  for line in ann_file_content.split("\n"):
    if not line.startswith("T"):
      continue

    line_components = line.split()
    # Need at least: id, entity type, start, end (the term may be empty).
    if len(line_components) < 4:
      continue

    entity_type, start_index, end_index, *term = line_components[1:]

    # Compare the type field itself; a tag name that merely occurs in the
    # covered text must not pull in a line of a different type.
    if entity_type not in wanted_types:
      continue
    if not (start_index.isdigit() and end_index.isdigit()):
      continue

    annotation = (entity_type.upper(), int(start_index), int(end_index), " ".join(term))

    # Add formed annotation to annotation group
    annotation_group.append(annotation)

  return annotation_group


# Transformation of annotations to Dictionary of annotation groups with frequency of words


In [79]:
# Transform the annotation group into a per-label dictionary of term counts
def transform_to_dict_format(annotation_group):
  """
  Count how often each term occurs under each entity label.

  ``annotation_group`` is [text, (label, start, end, term), ...]; the
  offsets are ignored here. Returns [text, {label: {term: count}}].
  """
  counts_by_label = {}
  for label, _start, _end, surface in annotation_group[1:]:
    term_counts = counts_by_label.setdefault(label, {})
    term_counts[surface] = term_counts.get(surface, 0) + 1
  return [annotation_group[0], counts_by_label]

# Functions and Evaluation metrics


In [80]:
# Load the spaCy pipeline stored at ner_model_path; evaluate_ner calls it on each text.
my_model = spacy.load(ner_model_path)

In [81]:
def evaluate_ner(filepath):
  """
  Compare the NER model's predictions for one document with its annotations.

  ``filepath`` is the document path without extension; ``filepath + ".ann"``
  and ``filepath + ".txt"`` are read. Matching is done on (label, entity
  text) with multiplicity, not on character offsets: each annotated
  occurrence of a term can be consumed by one prediction with the same
  label and text.

  A prediction is evaluated when its label is one of the upper-cased
  ``tag_names`` or occurs in this document's annotations. Predictions with
  any other label are ignored.

  Returns:
    (total_entities, found_entities, correct_entities, incorrect_entities,
     unidentified_entities, incorrect_dict, unidentified_dict) where
    total_entities is the number of annotated entities, found_entities the
    number of evaluated predictions, correct/incorrect split those
    predictions, unidentified_entities counts annotated occurrences no
    prediction matched, and the two dicts break the incorrect and
    unidentified counts down per label.
  """
  found_entities = 0
  correct_entities = 0
  incorrect_entities = 0
  incorrect_dict = dict()

  ann_group = get_annotations(ann_file_path=filepath + ".ann", txt_file_path=filepath + ".txt", tag_names=tag_names)
  content, entities_dict = transform_to_dict_format(ann_group)

  total_entities = sum(sum(term_counts.values()) for term_counts in entities_dict.values())

  # Filter predictions by the configured labels, not only by the labels this
  # document happens to contain; otherwise a false positive for a label
  # absent from the document's annotations would be dropped silently.
  evaluated_labels = {name.upper() for name in tag_names} | set(entities_dict)

  # run ner model on text
  doc = my_model(content)
  for ent in doc.ents:
    if ent.label_ not in evaluated_labels:
      continue

    found_entities += 1

    remaining = entities_dict.get(ent.label_, {})
    if remaining.get(ent.text, 0) > 0:
      correct_entities += 1
      # consume one annotated occurrence so it cannot be matched twice
      remaining[ent.text] -= 1
    else:
      incorrect_dict[ent.label_] = incorrect_dict.get(ent.label_, 0) + 1
      incorrect_entities += 1

  # Whatever is still positive was annotated but never predicted.
  unidentified_entities = 0
  unidentified_dict = dict()
  for label, term_counts in entities_dict.items():
    leftover = sum(count for count in term_counts.values() if count > 0)
    if leftover:
      unidentified_entities += leftover
      unidentified_dict[label] = leftover

  return (total_entities, found_entities, correct_entities, incorrect_entities, unidentified_entities, incorrect_dict, unidentified_dict)



In [82]:


def evaluate_all(path):
  """
  Evaluate the NER model on every document listed in ``path``/all_files.txt
  and print the aggregated counts.

  Every line of the listing that ends in ".txt" names one document; the
  matching ".ann" file is expected next to it in ``path``. The listing's
  own entry ("all_files.txt") is skipped.

  The "found" and "unidentified" percentages are relative to the total
  number of annotated entities; "correct" and "incorrect" are relative to
  the number of found entities. A percentage whose denominator is zero is
  printed as 0.0.

  Args:
    path: directory containing all_files.txt and the .txt/.ann pairs.

  Returns:
    None. Results are printed.
  """
  def percent(part, whole):
    # Avoid ZeroDivisionError when nothing was annotated or predicted.
    return (part / whole) * 100 if whole else 0.0

  # This file contains all the names of the ann/txt file
  with open(os.path.join(path, "all_files.txt"), "r") as fhand_filenames:
    all_filenames = fhand_filenames.readlines()

  all_total_entities = 0
  all_found_entities = 0
  all_correct_entities = 0
  all_incorrect_entities = 0
  all_unidentified_entities = 0
  all_incorrect_dict = dict()
  all_unidentified_dict = dict()

  # Drop the line ending first so a final line without "\n" is kept too.
  stripped_names = [filename.rstrip("\r\n") for filename in all_filenames]
  basenames = [name[:-len(".txt")] for name in stripped_names if name.endswith(".txt")]

  for basename in tqdm(basenames):
    if basename == "all_files":
      continue
    (total_entities, found_entities, correct_entities, incorrect_entities,
     unidentified_entities, incorrect_dict, unidentified_dict) = evaluate_ner(os.path.join(path, basename))
    all_total_entities += total_entities
    all_found_entities += found_entities
    all_correct_entities += correct_entities
    all_incorrect_entities += incorrect_entities
    all_unidentified_entities += unidentified_entities
    for label in incorrect_dict:
      all_incorrect_dict[label] = all_incorrect_dict.get(label, 0) + incorrect_dict[label]
    for label in unidentified_dict:
      all_unidentified_dict[label] = all_unidentified_dict.get(label, 0) + unidentified_dict[label]

  print("\nTotal entities: ", all_total_entities)
  print("Found entities: ", all_found_entities, " " , percent(all_found_entities, all_total_entities))
  print("Correct entities: ", all_correct_entities, " " , percent(all_correct_entities, all_found_entities))
  print("Incorrect entities: ", all_incorrect_entities , " ",  percent(all_incorrect_entities, all_found_entities))
  print("Unidentified entities: ", all_unidentified_entities , " ",  percent(all_unidentified_entities, all_total_entities))
  print("Incorrect Dict : ", str(all_incorrect_dict))
  print("Unidentified Dict : ", str(all_unidentified_dict))


In [83]:
evaluate_all(parent_path)

100%|██████████| 201/201 [00:09<00:00, 22.19it/s]


Total entities:  13261
Found entities:  13288   100.20360455470929
Correct entities:  13063   98.30674292594821
Incorrect entities:  225   1.6932570740517758
Unidentified entities:  198   1.493100067868185
Incorrect Dict :  {'DIAGNOSTIC_PROCEDURE': 79, 'SIGN_SYMPTOM': 53, 'DISEASE_DISORDER': 23, 'BIOLOGICAL_STRUCTURE': 57, 'MEDICATION': 13}
Unidentified Dict :  {'BIOLOGICAL_STRUCTURE': 44, 'DISEASE_DISORDER': 31, 'DIAGNOSTIC_PROCEDURE': 71, 'SIGN_SYMPTOM': 39, 'MEDICATION': 13}



