Evaluate trained text-tagger models on the test set.

---

In [1]:
import os
import re
from flair.models import SequenceTagger
from flair.data import Corpus
from flair.datasets import ColumnCorpus

In [None]:
# Convenience snippet (kept commented out): uncomment to check the working directory when fixing the relative paths below
# import os
# os.getcwd()
# os.listdir("..")

Constants.

In [2]:
# root of the project checkout (code, data, .gitignore, ...), relative to this notebook
MEDRED_REPRODUCIBLE_DIR = "../"

# directories holding the preprocessed, column-formatted corpora
MEDRED_LABELS_DIR = f"{MEDRED_REPRODUCIBLE_DIR}data/AMT/labels"
MICROMED_LABELS_DIR = f"{MEDRED_REPRODUCIBLE_DIR}data/Micromed"
CADEC_LABELS_DIR = f"{MEDRED_REPRODUCIBLE_DIR}data/cadec"

# trained tagger checkpoints, one sub-directory per training run
TAGGER_MODEL_DIR = f"{MEDRED_REPRODUCIBLE_DIR}resources/taggers/"
MEDRED_MODEL = f"{TAGGER_MODEL_DIR}FA_MedRed_glove_roberta/final-model.pt"
MICROMED_MODEL = f"{TAGGER_MODEL_DIR}FA_Micromed_glove_roberta/final-model.pt"
CADEC_MODEL = f"{TAGGER_MODEL_DIR}FA_CADEC_glove_roberta/final-model.pt"
MEDRED_ALT_MODEL = f"{TAGGER_MODEL_DIR}FA_MedRed_pooled-flair_roberta/final-model.pt"

Evaluation code.

In [3]:
def read_in_data(path):
	'''
	Build a flair ``ColumnCorpus`` from the train/dev/test files found in ``path``.

	Each split file is identified by the ``_train``, ``_dev`` or ``_test`` tag
	in its file name, not by its position in the directory listing:
	``os.listdir`` returns entries in arbitrary order, so positional indexing
	could silently swap the splits.

	Parameters:
		path: directory containing one column-formatted file per split.

	Returns:
		The ``ColumnCorpus`` created with column 0 read as 'text' and
		column 1 read as 'ner'.

	Raises:
		FileNotFoundError: if no file can be found for one or more splits.
	'''
	split_regex = re.compile(r"_(train|dev|test)")

	# map split name -> file name; iterate in sorted order so that, should
	# several files carry the same tag, the choice is at least deterministic
	split_files = {}
	for filename in sorted(os.listdir(path)):
		found = split_regex.search(filename)
		if found:
			split_files.setdefault(found.group(1), filename)

	missing = [split for split in ("train", "dev", "test") if split not in split_files]
	if missing:
		raise FileNotFoundError(
			"No file found in {!r} for split(s): {}.".format(path, ", ".join(missing)))

	# define columns
	columns = {0: 'text', 1: 'ner'}

	# init a corpus using column format, data folder and the names of the train, dev and test files
	corpus: Corpus = ColumnCorpus(path, columns,
		train_file=split_files["train"],
		test_file=split_files["test"],
		dev_file=split_files["dev"])
	return corpus

def eval(model, alt_medred=False):
	'''
	Evaluate one of the trained taggers on the test split of its own corpus.

	NOTE: this function shadows the Python builtin ``eval`` for the rest of
	the notebook; the name is kept so that existing calls keep working.

	Parameters:
		model: name of the tagger to evaluate, one of "MedRed", "Micromed"
			or "CADEC".
		alt_medred: if True and ``model`` is "MedRed", load MEDRED_ALT_MODEL
			instead of MEDRED_MODEL. Ignored for the other models.

	Returns:
		The object returned by ``SequenceTagger.evaluate``; it is also printed.

	Raises:
		ValueError: if ``model`` is not one of the supported names.
	'''
	# validate first so that a typo fails fast, before any data or model is loaded
	if model not in ("MedRed", "Micromed", "CADEC"):
		raise ValueError("`model` must be one of MedRed, Micromed, CADEC.")

	if model == "MedRed":
		labels_dir = MEDRED_LABELS_DIR
		model_path = MEDRED_ALT_MODEL if alt_medred else MEDRED_MODEL
	elif model == "Micromed":
		labels_dir = MICROMED_LABELS_DIR
		model_path = MICROMED_MODEL
	else:
		labels_dir = CADEC_LABELS_DIR
		model_path = CADEC_MODEL

	corpus = read_in_data(labels_dir)

	# load the trained tagger; kept in its own variable rather than rebinding
	# the `model` argument (a string) to a different kind of object
	tagger = SequenceTagger.load(model_path)

	# evaluate on test set
	result = tagger.evaluate(corpus.test, gold_label_type='ner')
	print(result)
	return result

In [15]:
# evaluate the default MedRed tagger (MEDRED_MODEL) on the MedRed test split
medred_eval = eval("MedRed")

2022-05-07 13:48:36,423 Reading data from ..\data\AMT\labels
2022-05-07 13:48:36,423 Train: ..\data\AMT\labels\NER_Reddit_AMT_labels_href_train.csv
2022-05-07 13:48:36,424 Dev: ..\data\AMT\labels\NER_Reddit_AMT_labels_href_dev.csv
2022-05-07 13:48:36,425 Test: ..\data\AMT\labels\NER_Reddit_AMT_labels_href_test.csv
2022-05-07 13:48:38,045 loading file ../resources/taggers/FA_MedRed_glove_roberta/final-model.pt
2022-05-07 13:48:40,670 SequenceTagger predicts: Dictionary with 11 tags: O, S-DIS, B-DIS, E-DIS, I-DIS, S-DRUG, B-DRUG, E-DRUG, I-DRUG, <START>, <STOP>


100%|██████████| 17/17 [00:37<00:00,  2.19s/it]

2022-05-07 13:49:18,052 Evaluating as a multi-label problem: False

Results:
- F-score (micro) 0.6909
- F-score (macro) 0.7097
- Accuracy 0.5307

By class:
              precision    recall  f1-score   support

         DIS     0.6943    0.6528    0.6729       769
        DRUG     0.7240    0.7702    0.7464       235

   micro avg     0.7020    0.6803    0.6909      1004
   macro avg     0.7092    0.7115    0.7097      1004
weighted avg     0.7013    0.6803    0.6901      1004

Loss: 0.18928715586662292'





In [16]:
# evaluate the Micromed tagger (MICROMED_MODEL) on the Micromed test split
micromed_eval = eval("Micromed")

2022-05-07 13:49:18,137 Reading data from ..\data\Micromed
2022-05-07 13:49:18,138 Train: ..\data\Micromed\NER_Micromed_labels_train.csv
2022-05-07 13:49:18,139 Dev: ..\data\Micromed\NER_Micromed_labels_dev.csv
2022-05-07 13:49:18,140 Test: ..\data\Micromed\NER_Micromed_labels_test.csv
2022-05-07 13:49:18,206 loading file ../resources/taggers/FA_Micromed_glove_roberta/final-model.pt
2022-05-07 13:49:21,416 SequenceTagger predicts: Dictionary with 11 tags: O, S-DIS, B-DIS, E-DIS, I-DIS, S-DRUG, B-DRUG, E-DRUG, I-DRUG, <START>, <STOP>


100%|██████████| 3/3 [00:00<00:00,  3.85it/s]

2022-05-07 13:49:22,504 Evaluating as a multi-label problem: False

Results:
- F-score (micro) 0.7293
- F-score (macro) 0.7301
- Accuracy 0.5739

By class:
              precision    recall  f1-score   support

         DIS     0.6711    0.7969    0.7286        64
        DRUG     0.7895    0.6818    0.7317        22

   micro avg     0.6947    0.7674    0.7293        86
   macro avg     0.7303    0.7393    0.7301        86
weighted avg     0.7013    0.7674    0.7294        86

Loss: 0.12414750456809998'





In [17]:
# evaluate the CADEC tagger (CADEC_MODEL) on the CADEC test split
cadec_eval = eval("CADEC")

2022-05-07 13:49:22,634 Reading data from ..\data\cadec
2022-05-07 13:49:22,635 Train: ..\data\cadec\NER_CADEC_labels_train.csv
2022-05-07 13:49:22,635 Dev: ..\data\cadec\NER_CADEC_labels_dev.csv
2022-05-07 13:49:22,636 Test: ..\data\cadec\NER_CADEC_labels_test.csv
2022-05-07 13:49:23,785 loading file ../resources/taggers/FA_CADEC_glove_roberta/final-model.pt
2022-05-07 13:49:26,262 SequenceTagger predicts: Dictionary with 11 tags: O, S-DRUG, B-DRUG, E-DRUG, I-DRUG, S-DIS, B-DIS, E-DIS, I-DIS, <START>, <STOP>


100%|██████████| 7/7 [00:15<00:00,  2.16s/it]

2022-05-07 13:49:41,703 Evaluating as a multi-label problem: False

Results:
- F-score (micro) 0.8475
- F-score (macro) 0.5902
- Accuracy 0.7353

By class:
              precision    recall  f1-score   support

        DRUG     0.8943    0.9026    0.8984       431
         DIS     0.6111    0.1833    0.2821        60

   micro avg     0.8830    0.8147    0.8475       491
   macro avg     0.7527    0.5429    0.5902       491
weighted avg     0.8597    0.8147    0.8231       491

Loss: 0.018556272611021996'





In [4]:
# evaluate the alternative MedRed tagger (MEDRED_ALT_MODEL) on the same MedRed test split
medred_alt_eval = eval("MedRed", alt_medred=True)

2022-05-08 12:25:55,734 Reading data from ..\data\AMT\labels
2022-05-08 12:25:55,734 Train: ..\data\AMT\labels\NER_Reddit_AMT_labels_href_train.csv
2022-05-08 12:25:55,735 Dev: ..\data\AMT\labels\NER_Reddit_AMT_labels_href_dev.csv
2022-05-08 12:25:55,735 Test: ..\data\AMT\labels\NER_Reddit_AMT_labels_href_test.csv
2022-05-08 12:25:57,226 loading file ../resources/taggers/FA_MedRed_pooled-flair_roberta/final-model.pt
2022-05-08 12:26:03,552 SequenceTagger predicts: Dictionary with 11 tags: O, S-DIS, B-DIS, E-DIS, I-DIS, S-DRUG, B-DRUG, E-DRUG, I-DRUG, <START>, <STOP>


100%|██████████| 17/17 [00:55<00:00,  3.24s/it]

2022-05-08 12:26:59,488 Evaluating as a multi-label problem: False

Results:
- F-score (micro) 0.683
- F-score (macro) 0.7
- Accuracy 0.5221

By class:
              precision    recall  f1-score   support

         DIS     0.7617    0.5943    0.6676       769
        DRUG     0.8168    0.6638    0.7324       235

   micro avg     0.7750    0.6106    0.6830      1004
   macro avg     0.7892    0.6291    0.7000      1004
weighted avg     0.7746    0.6106    0.6828      1004

Loss: 0.1037561297416687'



