In [1]:
import numpy as np
import torch
from overrides import overrides
from typing import Iterable, Dict, List
import jsonlines
import warnings

from allennlp.models.archival import load_archive
# from allennlp.models.basic_classifier import BasicClassifier
# from allennlp.modules.seq2vec_encoders.cnn_encoder import CnnEncoder
# from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder
from allennlp.predictors import Predictor #, TextClassifierPredictor

from allennlp.data import DatasetReader, Instance, TokenIndexer, Vocabulary #Tokenizer,
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token #, Tokenizer, WordTokenizer
from allennlp.data.fields import MetadataField, TextField, LabelField

from allennlp.common.util import JsonDict




In [2]:
class ModalityDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterable[Instance]:
        logger.info("Reading Modal Sense instances from {}".format(file_path))
        with open(file_path,"r") as file:
            for line in file:
                json_line = json.loads(line)
                json_line.pop("modal_verb", None)
                yield self.sentence_to_instance(**json_line)
    
    @overrides
    def text_to_instance(self, sentence:List[str], label:str=None) -> Instance:
        instance_dict = {"tokens": TextField(sentence, self._token_indexers)}
        
        if label is not None:
            instance_dict["label"] = LabelField(label)

        return Instance(instance_dict)

In [3]:
class ModalityPredictor(Predictor):

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["tokens"]
        if not hasattr(self._dataset_reader, "tokenizer") and not hasattr(
            self._dataset_reader, "_tokenizer"
        ):
            sentence = [Token(t) for t in sentence.split()]            
        return self._dataset_reader.text_to_instance(sentence=sentence)

    
    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"tokens": sentence})

In [4]:
def reset_mv_and_labels_counts(targets):
    modal_targets = {t: {"total": 0, "errors": 0} for t in targets}

    labels = {
        "dy": {"total": 0, "errors": 0},
        "de": {"total": 0, "errors": 0},
        "ep": {"total": 0, "errors": 0}
    }
    return modal_targets, labels

In [57]:
def collect_errors_and_accuarcy(predictor, filepath, labels_golds):
    with jsonlines.open(filepath, "r") as src:
        all_modal_targets = {line['modal_verb'].lower() for line in src}
        modal_verbs, labels = reset_mv_and_labels_counts(all_modal_targets)
    with jsonlines.open(filepath, "r") as test:
        predictions, golds = [], []
        for line in test:
            try:
                sentence = line["sentence"] if len(line["sentence"].split()) > 4 else line["sentence"] + ". . . "
                prediction = predictor.predict(sentence=sentence)
                predicted_label = prediction["label"]
                gold = line["label"]
                mv = line["modal_verb"].lower()
                if mv == "shall": # in EPOS and MASC the 'shall' instances have wrong json lines
                    continue
                modal_verbs[mv]["total"] += 1
                try:
                    labels[gold]["total"] += 1
                except:
                    print(gold, line)
                    continue
                if gold != predicted_label:
                    modal_verbs[mv]["errors"] += 1
                    labels[gold]["errors"] += 1
                predictions.append(prediction["probs"])
                golds.append(labels_golds[gold])
            except RuntimeError:
                sent, l = line["sentence"], line["label"]
                warnings.warn(f"sentence too short: {sent}, {l}")

        print(accuracy(np.array(predictions), np.array(golds)))
        for mv, count in modal_verbs.items():
            if count["total"] > 0:
                modal_verbs[mv]["proportion"] = count["errors"] / count["total"]
                
        for label, count in labels.items():
            if count["total"] > 0:
                labels[label]["proportion"] = count["errors"] / count["total"]
    return  modal_verbs, labels

In [59]:
def get_label_golds(predictor, model_name):    
    if "gme" in model_name.lower():
        labels_golds = {
            "dy": "all birds can fly in the winter",
            "ep": "it could have gone somewhere",
            "de": "you must wash your hands"
        }        
    elif "epos" in model_name.lower():
        labels_golds = {
            "dy": "I hope we can still convince people of this and make progress",
            "ep": "It must have been stillborn .",
            "de": "The evil is real , and it must be combated"
        }
    
    
    for sense, sentence in labels_golds.items():
        prediction = predictor.predict(sentence=sentence)
        probs = prediction["probs"]
        sense_index = probs.index(max(probs))
        labels_golds[sense] = [0 if x != sense_index else 1 for x in range(3) ]
    return labels_golds
    

In [7]:
def accuracy(predictions, labels):
    return 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0]

In [38]:
def eval_model(model_name, test_filepath,device=1):
    archive = load_archive(f"../../models/{model_name}/model.tar.gz", cuda_device=device)  
    model = archive.model
    model.eval()
    predictor = ModalityPredictor(model, dataset_reader=ModalityDatasetReader())
    label_golds = get_label_golds(predictor=predictor, model_name=model_name)
    return collect_errors_and_accuarcy(predictor, test_filepath,label_golds )

In [91]:
eval_model(model_name="rnn_gme_all", test_filepath="../../data/GME/test_modal-BIOSE-coarse_only_modal_verbs.jsonl")

Did not use initialization regex that was passed: .*linear_layers.*weight


{'logits': [0.09839901328086853, -0.5935354828834534, 0.7567489147186279], 'probs': [0.29135751724243164, 0.14585553109645844, 0.5627869367599487], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': 'it could have gone somewhere', 'de': 'you must wash your hands'}
{'logits': [-0.3284309208393097, 0.17560245096683502, 0.18500059843063354], 'probs': [0.23113901913166046, 0.3826240003108978, 0.38623693585395813], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': [0, 0, 1], 'de': 'you must wash your hands'}
{'logits': [-0.031949542462825775, -0.16448062658309937, 0.21305033564567566], 'probs': [0.31710830330848694, 0.2777474522590637, 0.40514427423477173], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': [0, 0, 1], 'de': [0, 0, 1]}
2.912621359223301


({'should': {'total': 34, 'errors': 15, 'proportion': 0.4411764705882353},
  'could': {'total': 23, 'errors': 17, 'proportion': 0.7391304347826086},
  'must': {'total': 17, 'errors': 7, 'proportion': 0.4117647058823529},
  'may': {'total': 5, 'errors': 2, 'proportion': 0.4},
  'can': {'total': 24, 'errors': 16, 'proportion': 0.6666666666666666}},
 {'dy': {'total': 31, 'errors': 28, 'proportion': 0.9032258064516129},
  'de': {'total': 58, 'errors': 28, 'proportion': 0.4827586206896552},
  'ep': {'total': 14, 'errors': 1, 'proportion': 0.07142857142857142}})

In [90]:
eval_model(model_name="rnn_gme_all", test_filepath="../../data/EPOS_E/test_EPOS+MPQA_re-balanced.jsonl")

Did not use initialization regex that was passed: .*linear_layers.*weight


{'logits': [0.09839901328086853, -0.5935354828834534, 0.7567489147186279], 'probs': [0.29135751724243164, 0.14585553109645844, 0.5627869367599487], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': 'it could have gone somewhere', 'de': 'you must wash your hands'}
{'logits': [-0.3284309208393097, 0.17560245096683502, 0.18500059843063354], 'probs': [0.23113901913166046, 0.3826240003108978, 0.38623693585395813], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': [0, 0, 1], 'de': 'you must wash your hands'}
{'logits': [-0.031949542462825775, -0.16448062658309937, 0.21305033564567566], 'probs': [0.31710830330848694, 0.2777474522590637, 0.40514427423477173], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': [0, 0, 1], 'de': [0, 0, 1]}
6.094182825484765


({'should': {'total': 28, 'errors': 14, 'proportion': 0.5},
  'could': {'total': 38, 'errors': 17, 'proportion': 0.4473684210526316},
  'must': {'total': 117, 'errors': 60, 'proportion': 0.5128205128205128},
  'may': {'total': 123, 'errors': 28, 'proportion': 0.22764227642276422},
  'can': {'total': 55, 'errors': 42, 'proportion': 0.7636363636363637},
  'shall': {'total': 0, 'errors': 0}},
 {'dy': {'total': 53, 'errors': 42, 'proportion': 0.7924528301886793},
  'de': {'total': 113, 'errors': 60, 'proportion': 0.5309734513274337},
  'ep': {'total': 195, 'errors': 59, 'proportion': 0.30256410256410254}})

In [77]:
eval_model(model_name="rnn_gme_all", test_filepath="../../data/EPOS_E/test_EPOS+MPQA_re-balanced.jsonl")

Did not use initialization regex that was passed: .*linear_layers.*weight


{'logits': [0.09839901328086853, -0.5935354828834534, 0.7567489147186279], 'probs': [0.29135751724243164, 0.14585553109645844, 0.5627869367599487], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': 'it could have gone somewhere', 'de': 'you must wash your hands'}
{'logits': [-0.3284309208393097, 0.17560245096683502, 0.18500059843063354], 'probs': [0.23113901913166046, 0.3826240003108978, 0.38623693585395813], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': [0, 0, 1], 'de': 'you must wash your hands'}
{'logits': [-0.031949542462825775, -0.16448062658309937, 0.21305033564567566], 'probs': [0.31710830330848694, 0.2777474522590637, 0.40514427423477173], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': [0, 0, 1], 'de': [0, 0, 1]}
6.094182825484765


({'should': {'total': 28, 'errors': 14, 'proportion': 0.5},
  'could': {'total': 38, 'errors': 17, 'proportion': 0.4473684210526316},
  'must': {'total': 117, 'errors': 60, 'proportion': 0.5128205128205128},
  'may': {'total': 123, 'errors': 28, 'proportion': 0.22764227642276422},
  'can': {'total': 55, 'errors': 42, 'proportion': 0.7636363636363637},
  'shall': {'total': 0, 'errors': 0}},
 {'dy': {'total': 53, 'errors': 42, 'proportion': 0.7924528301886793},
  'de': {'total': 113, 'errors': 60, 'proportion': 0.5309734513274337},
  'ep': {'total': 195, 'errors': 59, 'proportion': 0.30256410256410254}})

In [68]:
eval_model(model_name="cnn_gme_only_modal_verbs", test_filepath="../../data/GME/test_modal-BIOSE-coarse_only_modal_verbs.jsonl")

Did not use initialization regex that was passed: .*linear_layers.*weight


{'logits': [-0.13299435377120972, 0.5873146653175354, -0.5189329385757446], 'probs': [0.2677461802959442, 0.550236701965332, 0.18201713263988495], 'label': 'dy'} {'dy': [0, 1, 0], 'ep': 'it could have gone somewhere', 'de': 'you must wash your hands'}
{'logits': [-0.32890403270721436, 0.4845108389854431, -0.14231166243553162], 'probs': [0.22417837381362915, 0.5056561827659607, 0.27016544342041016], 'label': 'dy'} {'dy': [0, 1, 0], 'ep': [0, 1, 0], 'de': 'you must wash your hands'}
{'logits': [0.5168034434318542, -0.06670152395963669, -0.7430666089057922], 'probs': [0.5429971814155579, 0.30295950174331665, 0.15404334664344788], 'label': 'de'} {'dy': [0, 1, 0], 'ep': [0, 1, 0], 'de': [1, 0, 0]}
83.49514563106796


({'should': {'total': 34, 'errors': 3, 'proportion': 0.08823529411764706},
  'could': {'total': 23, 'errors': 14, 'proportion': 0.6086956521739131},
  'must': {'total': 17, 'errors': 0, 'proportion': 0.0},
  'may': {'total': 5, 'errors': 4, 'proportion': 0.8},
  'can': {'total': 24, 'errors': 7, 'proportion': 0.2916666666666667}},
 {'dy': {'total': 31, 'errors': 7, 'proportion': 0.22580645161290322},
  'de': {'total': 58, 'errors': 7, 'proportion': 0.1206896551724138},
  'ep': {'total': 14, 'errors': 14, 'proportion': 1.0}})

In [73]:
eval_model(model_name="gme_all", test_filepath="../../data/EPOS_E/test_EPOS+MPQA_re-balanced.jsonl")

Did not use initialization regex that was passed: .*linear_layers.*weight


{'logits': [-0.952390730381012, -0.9419597387313843, 1.464697241783142], 'probs': [0.07562211900949478, 0.0764150619506836, 0.8479628562927246], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': 'it could have gone somewhere', 'de': 'you must wash your hands'}
{'logits': [-1.0116074085235596, 0.44092532992362976, 0.4288536608219147], 'probs': [0.10530120134353638, 0.4500495195388794, 0.444649338722229], 'label': 'ep'} {'dy': [0, 0, 1], 'ep': [0, 1, 0], 'de': 'you must wash your hands'}
{'logits': [0.6141778826713562, -1.4141558408737183, 0.25848403573036194], 'probs': [0.5457795858383179, 0.07179978489875793, 0.38242068886756897], 'label': 'de'} {'dy': [0, 0, 1], 'ep': [0, 1, 0], 'de': [1, 0, 0]}
59.2797783933518


({'should': {'total': 28, 'errors': 6, 'proportion': 0.21428571428571427},
  'could': {'total': 38, 'errors': 24, 'proportion': 0.631578947368421},
  'must': {'total': 117, 'errors': 62, 'proportion': 0.5299145299145299},
  'may': {'total': 123, 'errors': 36, 'proportion': 0.2926829268292683},
  'can': {'total': 55, 'errors': 19, 'proportion': 0.34545454545454546},
  'shall': {'total': 0, 'errors': 0}},
 {'dy': {'total': 53, 'errors': 14, 'proportion': 0.2641509433962264},
  'de': {'total': 113, 'errors': 28, 'proportion': 0.24778761061946902},
  'ep': {'total': 195, 'errors': 105, 'proportion': 0.5384615384615384}})

In [72]:
eval_model(model_name="epos_balanced", test_filepath="../../data/GME/test_modal-BIOSE-coarse_only_modal_verbs.jsonl")

Did not use initialization regex that was passed: .*linear_layers.*weight


{'logits': [-0.5733402371406555, -0.019110115244984627, 0.4306058883666992], 'probs': [0.18282771110534668, 0.3182299733161926, 0.49894222617149353], 'label': 'dy'} {'dy': [0, 0, 1], 'ep': 'It must have been stillborn .', 'de': 'The evil is real , and it must be combated'}
{'logits': [3.168025493621826, -0.2793894112110138, -3.4387171268463135], 'probs': [0.9678864479064941, 0.03080570325255394, 0.0013078334741294384], 'label': 'ep'} {'dy': [0, 0, 1], 'ep': [1, 0, 0], 'de': 'The evil is real , and it must be combated'}
{'logits': [-0.6486820578575134, 1.5259498357772827, -1.4638288021087646], 'probs': [0.09764176607131958, 0.8591445088386536, 0.04321374371647835], 'label': 'de'} {'dy': [0, 0, 1], 'ep': [1, 0, 0], 'de': [0, 1, 0]}
71.84466019417475


({'should': {'total': 34, 'errors': 1, 'proportion': 0.029411764705882353},
  'could': {'total': 23, 'errors': 12, 'proportion': 0.5217391304347826},
  'must': {'total': 17, 'errors': 1, 'proportion': 0.058823529411764705},
  'may': {'total': 5, 'errors': 1, 'proportion': 0.2},
  'can': {'total': 24, 'errors': 14, 'proportion': 0.5833333333333334}},
 {'dy': {'total': 31, 'errors': 19, 'proportion': 0.6129032258064516},
  'de': {'total': 58, 'errors': 6, 'proportion': 0.10344827586206896},
  'ep': {'total': 14, 'errors': 4, 'proportion': 0.2857142857142857}})