In [None]:
import os

# Switch the working directory to the parent folder; later cells use relative
# references such as `scripts.labeling_rules` and `corpus/sentences_dev.jsonl`
# (presumably resolved from there -- confirm against the repository layout).
# NOTE(review): this is not idempotent -- re-running the cell moves up one more level.
os.chdir(os.pardir)

In [None]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

# Archive of the pretrained OpenIE model that the predictor is built from.
OPENIE_MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
predictor = Predictor.from_path(OPENIE_MODEL_URL)

In [None]:
import srsly
import spacy
from scripts.labeling_rules import *

# spaCy pipeline plus the custom labelling components
# (presumably registered by `scripts.labeling_rules` -- confirm).
nlp = spacy.load('en_core_web_trf')
# The "expand_entities" component is currently left out of the pipeline:
# nlp.add_pipe("expand_entities")
for component_name in ("labeling_ruler", "labeling_extend_ruler"):
    nlp.add_pipe(component_name)

In [None]:
# Load the dev sentences from the JSON Lines corpus file.
# NOTE(review): the next cell overwrites `items` with a single sample sentence.
dev_sentences_path = 'corpus/sentences_dev.jsonl'
items = srsly.read_jsonl(dev_sentences_path)

In [None]:
# One hand-picked sentence used as the working example; it replaces whatever
# `items` held before this cell ran.
sample_text = "Remediation Consulting Services, one of the segments analyzed in the report , is projected to grow at a 5.2 % to reach US$ 15.3 Billion by the end of the analysis period . "
items = [{'text': sample_text}]

In [None]:
# Pick the first sentence that contains an IX_REFERENCE_VALUE entity and run
# the OpenIE predictor on it. `doc` and `prediction` are consumed by later cells.
for doc in nlp.pipe(item['text'] for item in items):
    if any(ent.label_ == 'IX_REFERENCE_VALUE' for ent in doc.ents):
        prediction = predictor.predict(sentence=doc.text)
        break
else:
    # Without this, `prediction` would stay undefined (or stale from an earlier
    # run) and `doc` would silently be the last, non-matching sentence.
    raise ValueError("No sentence in `items` contains an IX_REFERENCE_VALUE entity")

In [None]:
# Entities found in the selected sentence, paired with their labels.
[(entity, entity.label_) for entity in doc.ents]

In [None]:
# The 'description' field of every verb entry in the prediction.
[verb_frame['description'] for verb_frame in prediction['verbs']]

In [None]:
import re

# Quick check that the capture group extracts the argument number from a tag.
found = re.match(r"ARG(\d)", "ARG4")
if found:
    print(found[1])

In [None]:
def ie_to_spans(doc, prediction):
    """Yield labelled spans (value, predicate, metric) for each OpenIE verb entry.

    For every entry in ``prediction["verbs"]`` the tag sequence is scanned for
    the first token of an ``IX_REFERENCE_VALUE`` entity whose tag carries a
    numbered argument (``ARG<n>``). When one is found:

    * all tokens tagged ``ARG<n>`` become an ``IX_REFERENCE_VALUE`` span,
    * all tokens tagged ``-V`` become a ``PREDICATE`` span,
    * all tokens tagged ``ARG<n-1>`` become a ``METRIC`` span, unless that span
      overlaps an already collected span that is at least as long.

    Args:
        doc: Document exposing ``ents`` (each with ``label_`` and iterable
            tokens that have an ``i`` index) and token access by index.
        prediction: Mapping with a ``"verbs"`` list; each entry has a ``"tags"``
            list that is indexed by token position.
            NOTE(review): assumes the tags line up one-to-one with the tokens
            of ``doc`` -- confirm both sides tokenize identically.

    Yields:
        A list of span dicts (see ``tok_inds_to_span``) per verb entry. An empty
        list is yielded when no reference value carries a numbered argument.
        Nothing is yielded for an entry that has a reference value but lacks
        either a ``-V`` tag or an ``ARG<n-1>`` tag.
    """
    for verb in prediction["verbs"]:
        tags = verb["tags"]
        spans = []
        found = _reference_arg_number(doc, tags)
        if found is not None:
            obj_inds = _tag_indices(tags, f"ARG{found}")
            spans.append(tok_inds_to_span(obj_inds, "IX_REFERENCE_VALUE", doc))

            verb_inds = _tag_indices(tags, "-V")
            if not verb_inds:
                continue
            spans.append(tok_inds_to_span(verb_inds, "PREDICATE", doc))

            # The metric is looked up in the argument numbered one below the value's.
            subj_inds = _tag_indices(tags, f"ARG{found - 1}")
            if not subj_inds:
                continue
            subj_span = tok_inds_to_span(subj_inds, "METRIC", doc)
            if not _overlaps_longer_span(subj_span, spans):
                spans.append(subj_span)
        yield spans


def _reference_arg_number(doc, tags):
    """Return the argument number tagged on the first matching reference-value token, else None."""
    for ent in doc.ents:
        if ent.label_ != "IX_REFERENCE_VALUE":
            continue
        for tok in ent:
            match = re.match(r".*ARG(\d)", tags[tok.i])
            if match:
                return int(match[1])
    return None


def _tag_indices(tags, suffix):
    """Return the positions of all tags that end with ``suffix``."""
    return [ind for ind, tag in enumerate(tags) if tag.endswith(suffix)]


def _overlaps_longer_span(candidate, spans):
    """Return True if ``candidate`` overlaps a span in ``spans`` that is at least as long."""
    candidate_len = candidate["token_end"] - candidate["token_start"]
    return any(
        # Inclusive token ranges overlap when each one starts before the other ends.
        span["token_start"] <= candidate["token_end"]
        and candidate["token_start"] <= span["token_end"]
        and span["token_end"] - span["token_start"] >= candidate_len
        for span in spans
    )


def tok_inds_to_span(inds, label, doc=None):
    """Build a span dict covering the first through the last token index in ``inds``.

    Args:
        inds: Non-empty, ascending list of token indices; only the first and
            last entries are used, so gaps in between are covered as well.
        label: Label stored in the resulting dict.
        doc: Document the indices refer to. When omitted, the notebook-level
            ``doc`` is used so that older two-argument calls keep working.

    Returns:
        Dict with character offsets (``start``, ``end``), inclusive token
        bounds (``token_start``, ``token_end``) and ``label``.

    Raises:
        IndexError: If ``inds`` is empty.
    """
    if doc is None:
        # Backward compatibility with callers that relied on the global `doc`.
        doc = globals()["doc"]
    token_start = inds[0]
    token_end = inds[-1]
    char_start = doc[token_start].idx
    char_end = doc[token_end].idx + len(doc[token_end])
    return {
        "start": char_start,
        "end": char_end,
        "token_start": token_start,
        "token_end": token_end,
        "label": label,
    }

In [None]:
# `ie_to_spans` is a generator; store its results as a list so the following
# cell can be re-run without finding the iterator already exhausted.
spans = list(ie_to_spans(doc, prediction))

In [None]:
print(doc)
# `ie_to_spans` produces one list of span dicts per verb entry, so iterate both
# levels; indexing the outer item with 'label' would raise a TypeError.
for verb_spans in spans:
    for span in verb_spans:
        print(span['label'], doc[span['token_start']: span['token_end'] + 1])