## NLP Setup

This section sets up the NLP pipeline, including any custom components.

In [None]:
# Takes about 1 minute
!python -m coreferee install en

In [None]:
!pip install tokenizers==0.12.1

In [None]:
import operator
import pickle
import sys
import spacy
from spacy import displacy
import benepar
import coreferee


In [None]:
# Build the shared pipeline object: a spaCy core model extended with the
# 'benepar' and 'coreferee' pipes.
# Other core models to consider: "en_core_web_trf", or maybe the lg variant.
benepar.download("benepar_en3")  # model referenced by the 'benepar' pipe config below
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
nlp.add_pipe("coreferee")

In [None]:
# Sanity check: show the names of the components currently in the pipeline.
print(nlp.pipe_names)

In [None]:
# Sample contract sentence run through the full pipeline; presumably chosen because
# the pronoun "its" gives the coreference step something to resolve.
doc = nlp("The seller shall deliver the order in one delivery to the buyer at its warehouse.")

In [None]:
# Print the coreference chains found for the sample document.
doc._.coref_chains.print()

In [None]:
# Walk every coreference chain: print the chain summary, its most specific
# mention, and then each mention together with the tokens it points at.
for chain in doc._.coref_chains:
    most_specific_idx = chain.most_specific_mention_index
    print('-', chain.index, chain.mentions, most_specific_idx)
    most_specific_mention = chain.mentions[most_specific_idx]
    print('- msm', most_specific_mention.pretty_representation)

    for mention in chain.mentions:
        print('---', mention)
        for token_index in mention.token_indexes:
            print('-----', doc[token_index])

### Domain event tagging

In [None]:
from spacy.matcher import PhraseMatcher, Matcher

# Vocabulary taken from the domain model; it drives the tagging patterns below.
contract_verbs = ["activate", "terminate", "suspend"]
contract_event_nouns = ["payment", "delivery"]
contract_event_verbs = ["pay", "deliver"]  # not referenced by the patterns in this cell

# Open question: separate pattern sets for contract event, ob event, power event, etc.?
domain_event_patterns = [
    # The noun "contract" followed by a verb whose lemma is one of contract_verbs.
    [
        {"LOWER": {"IN": ["contract"]}, "POS": "NOUN"},
        {"POS": "VERB", "LEMMA": {"IN": contract_verbs}},
    ],
    # An event noun followed by one or more VERB/AUX tokens.
    [
        {"LOWER": {"IN": contract_event_nouns}, "POS": "NOUN"},
        {"POS": {"IN": ["VERB", "AUX"]}, "OP": "+"},
    ],
]

# All patterns are registered under the single match key "DOMAIN_EVENT".
matcher = Matcher(nlp.vocab)
matcher.add("DOMAIN_EVENT", domain_event_patterns)

In [None]:
# Smoke-test the matcher on a sentence that contains "contract terminates".
doc = nlp("Seller shall deliver the goods before the contract terminates")
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; look the id up in the vocab's
# string store to get the key it was registered under.
for match_id, start, end in matches:
    label = nlp.vocab.strings[match_id]
    print(label, doc[start:end])

In [None]:
# sentence = 'before payment is made'
# doc = nlp(sentence)
# summarizer.summarize(sentence)
# matches = matcher(doc)

# for m_id, start, end in matches:
#     print(nlp.vocab.strings[m_id], doc[start:end])

In [None]:
# SKIP THIS
from spacy.language import Language
from spacy.tokens import Span, Doc

# force=True keeps this cell re-runnable: without it, set_extension raises
# once "domain_components" has already been registered on Doc.
Doc.set_extension("domain_components", default=[], force=True)


@Language.component("domain_component")
def domain_component_function(doc):
    """Tag domain events on ``doc`` using the notebook-level ``matcher``.

    Every match is wrapped in a ``Span`` labelled "DOMAIN_EVENT" and the
    resulting list is stored on ``doc._.domain_components``. ``doc.ents`` is
    not modified.

    Args:
        doc: The spaCy ``Doc`` passed in by the pipeline.

    Returns:
        The same ``doc``, with ``doc._.domain_components`` set to the list of
        matched spans (empty when nothing matched).
    """
    # The match id is ignored: every span receives the same fixed label.
    spans = [Span(doc, start, end, label="DOMAIN_EVENT") for _, start, end in matcher(doc)]
    doc._.domain_components = spans
    return doc


# Put the component at the end of the pipeline. If an earlier run of this cell
# already added it, drop that copy first: add_pipe raises on a duplicate name,
# and re-adding makes the pipeline use the definition above.
if "domain_component" in nlp.pipe_names:
    nlp.remove_pipe("domain_component")
nlp.add_pipe("domain_component", last=True)
print(nlp.pipe_names)

In [None]:
# Register the domain-event token patterns with an EntityRuler so that matching
# spans can be reported as entities labelled "DOMAIN_EVENT".
ruler = nlp.add_pipe("entity_ruler")

# Echo the patterns being registered.
for pattern in domain_event_patterns:
    print(pattern)

# Hand all patterns to the ruler in one call rather than one call per pattern.
ruler.add_patterns(
    [{"label": "DOMAIN_EVENT", "pattern": pattern} for pattern in domain_event_patterns]
)

In [None]:
# Persist the configured pipeline to ./nlp/nlp.pickle.
from pathlib import Path

pickle_path = Path('./nlp/nlp.pickle')
# Make sure the target directory exists; open() raises FileNotFoundError otherwise.
pickle_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): loading this pickle presumably needs the same package versions and
# the custom components available in the loading process; confirm with the consumer.
with open(pickle_path, 'wb') as f:
    pickle.dump(nlp, f)

In [None]:
from app.src.sentence_summarizer import SentenceSummarizer

# Run the project's SentenceSummarizer, constructed with the notebook's `nlp`
# pipeline, on a sample sentence.
# Alternative input: "I will arrive before the payment is made"
sentence = "Seller shall deliver the goods before the contract terminates"
summarizer = SentenceSummarizer(nlp)
summarizer.summarize(sentence)


In [None]:
# Visualise the entities found in the sample sentence, then list them as
# (text, label) pairs.
# To restrict or colour labels, pass e.g.
# options={"ents": ['Fruit'], "colors": {'Fruit': "#85C1E9"}} to render().
doc = nlp(sentence)
displacy.render(doc, style="ent", jupyter=True)

entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)