## NLP Setup

This notebook sets up the NLP processing pipeline, including any custom components, and saves the configured pipeline to disk.

In [19]:
import operator
import pickle
import sys
import spacy
from spacy import displacy
import benepar

In [20]:
# Set up and store NLP
# Load spaCy's medium English pipeline as the base.
nlp = spacy.load("en_core_web_md")
# Fetch the benepar_en3 model through the NLTK downloader (the logged output
# shows this is a no-op when the package is already up to date).
benepar.download('benepar_en3')
# Add the benepar component, configured to use that model, to the pipeline.
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\regan\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


<benepar.integrations.spacy_plugin.BeneparComponent at 0x2473847fc70>

In [32]:
# Show the names of the components currently in the pipeline.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'benepar', 'entity_ruler']


In [22]:
from spacy.matcher import PhraseMatcher, Matcher

# Token-level rule matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Vocabulary taken from the domain model; it drives the tagging patterns below.
contract_verbs = ["activate", "terminate", "suspend"]
contract_event_nouns = ["payment", "delivery"]
contract_event_verbs = ["pay", "deliver"]  # not referenced by any pattern in this cell

# Open question: should contract events, "ob" events, power events, etc. each
# get their own pattern list?
domain_event_patterns = [
    # The noun "contract" followed by a verb whose lemma is a contract verb.
    [
        {"LOWER": {"IN": ["contract"]}, "POS": "NOUN"},
        {"POS": "VERB", "LEMMA": {"IN": contract_verbs}},
    ],
    # A contract-event noun followed by one or more VERB/AUX tokens.
    [
        {"LOWER": {"IN": contract_event_nouns}, "POS": "NOUN"},
        {"POS": {"IN": ["VERB", "AUX"]}, "OP": "+"},
    ],
]

matcher.add("DOMAIN_EVENT", domain_event_patterns)

In [None]:
# sentence = 'before payment is made'
# doc = nlp(sentence)
# summarizer.summarize(sentence)
# matches = matcher(doc)

# for m_id, start, end in matches:
#     print(nlp.vocab.strings[m_id], doc[start:end])

In [16]:
from spacy.language import Language
from spacy.tokens import Span, Doc

# Register the custom Doc attribute once. set_extension raises ValueError when
# the name is already registered, so guard it to keep this cell re-runnable.
if not Doc.has_extension("domain_components"):
    Doc.set_extension("domain_components", default=[])

# Custom pipeline component that records DOMAIN_EVENT matches on the Doc
@Language.component("domain_component")
def domain_component_function(doc):
    """Run the module-level ``matcher`` over ``doc`` and store the hits.

    Every match becomes a ``Span`` labelled "DOMAIN_EVENT"; the resulting list
    is written to ``doc._.domain_components``. ``doc.ents`` is left untouched.

    Returns the same ``doc`` that was passed in.
    """
    domain_spans = []
    for _match_id, span_start, span_end in matcher(doc):
        domain_spans.append(Span(doc, span_start, span_end, label="DOMAIN_EVENT"))
    doc._.domain_components = domain_spans
    return doc

# Append the component at the end of the pipeline (last=True). add_pipe raises
# if the name is already present, so skip it when the cell is re-run.
if not nlp.has_pipe("domain_component"):
    nlp.add_pipe("domain_component", last=True)
print(nlp.pipe_names)

In [34]:
# Entity Ruler add patterns

# Start from a fresh ruler so re-running this cell neither fails on the
# duplicate component name nor registers the same patterns twice.
if nlp.has_pipe("entity_ruler"):
    nlp.remove_pipe("entity_ruler")
ruler = nlp.add_pipe("entity_ruler")

# One ruler entry per token pattern. Iterate over the patterns themselves:
# enumerate() would yield (index, pattern) tuples, which are not valid patterns.
er_patterns = [
    {"label": "DOMAIN_EVENT", "pattern": pattern}
    for pattern in domain_event_patterns
]
ruler.add_patterns(er_patterns)

In [35]:
# Show the component names again to confirm the pipeline contents before saving.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'benepar', 'entity_ruler']


In [37]:
# Serialize the configured pipeline object to ./nlp/nlp.pickle.
with open('./nlp/nlp.pickle', 'wb') as pickle_file:
    pickle.dump(nlp, pickle_file)

In [36]:
from app.src.sentence_summarizer import SentenceSummarizer

# Build a summarizer around the configured pipeline.
summarizer = SentenceSummarizer(nlp)

# Alternative example sentence:
#sentence = "Seller shall deliver the goods before the contract terminates"
sentence = "I will arrive before the payment is made"
# Prints a per-token table for the sentence (see the cell output below).
summarizer.summarize(sentence)



Sentence: I will arrive before the payment is made

i     TEXT            POS             TAG             DEP             LEMMA           HEAD            ENT            
--------------------------------------------------------------------------------------------------------------
0     I               PRON            PRP             nsubj           I               arrive                         
1     will            AUX             MD              aux             will            arrive                         
2     arrive          VERB            VB              ROOT            arrive          arrive                         
3     before          ADP             IN              prep            before          arrive                         
4     the             DET             DT              det             the             payment                        
5     payment         NOUN            NN              pobj            payment         before          DOMAIN_EVENT   
6     is  



In [None]:
# Run the example sentence through the pipeline.
doc = nlp(sentence)

# Optional displaCy styling, kept for reference:
# colors = {'Fruit': "#85C1E9"}
# options = {"ents": ['Fruit'], "colors": colors}
# spacy.displacy.render(doc, style="ent", jupyter=True, options=options)

# Highlight the recognised entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# List every entity as a (text, label) pair.
print([(entity.text, entity.label_) for entity in doc.ents])