## NLP Setup

This notebook sets up the NLP pipeline, including any custom components.

## Init NLP

### Spacy

In [None]:
!python -m pip install -U pydantic spacy

In [None]:
# May need to restart the environment after running this command
!python -m spacy download en_core_web_md

In [None]:
# Benepar (Optional) - for constituency parsing
# Can take 3 minutes
%pip install benepar

In [None]:
import pickle
import spacy
from spacy import displacy

In [None]:
# Load the medium English spaCy model. `nlp` is the pipeline object that the
# following cells extend with optional components and then pickle.
nlp = spacy.load('en_core_web_md')

### Benepar (Optional)

In [None]:
import benepar

# Fetch the constituency-parser model referenced by the pipe config below.
benepar.download('benepar_en3')

# `add_pipe` raises ValueError when a component with the same name is already in
# the pipeline, so guard the call to keep this cell safe to re-run.
if 'benepar' not in nlp.pipe_names:
    nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

### Coreferee (Optional)

In [None]:
# Optional: Takes about 1 minute
!python -m coreferee install en

In [None]:
# Optional
# Importing coreferee makes its 'coreferee' pipeline factory available to spaCy.
import coreferee

# `add_pipe` raises ValueError if the component is already present; the guard
# keeps this cell idempotent across re-runs.
if 'coreferee' not in nlp.pipe_names:
    nlp.add_pipe('coreferee')

### Store

In [None]:
# Show the component names of the pipeline, in order, before it is stored.
print(nlp.pipe_names)

In [None]:
# Run one sample sentence through the assembled pipeline; the resulting `doc`
# is what the coref cells further down inspect.
doc = nlp("The seller shall deliver the order in one delivery to the buyer at its warehouse.")

In [None]:
# Persist the assembled pipeline so it can be reloaded without repeating the setup.
import os

# open(..., 'wb') does not create missing parent directories; make sure the
# target folder exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('./nlp', exist_ok=True)

with open('./nlp/nlp.pickle', 'wb') as f:
    pickle.dump(nlp, f)

## Other Customization (Optional)

#### Coref

In [None]:
# Print the coreference chains found for `doc`.
# `coref_chains` is only available when the optional coreferee pipe was added
# before `doc` was created.
# (The former bare `doc._.coref_chains` line was dropped: it was not the last
# expression of the cell, so its value was never displayed.)
doc._.coref_chains.print()

In [None]:
# Walk every coreference chain: print the chain summary, its most specific
# mention, and then each mention together with the tokens it covers.
for chain in doc._.coref_chains:
    most_specific_idx = chain.most_specific_mention_index
    print('-', chain.index, chain.mentions, most_specific_idx)

    most_specific_mention = chain.mentions[most_specific_idx]
    print('- msm', most_specific_mention.pretty_representation)

    for mention in chain.mentions:
        print('---', mention)

        # A mention can span several tokens; resolve each index against `doc`.
        for token_idx in mention.token_indexes:
            print('-----', doc[token_idx])

#### Domain event tagging

In [None]:
from spacy.matcher import PhraseMatcher, Matcher

# Token-level rule matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# TODO: write tests for these patterns.
# TODO: auto-generate the word lists below from the domain model.
# Use information from the domain model to set up tagging
contract_verbs = ['activate', 'terminate', 'suspend']
contract_verb_nouns = ['activation', 'termination', 'suspension']
contract_event_nouns = ['payment', 'delivery']
contract_event_verbs = ['make', 'complete']

# Open question: separate pattern sets for contract event, ob event, power event, etc?
domain_event_patterns = [
    # "contract" followed by a contract verb, e.g. "contract terminates".
    [
        {"LOWER": {'IN': ['contract']}, "POS": "NOUN"},
        {"POS": "VERB", "LEMMA": {"IN": contract_verbs}},
    ],
    # Event noun + one or more verbs/auxiliaries, optional "not", optional
    # event verb, e.g. "payment is made" / "delivery is not completed".
    [
        {"LOWER": {"IN": contract_event_nouns}, "POS": "NOUN"},
        {"POS": {"IN": ["VERB", "AUX"]}, "OP": "+"},
        {"LOWER": "not", "OP": "?"},
        {"LEMMA": {"IN": contract_event_verbs}, "OP": "?"},
    ],
    # Nominalised contract verb + "of the contract", e.g. "termination of the contract".
    [
        {"POS": "NOUN", "LEMMA": {"IN": contract_verb_nouns}},
        {"LOWER": "of"},
        {"LOWER": "the"},
        {"LOWER": {'IN': ['contract']}, "POS": "NOUN"},
    ],
]

# Register all patterns under a single match key.
matcher.add("DOMAIN_EVENT", domain_event_patterns)

In [None]:
# Quick check of the DOMAIN_EVENT patterns on a short conditional clause.
doc = nlp('if payment is made')
matches = matcher(doc)

# Each match is (match key hash, start token, end token); resolve the hash back
# to its string label and show the matched text.
for match_id, tok_start, tok_end in matches:
    match_label = nlp.vocab.strings[match_id]
    print(match_label, doc[tok_start:tok_end])

In [None]:
# SKIP THIS
from spacy.language import Language
from spacy.tokens import Span, Doc

# Register the custom Doc attribute only once: `set_extension` raises ValueError
# when the name is already registered, which would break a re-run of this cell.
if not Doc.has_extension("domain_components"):
    Doc.set_extension("domain_components", default=[])

# Custom pipeline component that records DOMAIN_EVENT matches on the Doc
@Language.component("domain_component")
def domain_component_function(doc):
    """Store every `matcher` hit on `doc._.domain_components`.

    Each match is wrapped in a Span labelled "DOMAIN_EVENT". `doc.ents` is left
    untouched; only the custom extension attribute is assigned.

    Returns the same `doc` so the pipeline can continue.
    """
    domain_spans = []
    for _match_id, span_start, span_end in matcher(doc):
        domain_spans.append(Span(doc, span_start, span_end, label="DOMAIN_EVENT"))

    doc._.domain_components = domain_spans
    return doc

# Append the component at the end of the pipeline (`last=True`).
# Guarded because `add_pipe` raises ValueError when a component with this name
# is already in the pipeline, e.g. when the cell is run a second time.
if "domain_component" not in nlp.pipe_names:
    nlp.add_pipe("domain_component", last=True)
print(nlp.pipe_names)

In [None]:
# Entity Ruler add patterns
# Reuse the ruler if this cell already ran; `add_pipe` would otherwise raise
# ValueError because an "entity_ruler" component already exists.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")

# Echo each token pattern that is about to be registered.
for pattern in domain_event_patterns:
    print(pattern)

# Register every pattern under the DOMAIN_EVENT label in a single call.
# Note: re-running the cell appends the same patterns to the ruler again.
ruler.add_patterns(
    [{"label": "DOMAIN_EVENT", "pattern": pattern} for pattern in domain_event_patterns]
)

## Test

In [None]:
# Round-trip check: read the pickled pipeline back from disk.
# pickle.load executes arbitrary code from the file, so only load pickles you
# created yourself (this one is written by the "Store" cell of this notebook).
with open('nlp/nlp.pickle', 'rb') as pickle_file:
    new_nlp = pickle.load(pickle_file)

# Display the type of the restored object.
type(new_nlp)

In [None]:
from tests.helpers.test_nlp import TestNLP

# Obtain a pipeline through the TestNLP helper and show its type and component
# names, one per line.
new_nlp = TestNLP.get_nlp()

print(type(new_nlp), new_nlp.pipe_names, sep='\n')

In [None]:
from app.src.nlp.sentence_summarizer import SentenceSummarizer

# Summarize one sample sentence with SentenceSummarizer.
# NOTE(review): the summarizer is built from `nlp` (the pipeline assembled in this
# notebook), not from the `new_nlp` obtained in the cells above - confirm this is intended.
summarizer = SentenceSummarizer(nlp)

sentence = "Seller shall deliver the goods before the contract terminates"
# Alternative sample sentence:
#sentence = "I will arrive before the payment is made"
summarizer.summarize(sentence)
