In [1]:
import os
import pickle
import re
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from tqdm import tqdm

In [156]:
def getTime():
    """Return the current UTC date as a compact ``YYYYMMDD`` string."""
    utc_now = time.gmtime()
    return time.strftime("%Y%m%d", utc_now)

# Environment check: print the date stamp and the current working directory.
print('{}\n{}'.format(getTime(), os.getcwd()))

20190109
C:\Github\training_2\NLP\03_NER


# NER and Information Extraction (IE) using SpaCy
[Here is a reading list of subjects](https://pubweb.eng.utah.edu/~cs6390/schedule.html)

[Sample 20 News Group data](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html)

In [161]:
# # Train & Test data
# from sklearn.datasets import fetch_20newsgroups

# twenty_train = fetch_20newsgroups(subset='train',
#                                   remove=('headers', 'footers', 'quotes'),
#                                   shuffle=False)
# twenty_test = fetch_20newsgroups(subset='test', shuffle=False)
# twenty_train.keys()

# Named Entity Recognition & Extraction
 - [Basic walkthrough SPACY](https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da)
 - https://spacy.io/usage/linguistic-features#section-named-entities
 - https://spacy.io/api/annotation#section-named-entities
 - https://spacy.io/models/en
 - [SpaCy training models](https://spacy.io/usage/training)

In [162]:
import spacy
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [163]:
# doc = re.sub('^[a-zA-Z0-9]+','',twenty_train['data'][15]).replace('\n',' ').replace('\t',' ').replace('_','')
# doc = nlp(doc)
# displacy.render(doc, jupyter=True, style='ent')

In [164]:
doc = """Departures at Heathrow were temporarily stopped after a drone was reported to have been sighted. Flights from the west London airport resumed about an hour after the Police said a drone had been seen. A Heathrow spokeswoman had said it was a "precautionary measure" to "prevent any threat to operational safety". It comes after last month's disruption at Gatwick Airport which saw thousands of people stranded when drones were sighted. The spokeswoman said: "We continue to work closely with the Met Police to respond to reports of drones at Heathrow. "Based on standard operating procedures, working with Air Traffic Control and the Met Police, we have resumed departures out of Heathrow following a short suspension. We continue to monitor this situation and apologise to any passengers that were affected by this disruption." """
doc = nlp(doc)
displacy.render(doc, jupyter=True, style='ent')

### Extracting Document [Named Entities](https://spacy.io/usage/linguistic-features#named-entities)
 - [Kaggle example](https://www.kaggle.com/hubert0527/spacy-name-entity-recognition)
 - [Train your own NER](https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175)
 - [Train your own NER 2](https://stackoverflow.com/questions/50580262/how-to-use-spacy-to-create-a-new-entity-and-learn-only-from-keyword-list)

In [165]:
for ent in doc.ents:
    print('ID:{}\t{}\t"{}"\t'.format(ent.label,ent.label_,ent.text,))
displacy.render(doc, jupyter=True, style='ent')

ID:9191306739292312949	FAC	"Heathrow"	
ID:382	GPE	"London"	
ID:389	TIME	"about an hour"	
ID:381	ORG	"Police"	
ID:381	ORG	"Heathrow"	
ID:388	DATE	"last month's"	
ID:9191306739292312949	FAC	"Gatwick Airport"	
ID:394	CARDINAL	"thousands"	
ID:381	ORG	"the Met Police"	
ID:9191306739292312949	FAC	"Heathrow"	
ID:381	ORG	"Air Traffic Control"	
ID:381	ORG	"the Met Police"	
ID:9191306739292312949	FAC	"Heathrow"	


In [166]:
def getENTS(nlpObject):
    """Group the named entities of a parsed document by entity label.

    Parameters
    ----------
    nlpObject : object exposing ``.ents``
        ``.ents`` must be an iterable of spans that each carry ``label_``
        (the label name) and ``text`` (the surface string).

    Returns
    -------
    dict
        Maps each label name to the list of entity texts carrying that label,
        in document order. Labels appear in order of first occurrence.
        An input without entities yields an empty dict.
    """
    ents = dict()
    # Single pass: the previous version rescanned every entity once per label
    # and iterated a set, which made the key order arbitrary between runs.
    for ent in nlpObject.ents:
        ents.setdefault(ent.label_, []).append(ent.text)
    return ents

In [167]:
getENTS(doc)

{'CARDINAL': ['thousands'],
 'DATE': ["last month's"],
 'FAC': ['Heathrow', 'Gatwick Airport', 'Heathrow', 'Heathrow'],
 'GPE': ['London'],
 'ORG': ['Police',
  'Heathrow',
  'the Met Police',
  'Air Traffic Control',
  'the Met Police'],
 'TIME': ['about an hour']}

# Information Extraction

### [Navigate parse tree](https://spacy.io/usage/linguistic-features#navigating)

In [168]:
from spacy.symbols import nsubj, VERB
# Bottom-up search: for every token that is a nominal subject of a verb,
# keep that verb (the token's head). A set removes repeated heads.
verbs = {tok.head for tok in doc if tok.dep == nsubj and tok.head.pos == VERB}
print(verbs)

{said, was, said, saw, continue, said, comes, resumed, resumed, continue, respond}


In [66]:
# Top-down alternative (less good): start from each verb and check whether
# any of its direct children is a nominal subject.
verbs = [
    tok for tok in doc
    if tok.pos == VERB and any(child.dep == nsubj for child in tok.children)
]
print(verbs)

[resumed, said, said, was, comes, saw, said, continue, respond, resumed, continue]


### [Noun Chunks or Phrases](https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/)
Dependency trees can also be used to generate noun phrases:

In [761]:
for chunk in doc.noun_chunks:
    print(chunk.root.text,chunk.root.dep_,chunk.root.head.text)

Flights nsubjpass stopped
Heathrow pobj at
Drones pobj by
runners ROOT runners
park pobj in
It nsubj is
Londons attr is
airport dobj busiest


# Information Extraction

## [Singular Entity relations](https://github.com/explosion/spaCy/blob/master/examples/information_extraction/entity_relations.py)

In [762]:
def extract_relations(doc,ent_type):
    """Pair every token of entity type `ent_type` with a related token.

    NOTE: every entity and noun chunk of `doc` is merged via ``span.merge()``
    first, so the `doc` passed in is modified by this call.

    Parameters
    ----------
    doc : parsed document exposing ``.ents``, ``.noun_chunks`` and token iteration
    ent_type : str
        Entity label to look for (compared against each token's ``ent_type_``).

    Returns
    -------
    list of (token, token)
        * for an ``attr`` / ``dobj`` token: (first ``nsubj`` to the left of its head, token);
          nothing is recorded when the head has no such subject
        * for a ``pobj`` token whose head is a ``prep``: (head of the preposition, token)
    """
    # Both span lists are collected before the first merge happens.
    for span in [*doc.ents, *doc.noun_chunks]:
        span.merge()

    pairs = []
    for token in doc:
        if token.ent_type_ != ent_type:
            continue
        if token.dep_ in ('attr', 'dobj'):
            left_subjects = [w for w in token.head.lefts if w.dep_ == 'nsubj']
            if left_subjects:
                pairs.append((left_subjects[0], token))
        elif token.dep_ == 'pobj' and token.head.dep_ == 'prep':
            pairs.append((token.head.head, token))
    return pairs

In [763]:
# Sample financial text, parsed into `t`.
# NOTE(review): `t` is never used afterwards — both the relation extraction and
# the rendering below operate on `doc`. Confirm which document was intended.
t = """Net income was $9.4 million compared to the prior year of $2.7 million. Revenue exceeded twelve billion dollars, with a loss of $1b."""
t = nlp(t)
# extract_relations merges entities / noun chunks inside `doc`, so `doc` is
# tokenised differently after this call.
print(extract_relations(doc,"FAC"))

displacy.render(doc, jupyter=True, style='ent')

[(Flights, Heathrow)]


### [Parsing a subtree](https://github.com/explosion/spaCy/blob/master/examples/information_extraction/parse_subtrees.py)
To break sentences down into small sub thoughts/chunks.

In [72]:
displacy.render(doc, jupyter=True, style='ent')

In [119]:
# The easiest way is to find the head of the subtree you want, and then use
# the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
# is the one that does what you're asking for most directly:
for word in doc:
    if word.ent_type_ in ('FAC', 'DATE', 'ORG', 'CARDINAL', 'TIME'):
        print(''.join(w.text_with_ws for w in word.subtree))

Heathrow 
about 
an 
about an hour 
the Police 
Heathrow 
last 
last month's 
's 
Gatwick 
Gatwick Airport 
thousands of people stranded when drones were sighted
the 
Met 
the Met Police 
Heathrow
Air 
Traffic 
Air Traffic Control and the Met Police
the 
Met 
the Met Police
Heathrow 


In [122]:
# It'd probably be better for `word.subtree` to return a `Span` object
# instead of a generator over the tokens. If you want the `Span` you can
# get it via the `.right_edge` and `.left_edge` properties. The `Span`
# object is nice because you can easily get a vector, merge it, etc.
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text,)

# You might also want to select a head, and then select a start and end
# position by walking along its children. You could then take the
# `.left_edge` and `.right_edge` of those tokens, and use it to calculate
# a span.

to have been sighted | sighted
a drone had been seen | seen
it was a "precautionary measure" to "prevent any threat to operational safety" | was
We continue to work closely with the Met Police to respond to reports of drones at Heathrow | continue
to work closely with the Met Police to respond to reports of drones at Heathrow | work
to monitor this situation and apologise to any passengers that were affected by this disruption | monitor


## [IE using parse tree](https://stackoverflow.com/questions/40453503/nlp-information-extraction-in-python-spacy)

In [536]:
doc = "Departures at Heathrow were stopped by Drones." #" Departures at Heathrow were temporarily stopped after a Drone was reported to have been sighted."
doc = nlp(doc)
displacy.render(doc, jupyter=True, style='dep')

In [271]:
# Word, POS and type
for w in doc:
    print("'{}'".format(w), w.pos_ , w.dep_)

'Departures' NOUN nsubjpass
'at' ADP prep
'Heathrow' PROPN pobj
'were' VERB auxpass
'stopped' VERB ROOT
'by' ADP agent
'Drones' NOUN pobj
'.' PUNCT punct


In [274]:
# find subjects or entities in text
# Dependency labels treated as subject-like, and POS tags treated as nouns.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
NOUNS = ['NOUN','PROPN']

# Candidate tokens: every noun / proper noun in the document.
# The list is built once; the earlier version rebuilt the identical list for
# every token of `doc` and left a list of label strings ('ORG', 'FAC',
# 'PERSON') in `entities` when `doc` was empty.
# Alternative selections that were tried:
#entities = [w for w in doc if w.ent_type_ == 'FAC' or w.ent_type_ == 'ORG']
#entities = [w for w in doc if w.dep_ == 'nsubj' or w.dep_ == 'nsubjpass'  or w.dep_ == 'pobj']
entities = [w for w in doc if w.pos_ in NOUNS]

entities

[Departures, Heathrow, Drones]

In [276]:
# see dependencies and heads
for ent in entities:
    print('"{}" {}, head="{}"'.format(ent, ent.dep_, ent.head.text))

"Departures" nsubjpass, head="stopped"
"Heathrow" pobj, head="at"
"Drones" pobj, head="by"


In [286]:
# entity / noun subtrees
for ent in entities:
    print([x for x in ent.subtree])
    print([(x,x.dep_,x.head) for x in ent.subtree],end='\n\n')

[Departures, at, Heathrow]
[(Departures, 'nsubjpass', stopped), (at, 'prep', Departures), (Heathrow, 'pobj', at)]

[Heathrow]
[(Heathrow, 'pobj', at)]

[Drones]
[(Drones, 'pobj', by)]



In [287]:
# entity / noun subtrees
for ent in entities:
    for x in ent.subtree:
        print((x,x.dep_,x.head,x.head.dep_))
        if x.dep_ in ('xcomp', 'ccomp'):
            print('Whoa',x)

(Departures, 'nsubjpass', stopped, 'ROOT')
(at, 'prep', Departures, 'nsubjpass')
(Heathrow, 'pobj', at, 'prep')
(Heathrow, 'pobj', at, 'prep')
(Drones, 'pobj', by, 'agent')


## [find Subject, Verb, Objects](https://github.com/NSchrading/intro-spacy-nlp/blob/master/subject_object_extraction.py)

In [764]:
# 1. find verbs
# Number every non-auxiliary verb token; each record starts with just the
# verb and is enriched with related tokens in the following cells.
verbs = {
    idx: {'verb': tok}
    for idx, tok in enumerate(t for t in doc if t.pos_ == "VERB" and t.dep_ != "aux")
}
verbs

{0: {'verb': were},
 1: {'verb': stopped},
 2: {'verb': Reported runners},
 3: {'verb': is}}

In [765]:
# 2. find subjects or nouns or entities related to these verbs
# They may sit on either side of the verb, so both sides are recorded.
verb_subjects = []
for verb in verbs.values():
    for side, children in (('subj_left', verb['verb'].lefts),
                           ('subj_right', verb['verb'].rights)):
        # keep children that are subject-like by dependency or noun-like by POS
        verb[side] = [x for x in children if x.dep_ in SUBJECTS or x.pos_ in NOUNS]
verbs

{0: {'subj_left': [], 'subj_right': [], 'verb': were},
 1: {'subj_left': [Flights], 'subj_right': [by], 'verb': stopped},
 2: {'subj_left': [], 'subj_right': [], 'verb': Reported runners},
 3: {'subj_left': [It], 'subj_right': [Londons], 'verb': is}}

In [766]:
def _append_first_related(target, candidates):
    """Append to `target` the first candidate that is subject-like (dep_ in
    SUBJECTS) or noun-like (pos_ in NOUNS); do nothing when there is none."""
    match = next((x for x in candidates if x.dep_ in SUBJECTS or x.pos_ in NOUNS), None)
    if match is not None:
        target.append(match)

# 3. widen each side by one hop: for every (left, right) pair already attached
# to a verb, add the first related child found to the right and to the left of
# each member. The "no related child" case is handled explicitly instead of a
# bare `except: pass`, so genuine errors are no longer hidden.
for k,verb in verbs.items():
    # zip walks the live lists, so tokens appended here are themselves visited
    # for as long as both sides still have unvisited entries.
    for sl,sr in zip(verb['subj_left'],verb['subj_right']):
        _append_first_related(verb['subj_right'], sr.rights)
        _append_first_related(verb['subj_right'], sr.lefts)
        _append_first_related(verb['subj_left'], sl.rights)
        _append_first_related(verb['subj_left'], sl.lefts)

In [767]:
verbs

{0: {'subj_left': [], 'subj_right': [], 'verb': were},
 1: {'subj_left': [Flights], 'subj_right': [by, Drones], 'verb': stopped},
 2: {'subj_left': [], 'subj_right': [], 'verb': Reported runners},
 3: {'subj_left': [It], 'subj_right': [Londons], 'verb': is}}

Great, but it doesn't focus on nouns or objects.

I want to extract triples, like:
 - Departures at Heathrow.

## What about Noun or Entity triplets?

 - 1st order: Departures > at > Heathrow.
 - 2nd order: Drones > stopped > Departures.
 - 3rd order: Drones > at > Heathrow.

In [776]:
doc = "Flights at Heathrow were stopped by Drones. Reported runners in the big park. It is London busiest airport"#" Heathrow is London busiest airport"
doc = nlp(doc)
displacy.render(doc,  jupyter=True, style='ent')

In [777]:
displacy.render(doc, jupyter=True, style='dep')

In [778]:
# # Word, POS and type
# for w in doc:
#     print("'{}'".format(w), w.pos_ , w.dep_)

In [779]:
NOUNS = ['NOUN','PROPN']
# Every noun / proper-noun token of the document, in document order.
# Built once: the earlier version rebuilt the same list for every token and
# left `subjects` undefined when `doc` was empty.
subjects = [w for w in doc if w.pos_ in NOUNS]
subjects

[Flights, Heathrow, Drones, runners, park, London, airport]

In [780]:
def tokenDeps(token):
    """Return the direct dependents of `token` as one list: those on its left
    first, followed by those on its right."""
    return [*token.lefts, *token.rights]

In [781]:
# noun to noun phrases
# For every noun, look one hop out to a dependent (e.g. a preposition) and one
# more hop to that dependent's own dependents; print (noun, link, first noun found).
for subj in subjects:
    for dep in tokenDeps(subj):
        moreSubjects = [x for x in tokenDeps(dep) if x.pos_ in NOUNS]
        if moreSubjects:
            print((subj,dep,moreSubjects[0]))

(Flights, at, Heathrow)
(runners, in, park)


In [789]:
# It'd probably be better for `word.subtree` to return a `Span` object
# instead of a generator over the tokens. If you want the `Span` you can
# get it via the `.right_edge` and `.left_edge` properties. The `Span`
# object is nice because you can easily get a vector, merge it, etc.
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text,)

# You might also want to select a head, and then select a start and end
# position by walking along its children. You could then take the
# `.left_edge` and `.right_edge` of those tokens, and use it to calculate
# a span.

London busiest airport | busiest


In [788]:
for chunk in doc.noun_chunks:
    print(chunk.root.text,chunk.root.head.text, chunk.root.dep_,)

Flights stopped nsubjpass
Heathrow at pobj
Drones by pobj
It is nsubj
airport busiest dobj



## [SVO package](https://nicschrading.com/project/Intro-to-NLP-with-spaCy/)
[github](https://github.com/NSchrading/intro-spacy-nlp/blob/master/subject_object_extraction.py)

In [718]:
#from nltk.stem.wordnet import WordNetLemmatizer
#from spacy.en import English

SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

def getSubsFromConjunctions(subs):
    """Collect additional subjects linked to `subs` through the word "and".

    For each subject that has "and" among its right-hand children, the
    right-hand children that are subject-like (``dep_`` in SUBJECTS) or tagged
    NOUN are added, and the search then recurses over everything gathered so far.

    Returns a new list; `subs` itself is not modified.
    """
    found = []
    for subject in subs:
        # `.rights` is a generator, so materialise it before scanning it twice
        right_children = list(subject.rights)
        if not any(child.lower_ == "and" for child in right_children):
            continue
        found += [child for child in right_children
                  if child.dep_ in SUBJECTS or child.pos_ == "NOUN"]
        if found:
            found += getSubsFromConjunctions(found)
    return found

def getObjsFromConjunctions(objs):
    """Collect additional objects linked to `objs` through the word "and".

    For each object that has "and" among its right-hand children, the
    right-hand children that are object-like (``dep_`` in OBJECTS) or tagged
    NOUN are added, and the search then recurses over everything gathered so far.

    Returns a new list; `objs` itself is not modified.
    """
    found = []
    for current in objs:
        # `.rights` is a generator, so materialise it before scanning it twice
        right_children = list(current.rights)
        if not any(child.lower_ == "and" for child in right_children):
            continue
        found += [child for child in right_children
                  if child.dep_ in OBJECTS or child.pos_ == "NOUN"]
        if found:
            found += getObjsFromConjunctions(found)
    return found

def getVerbsFromConjunctions(verbs):
    """Collect additional verbs linked to `verbs` through the word "and".

    For each verb that has "and" among its right-hand children, the right-hand
    children tagged VERB are added, and the search then recurses over
    everything gathered so far.

    Returns a new list; `verbs` itself is not modified.
    """
    found = []
    for current in verbs:
        right_children = list(current.rights)
        if not any(child.lower_ == "and" for child in right_children):
            continue
        found += [child for child in right_children if child.pos_ == "VERB"]
        if found:
            found += getVerbsFromConjunctions(found)
    return found

def findSubs(tok):
    """Find subjects for `tok` by climbing the dependency tree.

    Head links are followed from `tok` until a VERB, a NOUN, or the root
    (a token that is its own head) is reached.

    * VERB reached: its left children whose ``dep_`` is in SUBJECTS are
      returned (extended with "and"-conjoined subjects) together with that
      verb's negation flag. When it has none and is not the root, the search
      continues upward from that verb.
    * NOUN reached: that noun is returned as the only subject, together with
      the negation flag of `tok`.

    Returns
    -------
    (list, bool)
        The subjects found and whether the governing token is negated;
        ``([], False)`` when nothing is found.
    """
    head = tok.head
    while head.pos_ not in ("VERB", "NOUN") and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        # Previously compared against the literal "SUB", which is none of the
        # subject labels this file uses (see SUBJECTS), so this branch never
        # found a subject. Use the shared SUBJECTS list instead.
        subs = [child for child in head.lefts if child.dep_ in SUBJECTS]
        if subs:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        if head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    """Return True when any direct child of `tok` (left or right) has a
    ``lower_`` form that is one of the negation words below."""
    negations = {"no", "not", "n't", "never", "none"}
    children = [*tok.lefts, *tok.rights]
    return any(child.lower_ in negations for child in children)

def findSVs(tokens):
    """Return (subject, verb) text pairs for every VERB token in `tokens`.

    Texts come from ``orth_``. The verb text is prefixed with "!" when
    getAllSubs reports the verb as negated. Verbs without any subject
    contribute nothing.
    """
    pairs = []
    verb_tokens = [t for t in tokens if t.pos_ == "VERB"]
    for verb in verb_tokens:
        subjects, negated = getAllSubs(verb)
        if not subjects:
            continue
        for subject in subjects:
            verb_text = "!" + verb.orth_ if negated else verb.orth_
            pairs.append((subject.orth_, verb_text))
    return pairs

def getObjsFromPrepositions(deps):
    """Collect objects hanging off the prepositions found in `deps`.

    A token counts as a preposition when it is tagged ADP with dependency
    ``prep``. From its right-hand children, those whose ``dep_`` is in
    OBJECTS — or that are the pronoun "me" — are returned.
    """
    collected = []
    for prep in deps:
        if prep.pos_ != "ADP" or prep.dep_ != "prep":
            continue
        for child in prep.rights:
            if child.dep_ in OBJECTS or (child.pos_ == "PRON" and child.lower_ == "me"):
                collected.append(child)
    return collected

def getObjsFromAttrs(deps):
    """Look for a verb, attached to the right of a NOUN ``attr`` token in
    `deps`, that has objects of its own.

    Returns
    -------
    (verb, objs)
        The first such verb and its non-empty object list (direct objects plus
        objects reached through prepositions), or ``(None, None)``.
    """
    for attr in deps:
        if not (attr.pos_ == "NOUN" and attr.dep_ == "attr"):
            continue
        for verb in [child for child in attr.rights if child.pos_ == "VERB"]:
            verb_rights = list(verb.rights)
            objs = [child for child in verb_rights if child.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(verb_rights))
            if objs:
                return verb, objs
    return None, None

def getObjFromXComp(deps):
    """Look in `deps` for a VERB with dependency ``xcomp`` that has objects.

    Returns
    -------
    (verb, objs)
        The first such verb and its non-empty object list (direct objects plus
        objects reached through prepositions), or ``(None, None)``.
    """
    for candidate in deps:
        if candidate.pos_ != "VERB" or candidate.dep_ != "xcomp":
            continue
        verb_rights = list(candidate.rights)
        objs = [child for child in verb_rights if child.dep_ in OBJECTS]
        objs.extend(getObjsFromPrepositions(verb_rights))
        if objs:
            return candidate, objs
    return None, None

def getAllSubs(v):
    """Return ``(subjects, negated)`` for the verb `v`.

    Subjects are first looked for among the verb's left children (``dep_`` in
    SUBJECTS, excluding determiners) and extended with "and"-conjoined ones;
    the flag is then the verb's own negation. When there are none, both the
    subjects and the flag come from findSubs(v).
    """
    negated = isNegated(v)
    direct = [child for child in v.lefts
              if child.dep_ in SUBJECTS and child.pos_ != "DET"]
    if direct:
        direct.extend(getSubsFromConjunctions(direct))
        return direct, negated
    inherited, negated = findSubs(v)
    return list(inherited), negated

def getAllObjs(v):
    """Return ``(verb, objects)`` for the verb `v`.

    Objects are the verb's right children with ``dep_`` in OBJECTS, plus
    objects reached through prepositions. When an ``xcomp`` verb with objects
    hangs off `v`, its objects are added and that verb replaces `v` in the
    result. "and"-conjoined objects are appended last.
    """
    # `.rights` is a generator; keep a list so it can be scanned repeatedly
    right_children = list(v.rights)
    objs = [child for child in right_children if child.dep_ in OBJECTS]
    objs += getObjsFromPrepositions(right_children)

    # Disabled: objects reached through attribute nouns
    #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights)
    #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
    #    objs.extend(potentialNewObjs)
    #    v = potentialNewVerb

    xcomp_verb, xcomp_objs = getObjFromXComp(right_children)
    if xcomp_verb is not None and xcomp_objs:
        objs.extend(xcomp_objs)
        v = xcomp_verb
    if objs:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    """Return lower-cased (subject, verb, object) triples found in `tokens`.

    Every VERB token whose dependency is not ``aux`` is examined. Verbs
    without subjects are skipped. The verb text is prefixed with "!" when
    either the verb or the object is negated.
    """
    triples = []
    main_verbs = [t for t in tokens if t.pos_ == "VERB" and t.dep_ != "aux"]
    for verb in main_verbs:
        subjects, verb_negated = getAllSubs(verb)
        if not subjects:
            # no subject, nothing to report for this verb
            continue
        # getAllObjs may hand back a different verb (an xcomp) than it was given
        verb, objects = getAllObjs(verb)
        for subject in subjects:
            for obj in objects:
                obj_negated = isNegated(obj)
                negated = verb_negated or obj_negated
                verb_text = "!" + verb.lower_ if negated else verb.lower_
                triples.append((subject.lower_, verb_text, obj.lower_))
    return triples

def getAbuserOntoVictimSVOs(tokens):
    """Return the lemmatized (subject, verb, object) triples of `tokens` whose
    subject is in one of the abuser word sets and whose object is in the
    victim word set.

    Triples come from findSVOs. Subject and object are lemmatized with the
    lemmatizer's default settings, the verb with the 'v' argument; a leading
    "!" negation marker is kept in front of the lemma.
    """
    maleAbuser = {'he', 'boyfriend', 'bf', 'father', 'dad', 'husband', 'brother', 'man'}
    femaleAbuser = {'she', 'girlfriend', 'gf', 'mother', 'mom', 'wife', 'sister', 'woman'}
    neutralAbuser = {'pastor', 'abuser', 'offender', 'ex', 'x', 'lover', 'church', 'they'}
    victim = {'me', 'sister', 'brother', 'child', 'kid', 'baby', 'friend', 'her', 'him', 'man', 'woman'}
    # Built once; it was previously recomputed for every triple.
    abusers = maleAbuser | femaleAbuser | neutralAbuser

    svos = findSVOs(tokens)
    # NOTE(review): WordNetLemmatizer has no active import in this notebook
    # (the nltk import at the top of this cell is commented out), so this line
    # raises NameError until that import is restored.
    wnl = WordNetLemmatizer()
    passed = []
    for s, v, o in svos:
        s = wnl.lemmatize(s)
        v = "!" + wnl.lemmatize(v[1:], 'v') if v[0] == "!" else wnl.lemmatize(v, 'v')
        o = "!" + wnl.lemmatize(o[1:]) if o[0] == "!" else wnl.lemmatize(o)
        if s in abusers and o in victim:
            passed.append((s, v, o))
    return passed

def printDeps(toks):
    """Print one line per token: its text, dependency label, POS tag, head
    text, and the texts of its left and right children."""
    for token in toks:
        left_texts = [child.orth_ for child in token.lefts]
        right_texts = [child.orth_ for child in token.rights]
        print(token.orth_, token.dep_, token.pos_, token.head.orth_, left_texts, right_texts)


# Entity 2 Entity Relations
# [Train your own entity on NER](https://spacy.io/usage/training#ner)
 - https://github.com/explosion/spaCy/issues/2183
 - [Train your own recipe](https://support.prodi.gy/t/is-there-any-recipes-to-train-a-relation-extraction-model/182/2)
 - https://github.com/explosion/spaCy/blob/master/examples/training/train_intent_parser.py