In [39]:
import spacy 
import pandas as pd
import re
import string
import nltk
import spacy
import numpy as np
import math
from tqdm import tqdm

from spacy.matcher import Matcher 
from spacy.tokens import Span
from spacy import displacy

pd.set_option('display.max_colwidth', 200)

In [40]:
# ! pip install spacy

In [41]:
# !python -m spacy download en_core_web_sm

### Rule Based Matcher:

Extract hypernym–hyponym pairs from text using rule-based (Hearst-style) patterns.

In [42]:
# load Spacy model
nlp= spacy.load('en_core_web_sm')

In [43]:
# mine information from text based on these Hearst Patterns.

# sample text 
text = "GDP in developing countries such as Vietnam will continue growing at a high rate." 

# create a spaCy object
doc= nlp(text)

In [44]:
doc

GDP in developing countries such as Vietnam will continue growing at a high rate.

#### Expected Pattern: X such as Y

In [45]:
# understand its syntactic structure – things like the subject, object, modifiers, and 
# parts-of-speech (POS) in the sentence.


# print token, dependency, POS tag 
for tok in doc:
    print(tok.text, '-->', tok.dep_, '-->', tok.pos_)

GDP --> nsubj --> NOUN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> ADP
Vietnam --> pobj --> PROPN
will --> aux --> AUX
continue --> ROOT --> VERB
growing --> xcomp --> VERB
at --> prep --> ADP
a --> det --> DET
high --> amod --> ADJ
rate --> pobj --> NOUN
. --> punct --> PUNCT


In [46]:
# define the pattern 
pattern= [{'POS':'NOUN'},{'LOWER':'such'},{'LOWER':'as'},{'POS':'PROPN'}]   # Proper noun

In [47]:
pattern

[{'POS': 'NOUN'}, {'LOWER': 'such'}, {'LOWER': 'as'}, {'POS': 'PROPN'}]

In [48]:
# Matcher class object
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
matcher.add('matching_1', patterns=[pattern])

In [49]:
matches= matcher(doc)

In [50]:
matches

[(11840699188806025751, 3, 7)]

In [54]:
span= doc[matches[0][1]:matches[0][2]]
print(span.text)

countries such as Vietnam


However, if we could get “developing countries” instead of just “countries”, then the output would make more sense.

In [55]:
# Let's modify our pattern a bit so that an adjectival modifier in front of the
# noun is captured as well. 'OP': '?' makes that 'amod' token optional: it may
# occur once or not at all.
pattern = [{'DEP': 'amod', 'OP': '?'},
           {'POS':'NOUN'},
           {'LOWER':'such'},
           {'LOWER':'as'},
           {'POS':'PROPN'}]   # Proper noun


In [56]:
matcher= Matcher(nlp.vocab)
matcher.add('matching_1', patterns= [pattern])

In [57]:
matches= matcher(doc)
span= doc[matches[0][1]:matches[0][2]]

In [58]:
span

developing countries such as Vietnam

### Let's create some more Hearst patterns


In [59]:
doc= nlp("Here is how you can keep your car and other vehicles clean.")
doc

Here is how you can keep your car and other vehicles clean.

In [60]:
for tok in doc:
    print(tok.text, "-->", tok.dep_, '-->', tok.pos_)

Here --> advmod --> ADV
is --> ROOT --> AUX
how --> advmod --> SCONJ
you --> nsubj --> PRON
can --> aux --> AUX
keep --> ccomp --> VERB
your --> poss --> PRON
car --> dobj --> NOUN
and --> cc --> CCONJ
other --> amod --> ADJ
vehicles --> conj --> NOUN
clean --> oprd --> ADJ
. --> punct --> PUNCT


#### Pattern: X and/or other Y

In [61]:
# "X and/or other Y": an optional adjectival modifier, a noun, an optional
# "and" / "or", the literal word "other", and a second noun.
pattern = [{'DEP': 'amod', 'OP': '?'},   # 'amod' is spaCy's adjectival-modifier label
           {'POS': 'NOUN'},
           {'LOWER': 'and', 'OP': '?'},
           {'LOWER': 'or', 'OP': '?'},
           {'LOWER': 'other'},
           {'POS': 'NOUN'}]

In [62]:
matcher= Matcher(nlp.vocab)
matcher.add('matching_1', patterns= [pattern])

In [65]:
matches= matcher(doc)
span= doc[matches[0][1]:matches[0][2]]

In [66]:
span

car and other vehicles

In [67]:
# replaced 'and' with 'or'
doc = nlp("Here is how you can keep your car or other vehicles clean.")

In [68]:
matches= matcher(doc)
span= doc[matches[0][1]:matches[0][2]]

In [69]:
span

car or other vehicles

#### Pattern: X, including Y

In [70]:
doc = nlp("Eight people, including two children, were injured in the explosion") 

for tok in doc:
    print(tok.text, '-->', tok.dep_, '-->', tok.pos_)

Eight --> nummod --> NUM
people --> nsubjpass --> NOUN
, --> punct --> PUNCT
including --> prep --> VERB
two --> nummod --> NUM
children --> pobj --> NOUN
, --> punct --> PUNCT
were --> auxpass --> AUX
injured --> ROOT --> VERB
in --> prep --> ADP
the --> det --> DET
explosion --> pobj --> NOUN


In [76]:
# "X, including Y": both noun phrases may carry an optional numeric and/or
# adjectival modifier.
pattern = [{'DEP': 'nummod', 'OP': '?'},  # numeric modifier
           {'DEP': 'amod', 'OP': '?'},    # adjectival modifier
           {'POS': 'NOUN'},
           {'IS_PUNCT': True},
           {'LOWER': 'including'},
           {'DEP': 'nummod', 'OP': '?'},
           {'DEP': 'amod', 'OP': '?'},
           {'POS': 'NOUN'}]

# Build a fresh Matcher instead of reusing the one from the previous section:
# adding patterns under an existing key extends that key's pattern list, so
# the earlier "and/or other" rule would otherwise remain active here.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", patterns=[pattern])

matches = matcher(doc)
span = doc[matches[0][1]:matches[0][2]]
print(span.text)

Eight people, including two children


#### Subtree Matching for Relation Extraction

In [77]:
text = "Tableau was recently acquired by Salesforce." 

# Plot the dependency graph 
doc = nlp(text) 
displacy.render(doc, style='dep',jupyter=True)

In [78]:
text = "Tableau was recently acquired by Salesforce." 
doc = nlp(text) 

for tok in doc: 
    print(tok.text,"-->",tok.dep_,"-->",tok.pos_)

Tableau --> nsubjpass --> PROPN
was --> auxpass --> AUX
recently --> advmod --> ADV
acquired --> ROOT --> VERB
by --> agent --> ADP
Salesforce --> pobj --> PROPN
. --> punct --> PUNCT


In [79]:
def subtree_matcher(doc):
    """Pick out the object and the passive subject of a parsed sentence.

    Iterates over the tokens of ``doc`` and inspects each token's
    dependency label (``tok.dep_``).

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``dep_`` (dependency label string) and
        ``text`` attributes, e.g. a spaCy ``Doc``.

    Returns
    -------
    tuple of (str, str)
        ``(x, y)`` where ``x`` is the text of the last token whose label
        contains ``'obj'`` and ``y`` is the text of the last token whose
        label contains ``'subjpass'``. Either is ``''`` when no such token
        is found.
    """
    x = ''
    y = ''

    # iterate through all the tokens in the input sentence
    for tok in doc:

        # extract the passive subject (e.g. 'nsubjpass').
        # Use a substring test: `str.find(...) == True` only holds when the
        # match starts at index 1 and is not a real containment check.
        if 'subjpass' in tok.dep_:
            y = tok.text

        # extract the object (e.g. 'dobj', 'pobj', 'obj')
        if 'obj' in tok.dep_:
            x = tok.text

    return x, y

In [80]:
text_2 = "Careem, a ride hailing major in middle east, was acquired by Uber." 

doc_2 = nlp(text_2) 
subtree_matcher(doc_2)

('Uber', 'Careem')