In [1]:
# Import libraries
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd
# Load the `en_core_web_sm` pipeline; every sentence below is parsed with it.
nlp = spacy.load("en_core_web_sm")

### Let's check our rule on a larger corpus

In [6]:
# Read the paired active/passive sentences from disk and preview the first rows.
active_passive = pd.read_csv(filepath_or_buffer="Datasets/active_passive.csv")
active_passive.head()

Unnamed: 0,Active,Passive
0,He reads a novel.,A novel is read.
1,He does not cook food.,Food is not cooked by him.
2,Does he purchase books?,Are books being purchased by him?
3,They grow plants.,Plants are grown by them.
4,She teaches me.,I am taught by her.


In [7]:
# Show the dataframe's shape as a (rows, columns) tuple.
active_passive.shape

(40, 2)

In [8]:
# Pull the two sentence columns out as separate pandas Series.
active, passive = active_passive['Active'], active_passive['Passive']

### Create the rule

In [9]:
# First attempt: a one-token pattern that fires on any token whose
# dependency label is `nsubjpass`.
matcher = Matcher(nlp.vocab)
passive_rule = [{'DEP': 'nsubjpass'}]
matcher.add('Rule', [passive_rule])

In [10]:
def is_passive(doc, matcher):
    """Report whether `matcher` finds at least one match in `doc`.

    Args:
        doc: The parsed document handed to `matcher`.
        matcher: A callable that takes `doc` and returns a sized collection
            of matches (in this notebook, a spaCy `Matcher` loaded with the
            passive-voice rule).

    Returns:
        True if the matcher returned one or more matches, otherwise False.
    """
    # The comparison already yields a bool, so no if/else is needed.
    return len(matcher(doc)) > 0

### Check rule on active voice sentences

In [12]:
# Count the active-voice sentences that the rule leaves unflagged.
cnt = sum(1 for sent in active if not is_passive(nlp(sent), matcher))
print(cnt)

40


### Check rule on passive voice sentences

In [13]:
# Count the passive-voice sentences that the rule flags.
cnt = sum(1 for sent in passive if is_passive(nlp(sent), matcher))
print(cnt)

38


### Let's troubleshoot

In [14]:
# Keep the parsed docs of passive sentences the rule failed to flag,
# then derive the hit count from how many were not missed.
missed = [parsed for parsed in map(nlp, passive) if not is_passive(parsed, matcher)]
cnt = len(passive) - len(missed)
print(cnt)

38


In [15]:
# First passive sentence that the rule did not flag.
missed[0]

Are books being purchased by him?

In [16]:
# Second passive sentence that the rule did not flag.
missed[1]

Is a table being bought by Ritika?

### Let's visualize their dependency trees

In [18]:
# Draw one dependency-parse diagram per sentence the rule missed,
# so the labels on each token can be inspected.
for missed_doc in missed:
    displacy.render(missed_doc, style='dep')

In [19]:
# Ask spaCy for the human-readable meaning of the `auxpass` dependency label.
spacy.explain('auxpass')

'auxiliary (passive)'

[Dependencies](https://universaldependencies.org/docs/en/dep/)

### Update our rule
[Reference](https://spacy.io/usage/rule-based-matching)

In [25]:
# Broaden the rule: a token now matches if its dependency label is any of
# the passive-related labels listed here, not just `nsubjpass`.
# Reference doc: "https://universaldependencies.org/docs/en/dep/"
passive_deps = ['nsubjpass', 'auxpass', 'csubjpass']
matcher = Matcher(nlp.vocab)
passive_rule = [{'DEP': {'IN': passive_deps}}]
matcher.add('Rule', [passive_rule])

In [23]:
# Re-check the active sentences: count those the current rule leaves unflagged.
cnt = sum(1 for sent in active if not is_passive(nlp(sent), matcher))
print(cnt)

40


In [24]:
# Re-check the passive sentences: count those the current rule flags.
cnt = sum(1 for sent in passive if is_passive(nlp(sent), matcher))
print(cnt)

40


## Summary
 - Always test your rules and heuristics on a larger corpus to see how effective they are
 - One can write intricate matching rules using spaCy's `Matcher` object