In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

# **Statement 1**

In [2]:
doc = nlp(u"The Council shall consist of twenty-one members appointed by the Governor as follows: the Commissioner of Agriculture and Markets, the Commissioner of Health, the Commissioner of the Office of Temporary Disability Assistance, the Commissioner of Education, the Commissioner of the Department of Economic Development and the Director of the Office for the Aging")

In [3]:
print([w.text for w in doc])

['The', 'Council', 'shall', 'consist', 'of', 'twenty', '-', 'one', 'members', 'appointed', 'by', 'the', 'Governor', 'as', 'follows', ':', 'the', 'Commissioner', 'of', 'Agriculture', 'and', 'Markets', ',', 'the', 'Commissioner', 'of', 'Health', ',', 'the', 'Commissioner', 'of', 'the', 'Office', 'of', 'Temporary', 'Disability', 'Assistance', ',', 'the', 'Commissioner', 'of', 'Education', ',', 'the', 'Commissioner', 'of', 'the', 'Department', 'of', 'Economic', 'Development', 'and', 'the', 'Director', 'of', 'the', 'Office', 'for', 'the', 'Aging']


In [4]:
for token in doc:
    print(token.text, token.lemma_)

The the
Council Council
shall shall
consist consist
of of
twenty twenty
- -
one one
members member
appointed appoint
by by
the the
Governor Governor
as as
follows follow
: :
the the
Commissioner Commissioner
of of
Agriculture Agriculture
and and
Markets Markets
, ,
the the
Commissioner Commissioner
of of
Health Health
, ,
the the
Commissioner Commissioner
of of
the the
Office Office
of of
Temporary Temporary
Disability Disability
Assistance Assistance
, ,
the the
Commissioner Commissioner
of of
Education Education
, ,
the the
Commissioner Commissioner
of of
the the
Department Department
of of
Economic Economic
Development Development
and and
the the
Director Director
of of
the the
Office Office
for for
the the
Aging aging


In [5]:
import pandas as pd
pst = pd.DataFrame([[w.text,w.tag_,w.pos_, w.dep_] for w in  doc])
pst.columns = ['Text', "Part_of_Speech", "Speech", "Dependency"]
pst

Unnamed: 0,Text,Part_of_Speech,Speech,Dependency
0,The,DT,DET,det
1,Council,NNP,PROPN,nsubj
2,shall,MD,VERB,aux
3,consist,VB,VERB,ROOT
4,of,IN,ADP,prep
5,twenty,CD,NUM,compound
6,-,HYPH,PUNCT,punct
7,one,CD,NUM,nummod
8,members,NNS,NOUN,pobj
9,appointed,VBN,VERB,acl


In [6]:
pst_dep_token = pd.DataFrame([w.head.text,w.dep_, w.text] for w in doc)
pst_dep_token.columns = ['Head', "Dependency", "Token"]
pst_dep_token

Unnamed: 0,Head,Dependency,Token
0,Council,det,The
1,consist,nsubj,Council
2,consist,aux,shall
3,consist,ROOT,consist
4,consist,prep,of
5,one,compound,twenty
6,one,punct,-
7,members,nummod,one
8,of,pobj,members
9,members,acl,appointed


In [7]:
head_token_list = []
for w in doc:
  if w.head.text not in head_token_list:
    head_token_list.append(w.head.text)
  else:
    continue

In [8]:
head_token_list

['Council',
 'consist',
 'one',
 'members',
 'of',
 'appointed',
 'Governor',
 'by',
 'follows',
 'Commissioner',
 'Agriculture',
 'Office',
 'Assistance',
 'Department',
 'Development',
 'Director',
 'Aging',
 'for']

In [9]:
from spacy import displacy

displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

#### Extracting the Root of the Sentence

##### *Parsing the Tree*

In [10]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

The det Council PROPN []
Council nsubj consist VERB [The]
shall aux consist VERB []
consist ROOT consist VERB [Council, shall, of]
of prep consist VERB [members]
twenty compound one NUM []
- punct one NUM []
one nummod members NOUN [twenty, -]
members pobj of ADP [one, appointed, :, Commissioner]
appointed acl members NOUN [by, follows]
by agent appointed VERB [Governor]
the det Governor PROPN []
Governor pobj by ADP [the]
as mark follows VERB []
follows advcl appointed VERB [as]
: punct members NOUN []
the det Commissioner PROPN []
Commissioner appos members NOUN [the, of, ,, Commissioner]
of prep Commissioner PROPN [Agriculture]
Agriculture pobj of ADP [and, Markets]
and cc Agriculture PROPN []
Markets conj Agriculture PROPN []
, punct Commissioner PROPN []
the det Commissioner PROPN []
Commissioner conj Commissioner PROPN [the, of, ,, Commissioner]
of prep Commissioner PROPN [Health]
Health pobj of ADP []
, punct Commissioner PROPN []
the det Commissioner PROPN []
Commissioner conj 

In [11]:
lefts = []
rights = []
for w in doc:
  if w.dep_ == "ROOT":
    i = w.i
    for lc in doc[i].lefts:
      if lc.n_lefts >= 1 :
        for lcc in lc.lefts:
          lefts.append(lcc.text)
        lefts.append(lc.text)
      else:
        lefts.append(lc.text)
    lefts.append(doc[i])
    for rc in doc[i].rights:
      if rc.n_rights >= 1 :
        lefts.append(rc.text)
part_one = " ".join([str(token) for token in lefts])

In [12]:
part_one

'The Council shall consist of'

#### Extracting the Other Parts of Sentences
##### Method 1: Using the noun chunks

In [13]:
noun_chunk_token = pd.DataFrame([chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text] for chunk in doc.noun_chunks)
noun_chunk_token.columns = ["Noun", "Root", "Dependency", "Head of Chunk"]
noun_chunk_token

Unnamed: 0,Noun,Root,Dependency,Head of Chunk
0,The Council,Council,nsubj,consist
1,twenty-one members,members,pobj,of
2,the Governor,Governor,pobj,by
3,the Commissioner,Commissioner,appos,members
4,Agriculture,Agriculture,pobj,of
5,Markets,Markets,conj,Agriculture
6,the Commissioner,Commissioner,conj,Commissioner
7,Health,Health,pobj,of
8,the Commissioner,Commissioner,conj,Commissioner
9,the Office,Office,pobj,of


In [14]:
for chunk in doc.noun_chunks:
  if str(chunk.text) in part_one:
    continue
  else:
    print([left for left in chunk.lefts], chunk,[right for right in chunk.rights] )

[] twenty-one members [appointed, :, Commissioner]
[] the Governor []
[] the Commissioner [of, ,, Commissioner]
[] Agriculture [and, Markets]
[] Markets []
[] the Commissioner [of, ,, Commissioner]
[] Health []
[] the Commissioner [of, ,, Commissioner]
[] the Office [of]
[] Temporary Disability Assistance []
[] the Commissioner [of, ,, Commissioner]
[] Education []
[] the Commissioner [of, and, Director]
[] the Department [of]
[] Economic Development []
[] the Director [of, for]
[] the Office []
[] the Aging []


In [15]:
sentences = []

def inorderTraversal(chunk):
    res = []
    for left in chunk.lefts:
      res = inorderTraversal(left)
    res.append(str(chunk.text))
    for right in chunk.rights:
      if (right.dep_ == "conj") & (right.pos_ == "PROPN") & (right.n_rights >= 1):
        continue
      if (right.dep_ == "appos") & (right.pos_ == "PROPN") & (right.n_rights >= 1):
        continue
      if (right.dep_ == "cc") & (right.pos_ == "CCONJ") & (right.head.dep_ == "conj") & (right.head.pos_ == "PROPN"):
        continue
      if (right.dep_ == "advcl") & (right.pos_ == "VERB"):
        continue
      if (right.dep_ == "PUNCT"):
        continue
      else:
        res = res + inorderTraversal(right)
    return res


for chunk in doc.noun_chunks:
  # The chunks dont have a left tree so there is no left list
  if (chunk.root.dep_ == "conj") & (chunk.root.pos_ =="PROPN") & (chunk.root.n_rights >= 1): # Only selecting the commisoner and directors 
    sentences.append(part_one + " " + " ".join(inorderTraversal(chunk)) )      
  if (chunk.root.dep_ == "appos") & (chunk.root.pos_ =="PROPN") & (chunk.root.n_rights >= 1): # Only selecting the commisoner and directors 
    sentences.append(part_one + " " + " ".join(inorderTraversal(chunk)) )   
  if (chunk.root.dep_ == "pobj") & (chunk.root.pos_ =="NOUN") & (chunk.root.n_rights >= 1): # Only selecting the commisoner and directors 
    sentences.append(part_one + " " + " ".join(inorderTraversal(chunk)) )   

In [16]:
import re
for i in sentences:
  new = re.sub(r'[^a-zA-Z0-9\s-]', '',i)
  print(new)

The Council shall consist of twenty-one members appointed by the Governor 
The Council shall consist of the Commissioner of Agriculture and Markets 
The Council shall consist of the Commissioner of Health 
The Council shall consist of the Commissioner of the Office of Disability Assistance 
The Council shall consist of the Commissioner of Education 
The Council shall consist of the Commissioner of the Department of Economic Development
The Council shall consist of the Director of the Office for the Aging
