In [44]:
import nltk
import os
from nltk import ne_chunk

In [45]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk import RegexpParser as regex_parser

In [46]:
# Ghostscript binary directory. This must be a raw string: in a normal literal
# "\b" is the backspace escape, which silently corrupted the trailing "\bin".
path_to_gs = r"C:\Program Files\gs\gs9.52\bin"

In [47]:
# Make the Ghostscript binaries discoverable for this kernel session.
# Guarded so that re-running the cell does not append the same directory again.
if path_to_gs not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + path_to_gs

## Chunking

### Noun Phrase

In [48]:
# Example sentence used for noun-phrase chunking.
sent = "The little mouse ate the hot fresh cheese"

In [49]:
# Tokenize the sentence, then tag every token with its part of speech,
# giving a list of (word, tag) pairs.
sent_tokens = nltk.pos_tag(word_tokenize(sent))

In [50]:
sent_tokens

[('The', 'DT'),
 ('little', 'JJ'),
 ('mouse', 'NN'),
 ('ate', 'VB'),
 ('the', 'DT'),
 ('hot', 'JJ'),
 ('fresh', 'JJ'),
 ('cheese', 'NN')]

In [51]:
# Noun-phrase chunk rule: an optional DT, any number of JJ tokens, then one NN.
grammer_np = r"NP: {<DT>?<JJ>*<NN>}" # ? -- zero or one, * -- zero or more

In [52]:
# Build a chunk parser from the NP rule (regex_parser is this notebook's alias for nltk.RegexpParser).
chunk_parser = regex_parser(grammer_np)

In [53]:
# Group the tagged tokens into NP chunks; tokens that match no rule stay
# directly under the root S node of the printed tree.
chunk_result = chunk_parser.parse(sent_tokens)
print(chunk_result)

(S
  (NP The/DT little/JJ mouse/NN)
  ate/VB
  (NP the/DT hot/JJ fresh/JJ cheese/NN))


### Verb Phrase

In [11]:
# Verb-phrase chunk rule: an optional PRP, any number of verb tokens
# (VB, VBD, VBZ or VBG), then at most one adverb (RB or RBR).
# NOTE: every element is optional, so a single pronoun or adverb on its own
# also satisfies the pattern.
grammer_vp = r"VP: {<PRP>?<VB|VBD|VBZ|VBG>*<RB|RBR>?}"
grammer_vp

'VP: {<PRP>?<VB|VBD|VBZ|VBG>*<RB|RBR>?}'

In [12]:
# Build a second chunk parser, this time from the VP rule.
chunk_parser2 = regex_parser(grammer_vp)

In [13]:
# Example 1: the adverb "not" sits between the two verbs.
sent2 = "She is not walking to the mall"
sent_tokens2 = nltk.pos_tag(word_tokenize(sent2))
sent_tokens2

[('She', 'PRP'),
 ('is', 'VBZ'),
 ('not', 'RB'),
 ('walking', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('mall', 'NN')]

In [14]:
# The rule only allows an adverb after the verbs, so the first chunk ends at
# "not" and "walking" starts a second VP chunk of its own.
chunk_result2 = chunk_parser2.parse(sent_tokens2)
print(chunk_result2)

(S (VP She/PRP is/VBZ not/RB) (VP walking/VBG) to/TO the/DT mall/NN)


In [15]:
# Example 2: a single adverb follows the verbs.
sent3 = "She is walking quickly to the mall"
sent_tokens3 = nltk.pos_tag(word_tokenize(sent3))
sent_tokens3

[('She', 'PRP'),
 ('is', 'VBZ'),
 ('walking', 'VBG'),
 ('quickly', 'RB'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('mall', 'NN')]

In [16]:
# Pronoun + verbs + one trailing adverb fits the rule exactly, giving a single VP chunk.
chunk_result3 = chunk_parser2.parse(sent_tokens3)
print(chunk_result3)

(S (VP She/PRP is/VBZ walking/VBG quickly/RB) to/TO the/DT mall/NN)


In [17]:
# Example 3: two consecutive adverbs follow the verbs.
sent4 = "She is walking very quickly to the mall"
sent_tokens4 = nltk.pos_tag(word_tokenize(sent4))
sent_tokens4

[('She', 'PRP'),
 ('is', 'VBZ'),
 ('walking', 'VBG'),
 ('very', 'RB'),
 ('quickly', 'RB'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('mall', 'NN')]

In [18]:
# Only one adverb fits at the end of a chunk, so "very" closes the first VP.
# Because every element of the rule is optional, the leftover "quickly"
# matches on its own and becomes a second, adverb-only VP.
chunk_result4 = chunk_parser2.parse(sent_tokens4)
print(chunk_result4)

(S
  (VP She/PRP is/VBZ walking/VBG very/RB)
  (VP quickly/RB)
  to/TO
  the/DT
  mall/NN)


## Chinking

In [19]:
# Two-step rule set for the chunk label "chk_name":
#   {...}  chunk pattern -- same pattern as the VP rule
#   }...{  chink pattern -- runs of RB tokens are removed from the chunks again
chink_grammar = r"""
chk_name: 
{<PRP>?<VB|VBD|VBZ|VBG>*<RB|RBR>?} 
}<RB>+{"""

In [20]:
# Build the chunk-then-chink parser through the same RegexpParser alias used by the other parser cells.
chink_parser = regex_parser(chink_grammar)

In [21]:
# sent_tokens4 is the two-adverb sentence: the chink rule removes both adverbs
# from the chunk, so they are printed as plain tokens outside it.
print(chink_parser.parse(sent_tokens4))

(S
  (chk_name She/PRP is/VBZ walking/VBG)
  very/RB
  quickly/RB
  to/TO
  the/DT
  mall/NN)


## Context Free Grammar

In [22]:
from nltk.parse.generate import generate, demo_grammar

In [23]:
# Toy context-free grammar: a sentence is a name (NP) followed by a verb and a noun.
# With two alternatives per terminal rule it can produce 2 * 2 * 2 = 8 sentences.
CFG_grammar = nltk.CFG.fromstring("""
S -> NP VP 
VP -> V N 
V -> "saw"|"met" 
NP -> "John"|"Jim" 
N -> "dog"|"cat" 
""")

In [24]:
# Enumerate every sentence the grammar can produce, one per output line.
for sentence in generate(CFG_grammar):
    print(*sentence)  # print's default separator is a single space

John saw dog
John saw cat
John met dog
John met cat
Jim saw dog
Jim saw cat
Jim met dog
Jim met cat


### Production Rules

In [25]:
# List the grammar's rules; each `|` alternative in the grammar text shows up as its own production.
CFG_grammar.productions()

[S -> NP VP,
 VP -> V N,
 V -> 'saw',
 V -> 'met',
 NP -> 'John',
 NP -> 'Jim',
 N -> 'dog',
 N -> 'cat']

In [26]:
CFG_grammar

<Grammar with 8 productions>

## CFG Implementation -- Automatic Text Paraphrasing

In [27]:
def cfg_parse(sentence):
    """Print the sentence(s) generated by a small CFG built from ``sentence``.

    The input is tokenized and POS-tagged, and one word is picked for each
    terminal slot of the fixed grammar ``S -> NP VP``, ``VP -> V N``:

    * ``NP`` -- the last token tagged ``NNP``
    * ``V``  -- the last token tagged ``VBD`` or ``VBN``
    * ``N``  -- the last token tagged ``NN``

    Every sentence the resulting grammar generates is printed on its own
    line, with its words separated by single spaces.

    Args:
        sentence: Raw sentence text.

    Returns:
        None. The result is printed, not returned.

    Raises:
        ValueError: If the sentence has no token for one of the three slots,
            or a selected word contains both quote characters and therefore
            cannot be written as a quoted grammar terminal.
    """
    def quote_terminal(word):
        # Terminals in the grammar text are delimited by ' or " (no escape
        # syntax is assumed), so use whichever quote the word does not contain.
        for quote in ("'", '"'):
            if quote not in word:
                return quote + word + quote
        raise ValueError(
            "cannot quote {!r} as a grammar terminal: it contains both quote characters".format(word)
        )

    # Grammar slot -> POS tags whose tokens may fill it.
    slot_tags = {"NP": ("NNP",), "V": ("VBD", "VBN"), "N": ("NN",)}

    chosen = {}
    for word, tag in nltk.pos_tag(word_tokenize(sentence)):
        for slot, tags in slot_tags.items():
            if tag in tags:
                chosen[slot] = word  # a later match replaces an earlier one

    # Fail with a clear message instead of an UnboundLocalError further down.
    missing = [slot for slot in slot_tags if slot not in chosen]
    if missing:
        raise ValueError(
            "sentence has no token for grammar slot(s) {}: {!r}".format(", ".join(missing), sentence)
        )

    cfg_grammar2 = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V N
    NP -> {}
    V -> {}
    N -> {}
    """.format(*(quote_terminal(chosen[slot]) for slot in ("NP", "V", "N"))))

    # Distinct loop name so the `sentence` parameter is not shadowed.
    for words in generate(cfg_grammar2):
        print(" ".join(words))

In [28]:
# Only the NNP, VBD/VBN and NN tokens feed the grammar, so "a long white" is dropped from the output.
cfg_parse("John saw a long white boat")

John saw boat


In [29]:
# Tag the paraphrased sentence to check which POS tags its three words receive.
nltk.pos_tag(word_tokenize("John saw boat"))

[('John', 'NNP'), ('saw', 'VBD'), ('boat', 'NN')]