### Using the built-in chunker

In [1]:
import nltk

In [2]:
# Example sentence for the built-in named-entity chunker; it is sentence-split in the next cell.
text = "Lalbagh Botanical Gardens is a well known botanical garden in Bengaluru, India."

In [3]:
# Split the raw text into sentences so each one is tokenised and tagged on its own.
sentences = nltk.sent_tokenize(text)

In [4]:
# For every sentence: word-tokenise, POS-tag, run NLTK's built-in
# named-entity chunker on the tagged tokens, and print the resulting tree.
for sent in sentences:
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
    print(nltk.ne_chunk(tagged_tokens))

(S
  (PERSON Lalbagh/NNP)
  (PERSON Botanical/NNP Gardens/NNP)
  is/VBZ
  a/DT
  well/RB
  known/VBN
  botanical/JJ
  garden/NN
  in/IN
  (GPE Bengaluru/NNP)
  ,/,
  (GPE India/NNP)
  ./.)


### Writing your own simple chunker

In [5]:
# Two-sentence example used to exercise the hand-written regexp chunk grammar below.
text = "Ravi is the CEO of a Company. He is very powerful public speaker also."

In [6]:
# Noun-phrase chunk rules, one per line of the final grammar string:
#   1. zero or more determiners followed by a proper noun
#   2. zero or more adjectives followed by a common noun
#   3. a run of one or more proper nouns
np_rules = (
    'NP: {<DT>*<NNP>}',
    'NP: {<JJ>*<NN>}',
    'NP: {<NNP>+}',
)
grammar = '\n'.join(np_rules)

In [7]:
# Re-split: `text` now holds the two-sentence example defined in this section.
sentences = nltk.sent_tokenize(text)

In [9]:
import warnings

# NOTE: this silences *all* warnings for the rest of the kernel session,
# not only for this cell.
warnings.filterwarnings("ignore")

# The grammar is the same for every sentence, so build the chunk parser once
# instead of re-compiling it on each loop iteration.
chunkparser = nltk.RegexpParser(grammar)

# For every sentence: word-tokenise, POS-tag, chunk with the custom grammar,
# and print the resulting chunk tree.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    result = chunkparser.parse(tags)
    print(result)

(S
  (NP Ravi/NNP)
  is/VBZ
  (NP the/DT CEO/NNP)
  of/IN
  (NP a/DT Company/NNP)
  ./.)
(S
  He/PRP
  is/VBZ
  very/RB
  (NP powerful/JJ public/JJ speaker/NN)
  also/RB
  ./.)


### Training a chunker 

In [10]:
from nltk.corpus import conll2000

In [11]:
from nltk.corpus import treebank_chunk

In [12]:
def mySimpleChunker():
    """Return a regexp chunk parser with a single NP rule.

    The rule marks every run of one or more ``<NNP>`` tags as an ``NP`` chunk.
    """
    return nltk.RegexpParser('NP: {<NNP>+}')

In [13]:
def test_nothing(data):
    """Print the evaluation score of a parser built from an empty grammar.

    ``data`` is the collection of gold-standard chunked sentences passed to
    the parser's ``evaluate`` method.
    """
    empty_parser = nltk.RegexpParser("")
    score = empty_parser.evaluate(data)
    print(score)

In [14]:
def test_mysimplechunker(data):
    """Print the evaluation score of the ``mySimpleChunker`` parser on ``data``.

    ``data`` is the collection of gold-standard chunked sentences passed to
    the parser's ``evaluate`` method.
    """
    score = mySimpleChunker().evaluate(data)
    print(score)

In [15]:
# Gold-standard chunked corpora that both chunkers are scored against below.
datasets = [
 # CoNLL-2000 'test.txt' file, restricted to NP chunks.
 conll2000.chunked_sents('test.txt', chunk_types=['NP']),
 # treebank_chunk corpus, loaded with no arguments (reader defaults).
 treebank_chunk.chunked_sents()
]

In [16]:
# Score the empty-grammar parser and then the simple NNP chunker on the
# first 50 sentences of each corpus, in that order.
for corpus in datasets:
    for scorer in (test_nothing, test_mysimplechunker):
        scorer(corpus[:50])

ChunkParse score:
    IOB Accuracy:  38.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  48.2%%
    Precision:     71.1%%
    Recall:        17.2%%
    F-Measure:     27.7%%
ChunkParse score:
    IOB Accuracy:  45.0%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  50.7%%
    Precision:     51.9%%
    Recall:         8.8%%
    F-Measure:     15.1%%


### Parsing with a recursive descent parser

In [17]:
def RDParserExample(grammar, textlist, draw=True):
    """Parse each text with a recursive descent parser and show every tree.

    Args:
        grammar: Grammar handed to ``nltk.parse.RecursiveDescentParser``.
        textlist: Iterable of strings; each one is word-tokenised and parsed.
        draw: When true (the default, matching the original behaviour), each
            tree is also drawn via ``tree.draw()``. Pass ``False`` to only
            print the trees, e.g. when no display is available or the
            notebook is run unattended.
    """
    parser = nltk.parse.RecursiveDescentParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        for tree in parser.parse(sentence):
            print(tree)
            if draw:
                # Opens a graphical window for the tree.
                tree.draw()

In [18]:
# Toy context-free grammar with a small fixed vocabulary; its terminals are
# exactly the words of the two example sentences defined in the next cell.
# Note that NP -> NNP VBZ places the verb 'is' inside the NP constituent.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

In [19]:
# Sentences to parse; every word appears as a terminal in the CFG above.
text = [
    "Tajmahal is in Agra",
    "Bangalore is the capital of Karnataka",
]

In [20]:
# Parse both sentences; each resulting tree is printed and then drawn.
RDParserExample(grammar, text)

(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))
(S
  (NP (NNP Bangalore) (VBZ is))
  (VP (DT the) (NN capital) (IN of) (NNP Karnataka)))


### Parsing with a shift-reduce parser

In [21]:
def SRParserExample(grammar, textlist, draw=True):
    """Parse each text with a shift-reduce parser and show every tree.

    Args:
        grammar: Grammar handed to ``nltk.parse.ShiftReduceParser``.
        textlist: Iterable of strings; each one is word-tokenised and parsed.
        draw: When true (the default, matching the original behaviour), each
            tree is also drawn via ``tree.draw()``. Pass ``False`` to only
            print the trees, e.g. when no display is available or the
            notebook is run unattended.
    """
    parser = nltk.parse.ShiftReduceParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        for tree in parser.parse(sentence):
            print(tree)
            if draw:
                # Opens a graphical window for the tree.
                tree.draw()

In [22]:
# Same two sentences as in the recursive descent section, re-declared here
# (presumably so this section can be run on its own).
text = [
    "Tajmahal is in Agra",
    "Bangalore is the capital of Karnataka",
]

In [23]:
# Same toy CFG as in the recursive descent section, re-declared here so the
# two parsers are compared on an identical grammar.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

In [24]:
# Parse both sentences with the shift-reduce parser. The recorded output shows
# a tree only for the first sentence, unlike the recursive descent run above.
# NOTE(review): presumably the parser reduces `IN NNP` to VP too early on the
# second sentence and cannot recover, since it does not backtrack — confirm.
SRParserExample(grammar, text)

(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))


### Parsing with a dependency grammar and a projective dependency parser

In [25]:
# Dependency grammar for the example sentence. Each rule reads
# 'head' -> 'dependent': in the parse printed below, 'yield' is the root with
# 'savings' and 'gains' beneath it, and each of those has one modifier.
grammar = nltk.grammar.DependencyGrammar.fromstring("""
'savings' -> 'small'
'yield' -> 'savings'
'gains' -> 'large'
'yield' -> 'gains'
""")

In [26]:
# Sentence to parse; every word appears in the dependency grammar above.
sentence = 'small savings yield large gains'

In [27]:
# Projective dependency parser built from the dependency grammar above.
dp = nltk.parse.ProjectiveDependencyParser(grammar)

In [28]:
# Whitespace-split the sentence, collect every parse the dependency parser
# yields, and print then draw each one in sorted order.
dependency_trees = sorted(dp.parse(sentence.split()))
for dep_tree in dependency_trees:
    print(dep_tree)
    dep_tree.draw()

(yield (savings small) (gains large))


### Parsing with a chart parser

In [29]:
from nltk.grammar import CFG

In [30]:
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

In [31]:
# CFG for the chart parser. It uses the same vocabulary as the earlier toy
# grammar but names its phrase-level non-terminals T1..T4:
#   T1 = NNP VBZ, T2 = DT NN, T3 = IN NNP, T4 = T3 or T2 T3.
grammar = CFG.fromstring("""
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 -> IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

In [32]:
# Chart parser using the BU_LC_STRATEGY imported above; tracing is switched on,
# which is what produces the edge-by-edge listing when chart_parse runs below.
cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)

In [33]:
# Sentence to chart-parse; every word is a terminal of the grammar above.
sentence = "Bangalore is the capital of Karnataka"

In [34]:
# Plain whitespace split; the sentence contains no punctuation.
tokens = sentence.split()

In [35]:
# Build the chart for the token list; with tracing on, the edges are printed as output.
chart = cp.chart_parse(tokens)

|.Bangal.  is  . the  .capita.  of  .Karnat.|
|[------]      .      .      .      .      .| [0:1] 'Bangalore'
|.      [------]      .      .      .      .| [1:2] 'is'
|.      .      [------]      .      .      .| [2:3] 'the'
|.      .      .      [------]      .      .| [3:4] 'capital'
|.      .      .      .      [------]      .| [4:5] 'of'
|.      .      .      .      .      [------]| [5:6] 'Karnataka'
|[------]      .      .      .      .      .| [0:1] NNP -> 'Bangalore' *
|[------>      .      .      .      .      .| [0:1] T1 -> NNP * VBZ
|.      [------]      .      .      .      .| [1:2] VBZ -> 'is' *
|[-------------]      .      .      .      .| [0:2] T1 -> NNP VBZ *
|[------------->      .      .      .      .| [0:2] S  -> T1 * T4
|.      .      [------]      .      .      .| [2:3] DT -> 'the' *
|.      .      [------>      .      .      .| [2:3] T2 -> DT * NN
|.      .      .      [------]      .      .| [3:4] NN -> 'capital' *
|.      .      [-------------]      .      .| [2:

In [36]:
# Materialise every parse in the chart that is rooted at the grammar's start symbol.
parses = list(chart.parses(grammar.start()))

In [37]:
# Report how many edges the chart holds after parsing.
print("Total Edges :", len(chart.edges()))

Total Edges : 24


In [38]:
# Print every parse found for the sentence. The loop variable keeps the name
# `tree` because the following cell reads it after the loop finishes.
for tree in parses:
    print(tree)

(S
  (T1 (NNP Bangalore) (VBZ is))
  (T4 (T2 (DT the) (NN capital)) (T3 (IN of) (NNP Karnataka))))


In [39]:
# Draw the last parse found. Indexing `parses` directly avoids depending on
# the `tree` loop variable leaking out of the print loop in the previous cell,
# and the guard skips drawing (instead of raising NameError) when the chart
# produced no parse.
if parses:
    parses[-1].draw()