In [40]:
import nltk
from PyPDF2 import PdfReader

In [47]:
# Open the corpus PDF and extract text from its first page only
# (reader.pages[0]); any further pages are not read by this cell.
reader = PdfReader('Corpus.pdf')
page = reader.pages[0] 
text = page.extract_text()

In [48]:
# Chunk grammar passed to nltk.RegexpParser below: one "LABEL: {tag pattern}"
# rule per line, newline-separated, with no trailing newline.
# The patterns are written over the POS tags produced by nltk.pos_tag
# (DT, NN, NNP, NNS, JJ, CC, RB, VB, VBD, VBN, VBZ, IN).
# No NP rule allows an adjective between determiner and noun, so a sequence
# such as "the important points" is recorded in the output as
# (ADJP the/DT important/JJ) (NP points/NNS).
# NOTE(review): rule order looks significant. The later ADJP variants
# (<VBZ>*<JJ>, <JJ><CC><JJ>+) and the <DT>*<NN>*<VB> VP rule may never fire
# once earlier rules have already chunked the <JJ>/<NN>/<VB> tokens --
# confirm against the RegexpParser documentation before reordering.
grammar = (
    'NP: {<DT>*<NN>+}\n'
    'NP: {<DT>*<NNP>+}\n'
    'NP: {<DT>*<NNS>+}\n'
    'ADJP: {<DT>*<JJ>+}\n'
    'ADJP: {<VBZ>*<JJ>}\n'
    'ADJP: {<DT>*<JJ><CC><JJ>+}\n'
    'ADVP: {<RB>+}\n'
    'VP: {<VBZ>*<VB>}\n'
    'VP: {<VBZ>*<VBD>}\n'
    'VP: {<VBZ>*<VBN>}\n'
    'VP: {<DT>*<NN>*<VB>}\n'
    'PP: {<IN>}'
)

## Using grammar

In [53]:
# Split the extracted page text into word and punctuation tokens.
tokens = nltk.word_tokenize(text)
# Prints the whole token list; the recorded output below is truncated.
print(tokens)

['What', 'is', 'my', 'happy', 'place', '?', '→', 'Close', 'your', 'eyes', 'and', 'feel', 'this', 'every', 'moment', '.', 'Let', 'it', 'heal', 'you', '!', 'It', 'covers', 'all', 'the', 'important', 'points', 'in', 'your', 'bucket', 'list', '.', 'You', 'are', 'in', 'Iceland', '.', 'There', 'is', 'a', 'beautiful', 'mountain', 'and', 'through', 'the', 'edge', ',', 'you', 'can', 'see', 'a', 'pool', 'of', 'water', '.', 'Sun', 'rays', 'bouncing', 'over', 'bits', 'of', 'glaciers', ',', 'giving', 'a', 'hint', 'of', 'blue', 'to', 'the', 'white', 'snow', '.', 'On', 'the', 'opposite', 'side', ',', 'there', 'is', 'a', 'sharp', 'cliff', 'with', 'beautifully', 'segmented', 'vertical', 'bamboo', '-like', 'rocks', '.', 'Just', 'like', 'this…', '.', 'You', 'are', 'wearing', 'your', 'favourite', 'winter', 'clothing', '–', 'a', 'beanie', ',', 'a', 'jacket', 'pulled', 'over', 'a', 'cardigan', ',', 'scarf', ',', 'snow', 'boots', ',', 'a', 'pair', 'of', 'mittens', 'and', 'are', 'set', 'to', 'build', 'a', 'te

In [54]:
# Tag every token with a part of speech; the result is a list of
# (token, tag) pairs, as the printed output shows.
tags = nltk.pos_tag(tokens)
print(tags)

[('What', 'WP'), ('is', 'VBZ'), ('my', 'PRP$'), ('happy', 'JJ'), ('place', 'NN'), ('?', '.'), ('→', 'UH'), ('Close', 'NNP'), ('your', 'PRP$'), ('eyes', 'NNS'), ('and', 'CC'), ('feel', 'VB'), ('this', 'DT'), ('every', 'DT'), ('moment', 'NN'), ('.', '.'), ('Let', 'VB'), ('it', 'PRP'), ('heal', 'VB'), ('you', 'PRP'), ('!', '.'), ('It', 'PRP'), ('covers', 'VBZ'), ('all', 'PDT'), ('the', 'DT'), ('important', 'JJ'), ('points', 'NNS'), ('in', 'IN'), ('your', 'PRP$'), ('bucket', 'NN'), ('list', 'NN'), ('.', '.'), ('You', 'PRP'), ('are', 'VBP'), ('in', 'IN'), ('Iceland', 'NNP'), ('.', '.'), ('There', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ('beautiful', 'JJ'), ('mountain', 'NN'), ('and', 'CC'), ('through', 'IN'), ('the', 'DT'), ('edge', 'NN'), (',', ','), ('you', 'PRP'), ('can', 'MD'), ('see', 'VB'), ('a', 'DT'), ('pool', 'NN'), ('of', 'IN'), ('water', 'NN'), ('.', '.'), ('Sun', 'NNP'), ('rays', 'VBZ'), ('bouncing', 'NN'), ('over', 'IN'), ('bits', 'NNS'), ('of', 'IN'), ('glaciers', 'NNS'), (',', ','

In [55]:
# Build a regular-expression chunk parser from the grammar string and run it
# over the full list of tagged tokens in one call.
chunkparser = nltk.RegexpParser(grammar)
result = chunkparser.parse(tags)
# The printed tree has root S; chunked spans appear as labelled subtrees
# (NP, ADJP, VP, PP, ...) and unchunked tokens as token/TAG leaves.
print(result)

(S
  What/WP
  is/VBZ
  my/PRP$
  (ADJP happy/JJ)
  (NP place/NN)
  ?/.
  →/UH
  (NP Close/NNP)
  your/PRP$
  (NP eyes/NNS)
  and/CC
  (VP feel/VB)
  (NP this/DT every/DT moment/NN)
  ./.
  (VP Let/VB)
  it/PRP
  (VP heal/VB)
  you/PRP
  !/.
  It/PRP
  covers/VBZ
  all/PDT
  (ADJP the/DT important/JJ)
  (NP points/NNS)
  (PP in/IN)
  your/PRP$
  (NP bucket/NN list/NN)
  ./.
  You/PRP
  are/VBP
  (PP in/IN)
  (NP Iceland/NNP)
  ./.
  There/EX
  is/VBZ
  (ADJP a/DT beautiful/JJ)
  (NP mountain/NN)
  and/CC
  (PP through/IN)
  (NP the/DT edge/NN)
  ,/,
  you/PRP
  can/MD
  (VP see/VB)
  (NP a/DT pool/NN)
  (PP of/IN)
  (NP water/NN)
  ./.
  (NP Sun/NNP)
  rays/VBZ
  (NP bouncing/NN)
  (PP over/IN)
  (NP bits/NNS)
  (PP of/IN)
  (NP glaciers/NNS)
  ,/,
  giving/VBG
  (NP a/DT hint/NN)
  (PP of/IN)
  (NP blue/NN)
  to/TO
  (ADJP the/DT white/JJ)
  (NP snow/NN)
  ./.
  (PP On/IN)
  (ADJP the/DT opposite/JJ)
  (NP side/NN)
  ,/,
  there/EX
  is/VBZ
  (ADJP a/DT sharp/JJ)
  (NP cliff/NN)
  (PP

## Using a custom chunker class (wrapping `nltk.RegexpParser`)

In [56]:
class CustomChunker(nltk.chunk.ChunkParserI):
    """Chunk parser that hands all of its work to an ``nltk.RegexpParser``.

    The wrapped parser is stored on ``self.chunk_parser`` and is built once,
    at construction time, from the grammar string supplied by the caller.
    """

    def __init__(self, grammar):
        """Create the wrapped ``nltk.RegexpParser`` from ``grammar``."""
        self.chunk_parser = nltk.RegexpParser(grammar)

    def parse(self, tagged_sentence):
        """Return the wrapped parser's result for ``tagged_sentence``.

        The input is passed through untouched and the result is returned
        as-is; this method adds no processing of its own.
        """
        return self.chunk_parser.parse(tagged_sentence)

In [57]:
# Run the same grammar and the same tagged tokens through the wrapper class.
# CustomChunker.parse only delegates to nltk.RegexpParser, and the visible
# part of the recorded output matches the direct RegexpParser run.
custom_chunker = CustomChunker(grammar)
chunks = custom_chunker.parse(tags)
print(chunks)

(S
  What/WP
  is/VBZ
  my/PRP$
  (ADJP happy/JJ)
  (NP place/NN)
  ?/.
  →/UH
  (NP Close/NNP)
  your/PRP$
  (NP eyes/NNS)
  and/CC
  (VP feel/VB)
  (NP this/DT every/DT moment/NN)
  ./.
  (VP Let/VB)
  it/PRP
  (VP heal/VB)
  you/PRP
  !/.
  It/PRP
  covers/VBZ
  all/PDT
  (ADJP the/DT important/JJ)
  (NP points/NNS)
  (PP in/IN)
  your/PRP$
  (NP bucket/NN list/NN)
  ./.
  You/PRP
  are/VBP
  (PP in/IN)
  (NP Iceland/NNP)
  ./.
  There/EX
  is/VBZ
  (ADJP a/DT beautiful/JJ)
  (NP mountain/NN)
  and/CC
  (PP through/IN)
  (NP the/DT edge/NN)
  ,/,
  you/PRP
  can/MD
  (VP see/VB)
  (NP a/DT pool/NN)
  (PP of/IN)
  (NP water/NN)
  ./.
  (NP Sun/NNP)
  rays/VBZ
  (NP bouncing/NN)
  (PP over/IN)
  (NP bits/NNS)
  (PP of/IN)
  (NP glaciers/NNS)
  ,/,
  giving/VBG
  (NP a/DT hint/NN)
  (PP of/IN)
  (NP blue/NN)
  to/TO
  (ADJP the/DT white/JJ)
  (NP snow/NN)
  ./.
  (PP On/IN)
  (ADJP the/DT opposite/JJ)
  (NP side/NN)
  ,/,
  there/EX
  is/VBZ
  (ADJP a/DT sharp/JJ)
  (NP cliff/NN)
  (PP