##Import necessary Libraries

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sivavamsi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sivavamsi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## A sentence under consideration for Information Extraction (NER)

In [2]:
sentence = 'Virat Kohli is an Indian cricketer who currently captains the India national team. A right-handed top-order batsman, Kohli is regarded as one of the best batsmen in the world'

## Apply word tokenization and part-of-speech tagging to the sentence

In [3]:
# def preprocess(sent):
#     sent =  # TOKENIZE THE SENTENCE
#     sent = # GET POS TAG OF THE SENTENCE
#     return sent

In [4]:
def preprocess(sent):
    sent = word_tokenize(sent)
    print(sent,"\n")
    sent = nltk.pos_tag(sent)
    return sent

In [5]:
sent = preprocess(sentence)
sent

['Virat', 'Kohli', 'is', 'an', 'Indian', 'cricketer', 'who', 'currently', 'captains', 'the', 'India', 'national', 'team', '.', 'A', 'right-handed', 'top-order', 'batsman', ',', 'Kohli', 'is', 'regarded', 'as', 'one', 'of', 'the', 'best', 'batsmen', 'in', 'the', 'world'] 



[('Virat', 'NNP'),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('Indian', 'JJ'),
 ('cricketer', 'NN'),
 ('who', 'WP'),
 ('currently', 'RB'),
 ('captains', 'VBZ'),
 ('the', 'DT'),
 ('India', 'NNP'),
 ('national', 'JJ'),
 ('team', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('right-handed', 'JJ'),
 ('top-order', 'NN'),
 ('batsman', 'NN'),
 (',', ','),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('regarded', 'VBN'),
 ('as', 'IN'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('batsmen', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('world', 'NN')]

## Plot a Parse Tree

In [6]:
# pattern = 'NP: {<DT>?<JJ>*<NN>}'
# NPChunker =  # Regex Parse using the pattern
# result = NPChunker.parse(sent)
# result.draw()

# SOLUTION

In [11]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'
NPChunker = nltk.RegexpParser(pattern) # Regex Parse using the pattern
result = NPChunker.parse(sent)
result.draw()

In [13]:
print(result)

(S
  Virat/NNP
  Kohli/NNP
  is/VBZ
  (NP an/DT Indian/JJ cricketer/NN)
  who/WP
  currently/RB
  captains/VBZ
  the/DT
  India/NNP
  (NP national/JJ team/NN)
  ./.
  (NP A/DT right-handed/JJ top-order/NN)
  (NP batsman/NN)
  ,/,
  Kohli/NNP
  is/VBZ
  regarded/VBN
  as/IN
  one/CD
  of/IN
  the/DT
  best/JJS
  batsmen/NNS
  in/IN
  (NP the/DT world/NN))


## POS Tags

In [14]:
sent

[('Virat', 'NNP'),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('Indian', 'JJ'),
 ('cricketer', 'NN'),
 ('who', 'WP'),
 ('currently', 'RB'),
 ('captains', 'VBZ'),
 ('the', 'DT'),
 ('India', 'NNP'),
 ('national', 'JJ'),
 ('team', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('right-handed', 'JJ'),
 ('top-order', 'NN'),
 ('batsman', 'NN'),
 (',', ','),
 ('Kohli', 'NNP'),
 ('is', 'VBZ'),
 ('regarded', 'VBN'),
 ('as', 'IN'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('batsmen', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('world', 'NN')]

In [15]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(sent)
pprint(iob_tagged)

[('Virat', 'NNP', 'O'),
 ('Kohli', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'O'),
 ('Indian', 'JJ', 'O'),
 ('cricketer', 'NN', 'O'),
 ('who', 'WP', 'O'),
 ('currently', 'RB', 'O'),
 ('captains', 'VBZ', 'O'),
 ('the', 'DT', 'O'),
 ('India', 'NNP', 'O'),
 ('national', 'JJ', 'O'),
 ('team', 'NN', 'O'),
 ('.', '.', 'O'),
 ('A', 'DT', 'O'),
 ('right-handed', 'JJ', 'O'),
 ('top-order', 'NN', 'O'),
 ('batsman', 'NN', 'O'),
 (',', ',', 'O'),
 ('Kohli', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('regarded', 'VBN', 'O'),
 ('as', 'IN', 'O'),
 ('one', 'CD', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('best', 'JJS', 'O'),
 ('batsmen', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('world', 'NN', 'O')]


**This is how Information is extracted using heuristics based techniques. Try using another pattern.**