In [56]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [57]:
# word tokenization and part-of-speech tagging applied on the sentence.
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Parameters
    ----------
    sent : str
        The raw sentence text.

    Returns
    -------
    The output of ``nltk.pos_tag`` for the word tokens of ``sent``
    (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [58]:
# Example sentence reused by every cell below (NLTK and spaCy sections alike).
ex = 'Mark playing in garden with his friend John and searching for games on Google, on Monday at 11, suddenly they got $4 :)'

### The result is a list of tuples, each pairing a word from the sentence with its part-of-speech tag.

In [59]:
# Tokenize and POS-tag the example sentence; the bare name on the last line displays the result.
sent = preprocess(ex)
sent

[('Mark', 'NNP'),
 ('playing', 'NN'),
 ('in', 'IN'),
 ('garden', 'NN'),
 ('with', 'IN'),
 ('his', 'PRP$'),
 ('friend', 'NN'),
 ('John', 'NNP'),
 ('and', 'CC'),
 ('searching', 'VBG'),
 ('for', 'IN'),
 ('games', 'NNS'),
 ('on', 'IN'),
 ('Google', 'NNP'),
 (',', ','),
 ('on', 'IN'),
 ('Monday', 'NNP'),
 ('at', 'IN'),
 ('11', 'CD'),
 (',', ','),
 ('suddenly', 'RB'),
 ('they', 'PRP'),
 ('got', 'VBD'),
 ('$', '$'),
 ('4', 'CD'),
 (':', ':'),
 (')', ')')]

Now we’ll implement noun phrase chunking to identify named entities using a regular expression consisting of rules that indicate how sentences should be chunked.
Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.

In [60]:
# Chunk grammar with a single NP rule: optional <DT>, any number of <JJ>, then one <NN>.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

### Chunking

In [61]:
# Build a regular-expression chunk parser from `pattern` and apply it to the POS-tagged sentence.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# Printing the parse result shows the chunk tree in bracketed form.
print(cs)

(S
  Mark/NNP
  (NP playing/NN)
  in/IN
  (NP garden/NN)
  with/IN
  his/PRP$
  (NP friend/NN)
  John/NNP
  and/CC
  searching/VBG
  for/IN
  games/NNS
  on/IN
  Google/NNP
  ,/,
  on/IN
  Monday/NNP
  at/IN
  11/CD
  ,/,
  suddenly/RB
  they/PRP
  got/VBD
  $/$
  4/CD
  :/:
  )/))


In [62]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
from nltk.chunk import ne_chunk
# Flatten the chunk tree `cs` into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Mark', 'NNP', 'O'),
 ('playing', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('garden', 'NN', 'B-NP'),
 ('with', 'IN', 'O'),
 ('his', 'PRP$', 'O'),
 ('friend', 'NN', 'B-NP'),
 ('John', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('searching', 'VBG', 'O'),
 ('for', 'IN', 'O'),
 ('games', 'NNS', 'O'),
 ('on', 'IN', 'O'),
 ('Google', 'NNP', 'O'),
 (',', ',', 'O'),
 ('on', 'IN', 'O'),
 ('Monday', 'NNP', 'O'),
 ('at', 'IN', 'O'),
 ('11', 'CD', 'O'),
 (',', ',', 'O'),
 ('suddenly', 'RB', 'O'),
 ('they', 'PRP', 'O'),
 ('got', 'VBD', 'O'),
 ('$', '$', 'O'),
 ('4', 'CD', 'O'),
 (':', ':', 'O'),
 (')', ')', 'O')]


There is one token per line, each with its part-of-speech tag and its IOB chunk tag (here `B-NP` marks the beginning of a noun phrase and `O` marks a token outside any chunk). Based on a training corpus in this format, we can construct a tagger that can be used to label new sentences, and use the nltk.chunk.conlltags2tree() function to convert the tag sequences into a chunk tree.

In [63]:
# ne_chunk takes POS-tagged tokens, so reuse preprocess() rather than
# repeating the tokenize + tag pipeline inline.
ne_tree = ne_chunk(preprocess(ex))
print(ne_tree)

(S
  (PERSON Mark/NNP)
  playing/NN
  in/IN
  garden/NN
  with/IN
  his/PRP$
  friend/NN
  (PERSON John/NNP)
  and/CC
  searching/VBG
  for/IN
  games/NNS
  on/IN
  (GPE Google/NNP)
  ,/,
  on/IN
  Monday/NNP
  at/IN
  11/CD
  ,/,
  suddenly/RB
  they/PRP
  got/VBD
  $/$
  4/CD
  :/:
  )/))


# SpaCy
SpaCy’s named entity recognition has been trained on the OntoNotes 5 corpus.

In [64]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy model; `nlp` is the callable applied to text in the cells below.
nlp = en_core_web_sm.load()

In [65]:
# Run the spaCy pipeline once on the example sentence and keep the parse in `doc`.
doc = nlp(ex)
# One (text, label) pair per entity span that spaCy recognised.
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Mark', 'PERSON'),
 ('John', 'PERSON'),
 ('Google', 'ORG'),
 ('Monday', 'DATE'),
 ('11', 'CARDINAL'),
 ('4', 'MONEY')]


### Token

In [66]:
# Token-level view: every token with its IOB code and entity type.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Mark, 'B', 'PERSON'),
 (playing, 'O', ''),
 (in, 'O', ''),
 (garden, 'O', ''),
 (with, 'O', ''),
 (his, 'O', ''),
 (friend, 'O', ''),
 (John, 'B', 'PERSON'),
 (and, 'O', ''),
 (searching, 'O', ''),
 (for, 'O', ''),
 (games, 'O', ''),
 (on, 'O', ''),
 (Google, 'B', 'ORG'),
 (,, 'O', ''),
 (on, 'O', ''),
 (Monday, 'B', 'DATE'),
 (at, 'O', ''),
 (11, 'B', 'CARDINAL'),
 (,, 'O', ''),
 (suddenly, 'O', ''),
 (they, 'O', ''),
 (got, 'O', ''),
 ($, 'O', ''),
 (4, 'B', 'MONEY'),
 (:), 'O', '')]


"B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and an empty string "" means no entity tag is set.

In [67]:
# Highlight the recognised entities inline. `ex` is already a str and `doc`
# already holds its parse, so reuse `doc` instead of re-running the pipeline.
displacy.render(doc, style='ent', jupyter=True)

In [68]:
# Draw the dependency parse of the example sentence; reuse the existing `doc`
# rather than parsing `ex` again. 'distance' is passed through to displaCy's options.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

In [69]:
# (surface form, coarse POS, lemma) for every token that is neither a stop
# word nor punctuation. Filtering in the same comprehension avoids building a
# throwaway inner list, and `doc` is reused instead of re-parsing `ex`.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in doc
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Mark', 'PROPN', 'Mark'),
 ('playing', 'VERB', 'play'),
 ('garden', 'NOUN', 'garden'),
 ('friend', 'NOUN', 'friend'),
 ('John', 'PROPN', 'John'),
 ('searching', 'VERB', 'search'),
 ('games', 'NOUN', 'game'),
 ('Google', 'PROPN', 'Google'),
 ('Monday', 'PROPN', 'Monday'),
 ('11', 'NUM', '11'),
 ('suddenly', 'ADV', 'suddenly'),
 ('got', 'VERB', 'get'),
 ('$', 'SYM', '$'),
 ('4', 'NUM', '4')]

### Named entity extraction

In [70]:
# Map each entity's text to its label. A dict comprehension replaces
# dict([...]) over a temporary list, and the existing `doc` is reused
# instead of re-running the pipeline on `ex`.
{ent.text: ent.label_ for ent in doc.ents}


{'Mark': 'PERSON',
 'John': 'PERSON',
 'Google': 'ORG',
 'Monday': 'DATE',
 '11': 'CARDINAL',
 '4': 'MONEY'}

In [71]:
# Same token-level IOB listing, printed on a single line; reuse `doc`
# rather than parsing `ex` again (str(ex) was a no-op on a str).
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Mark, 'B', 'PERSON'), (playing, 'O', ''), (in, 'O', ''), (garden, 'O', ''), (with, 'O', ''), (his, 'O', ''), (friend, 'O', ''), (John, 'B', 'PERSON'), (and, 'O', ''), (searching, 'O', ''), (for, 'O', ''), (games, 'O', ''), (on, 'O', ''), (Google, 'B', 'ORG'), (,, 'O', ''), (on, 'O', ''), (Monday, 'B', 'DATE'), (at, 'O', ''), (11, 'B', 'CARDINAL'), (,, 'O', ''), (suddenly, 'O', ''), (they, 'O', ''), (got, 'O', ''), ($, 'O', ''), (4, 'B', 'MONEY'), (:), 'O', '')]
