In [1]:
from pprint import pprint

# Using NLTK

In [2]:
from nltk import word_tokenize, pos_tag, ne_chunk

### All good!

In [3]:
# NLTK pipeline, innermost call first: tokenize -> POS-tag -> chunk named entities.
# `ne_tree` is kept around because a later cell converts it to IOB tags.
sentence = "Mark is working at the South Africa offices at Google"
ne_tree = ne_chunk(
    pos_tag(
        word_tokenize(sentence)
    )
)
print(ne_tree)

(S
  (PERSON Mark/NNP)
  is/VBZ
  working/VBG
  at/IN
  the/DT
  (LOCATION South/NNP Africa/NNP)
  offices/NNS
  at/IN
  (ORGANIZATION Google/NNP))


### Not so good... (`Donald` and `Google` are both mislabelled as `GPE`)

In [4]:
# Same tokenize -> POS-tag -> NE-chunk pipeline on a second sentence.
# The tree is printed directly and not stored, so `ne_tree` still refers to
# the tree built in the earlier cell.
sentence = "Donald is working at the Netherlands offices of Google"
print(
    ne_chunk(pos_tag(word_tokenize(sentence)))
)

(S
  (GPE Donald/NNP)
  is/VBZ
  working/VBG
  at/IN
  the/DT
  (GPE Netherlands/NNP)
  offices/NNS
  of/IN
  (GPE Google/NNP))


### Include IOB tags (`tree2conlltags` emits IOB, not BILOU)

In [5]:
from nltk.chunk import conlltags2tree, tree2conlltags

In [6]:
# Flatten the chunk tree built from the "Mark ..." sentence into
# (word, POS tag, IOB entity tag) triples.
iob_tagged = tree2conlltags(ne_tree)
pprint(iob_tagged)

[('Mark', 'NNP', 'B-PERSON'),
 ('is', 'VBZ', 'O'),
 ('working', 'VBG', 'O'),
 ('at', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('South', 'NNP', 'B-LOCATION'),
 ('Africa', 'NNP', 'I-LOCATION'),
 ('offices', 'NNS', 'O'),
 ('at', 'IN', 'O'),
 ('Google', 'NNP', 'B-ORGANIZATION')]


# Using spaCy

In [7]:
import spacy
# Load the `en_core_web_md` pipeline by name; `nlp` is reused by the cells below.
# NOTE(review): presumably the model package must be installed beforehand
# (e.g. `python -m spacy download en_core_web_md`) — confirm for the target environment.
nlp = spacy.load("en_core_web_md")

In [8]:
# Run the loaded pipeline on the first sentence and list every detected
# entity span as a (text, label) pair.
doc = nlp("Mark is working at the South Africa offices at Google")
pprint([
    (ent.text, ent.label_)
    for ent in doc.ents
])

[('Mark', 'PERSON'), ('South Africa', 'GPE'), ('Google', 'ORG')]


In [9]:
# Second sentence: `doc` is rebound here, and the following cell inspects
# this document token by token.
doc = nlp("Donald is working at the Netherlands offices of Google")
pprint([
    (ent.text, ent.label_)
    for ent in doc.ents
])

[('Donald', 'PERSON'), ('Netherlands', 'GPE'), ('Google', 'ORG')]


### IOB tags (`ent_iob_` reports IOB codes, not BILOU)

In [10]:
# Per-token view of the current `doc`: the token itself, its IOB code, and its
# entity type (an empty string when the token is outside any entity).
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

[(Donald, 'B', 'PERSON'),
 (is, 'O', ''),
 (working, 'O', ''),
 (at, 'O', ''),
 (the, 'O', ''),
 (Netherlands, 'B', 'GPE'),
 (offices, 'O', ''),
 (of, 'O', ''),
 (Google, 'B', 'ORG')]
