### The purpose of this project is to build a machine learning system that can automatically identify and categorize key entities in text, such as people, places, and organizations. This is a core task in Natural Language Processing (NLP) known as Named Entity Recognition (NER).

In [1]:
import nltk
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [2]:
corpus = nltk.corpus.conll2002.iob_sents()

In [3]:
data = []
for cor in corpus:
    sent, _, tag = list(zip(*cor))
    data.append([sent, tag])

In [4]:
len(data)

35651

In [5]:
data[0]

[('Sao',
  'Paulo',
  '(',
  'Brasil',
  ')',
  ',',
  '23',
  'may',
  '(',
  'EFECOM',
  ')',
  '.'),
 ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]

## Numerization


In [6]:
flatten = lambda l: [item for sublist in l for item in sublist]

sents, tags = list(zip(*data))
vocab  = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

In [7]:
tagset

['I-LOC', 'B-MISC', 'I-ORG', 'B-LOC', 'I-PER', 'I-MISC', 'B-ORG', 'B-PER', 'O']

In [8]:
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for v in vocab:
    if word2index.get(v) is None:
        word2index[v] = len(word2index)
index2word = {v:k for k, v in word2index.items()}

tag2index = {}
for v in tagset:
    if tag2index.get(v) is None:
        tag2index[v] = len(tag2index)
index2tag = {v:k for k, v in tag2index.items()}