In [None]:
'''
What is Named Entity Recognition?
NER

To understand what is Named Entity Recognition process in NLP, it will be a good starting point to 
first understand the concept of Named Entity.

i) Named Entity
Named entities are proper nouns that refer to specific entities that can be a person, organization, location, date, etc.
Consider this example – “Mount Everest is the tallest mountain”. Here Mount Everest is a named entity of type location 
as it refers to a specific entity.

Some other examples of named entities are listed below in the table.

Named Entity:	Examples
1	ORGANIZATION	SEI, BCCI, Pakistan Cricket Board
2	PERSON	Barack Obama, Narendra Modi, Kohli
3	MONEY	7 million dollars, INR 7 Crore
4	GPE	India, Australia, South East Asia
5	LOCATION	Mount Everest, River Nile
6	DATE	8th June 1998, 7 April
7	TIME	8:45 A.M., two-fifty am
ii) Named Entity Recognition
In information retrieval and natural language processing, Named Entity Recognition (NER) is the process of extracting Named
Entities from the text.

NER is a two steps process, we first perform Part of Speech (POS) tagging on the text, and then using it we extract the named
entities based on the information of POS tagging
'''

In [None]:
'''
Uses of Named Entity Recognition
Named Entity Recognition is useful in –

The field of academics by easy and faster extraction of information for the students and researchers from the searching data.
-In Question Answer system to provide answers from the data by the machine and hence minimizing human efforts.
-In content classification by identifying the theme and subject of the contents and makes the process faster and easy,
    suggesting the best content of interest.
-Helps in customer service by categorizing the user complaint, request, and question in respective fields and filtering by priority keywords.
-Helps to categories the books and articles in the e-library on different subjects and thus making it organized.

'''

In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [4]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [5]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [6]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [7]:
def process_content(binary=False):
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            named_entity = nltk.ne_chunk(tagged, binary=binary) ## this is the function for NER
            
            print(named_entity)
            
    except Exception as e:
        print(str(e))

In [8]:
process_content()

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (ORGANIZATION Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (GPE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  

In [9]:
process_content(True)

(S
  PRESIDENT/NNP
  (NE GEORGE/NNP)
  W./NNP
  BUSH/NNP
  'S/POS
  (NE ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  JOINT/NNP
  SESSION/NNP
  OF/IN
  (NE THE/NNP)
  (NE CONGRESS/NNP)
  ON/NNP
  THE/NNP
  STATE/NNP
  OF/IN
  (NE THE/NNP UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (NE Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (NE Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (NE Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (NE Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (NE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted/VBN
  by/IN
  the

In [None]:
##Additional Stuff

In [10]:
import nltk
from nltk import word_tokenize,pos_tag

text = "NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander."
tokens = word_tokenize(text)
tag=pos_tag(tokens)
print(tag)

ne_tree = nltk.ne_chunk(tag)
print(ne_tree)

[('NASA', 'NNP'), ('awarded', 'VBD'), ('Elon', 'NNP'), ('Musk', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('SpaceX', 'NNP'), ('a', 'DT'), ('$', '$'), ('2.9', 'CD'), ('billion', 'CD'), ('contract', 'NN'), ('to', 'TO'), ('build', 'VB'), ('the', 'DT'), ('lunar', 'NN'), ('lander', 'NN'), ('.', '.')]
(S
  (ORGANIZATION NASA/NNP)
  awarded/VBD
  (PERSON Elon/NNP Musk/NNP)
  ’/NNP
  s/VBD
  (ORGANIZATION SpaceX/NNP)
  a/DT
  $/$
  2.9/CD
  billion/CD
  contract/NN
  to/TO
  build/VB
  the/DT
  lunar/NN
  lander/NN
  ./.)


In [11]:
sent = nltk.corpus.treebank.tagged_sents()
print(nltk.ne_chunk(sent[0]))

(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


In [None]:
## NER with SpaCy

In [12]:
import spacy 
nlp = spacy.load("en_core_web_sm")

doc = nlp("NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander.")
for token in doc:
    print(token.text, token.ent_iob_, token.ent_type_)

NASA B ORG
awarded O 
Elon B PERSON
Musk I PERSON
’s I PERSON
SpaceX O 
a O 
$ B MONEY
2.9 I MONEY
billion I MONEY
contract O 
to O 
build O 
the O 
lunar O 
lander O 
. O 
