In [1]:
import nltk
import pandas as pd
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [2]:
text = "Pichai Sundararajan, better known as Sundar Pichai.\
is an Indian-American business executive. He is the chief executive officer of Alphabet Inc.and its subsidiary Google.\
Born in Madurai, India, Pichai earned his degree from IIT Kharagpur in metallurgical engineering."

In [3]:
#tokenize to words Basic Named Entity (NE) tagging using NLTK - Word based
words = nltk.word_tokenize(text)
words

['Pichai',
 'Sundararajan',
 ',',
 'better',
 'known',
 'as',
 'Sundar',
 'Pichai.is',
 'an',
 'Indian-American',
 'business',
 'executive',
 '.',
 'He',
 'is',
 'the',
 'chief',
 'executive',
 'officer',
 'of',
 'Alphabet',
 'Inc.and',
 'its',
 'subsidiary',
 'Google.Born',
 'in',
 'Madurai',
 ',',
 'India',
 ',',
 'Pichai',
 'earned',
 'his',
 'degree',
 'from',
 'IIT',
 'Kharagpur',
 'in',
 'metallurgical',
 'engineering',
 '.']

In [4]:
#Part of speech tagging
pos_tags = nltk.pos_tag(words)
pos_tags

[('Pichai', 'NNP'),
 ('Sundararajan', 'NNP'),
 (',', ','),
 ('better', 'RB'),
 ('known', 'VBN'),
 ('as', 'IN'),
 ('Sundar', 'NNP'),
 ('Pichai.is', 'NNP'),
 ('an', 'DT'),
 ('Indian-American', 'JJ'),
 ('business', 'NN'),
 ('executive', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('chief', 'JJ'),
 ('executive', 'JJ'),
 ('officer', 'NN'),
 ('of', 'IN'),
 ('Alphabet', 'NNP'),
 ('Inc.and', 'NNP'),
 ('its', 'PRP$'),
 ('subsidiary', 'NN'),
 ('Google.Born', 'NNP'),
 ('in', 'IN'),
 ('Madurai', 'NNP'),
 (',', ','),
 ('India', 'NNP'),
 (',', ','),
 ('Pichai', 'NNP'),
 ('earned', 'VBD'),
 ('his', 'PRP$'),
 ('degree', 'NN'),
 ('from', 'IN'),
 ('IIT', 'NNP'),
 ('Kharagpur', 'NNP'),
 ('in', 'IN'),
 ('metallurgical', 'JJ'),
 ('engineering', 'NN'),
 ('.', '.')]

In [5]:
#check nltk help for description of the tag
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [6]:
chunks = nltk.ne_chunk(pos_tags, binary=True) #either NE or not NE
for chunk in chunks:
    print(chunk)

(NE Pichai/NNP Sundararajan/NNP)
(',', ',')
('better', 'RB')
('known', 'VBN')
('as', 'IN')
(NE Sundar/NNP)
('Pichai.is', 'NNP')
('an', 'DT')
('Indian-American', 'JJ')
('business', 'NN')
('executive', 'NN')
('.', '.')
('He', 'PRP')
('is', 'VBZ')
('the', 'DT')
('chief', 'JJ')
('executive', 'JJ')
('officer', 'NN')
('of', 'IN')
(NE Alphabet/NNP)
('Inc.and', 'NNP')
('its', 'PRP$')
('subsidiary', 'NN')
('Google.Born', 'NNP')
('in', 'IN')
(NE Madurai/NNP)
(',', ',')
(NE India/NNP)
(',', ',')
(NE Pichai/NNP)
('earned', 'VBD')
('his', 'PRP$')
('degree', 'NN')
('from', 'IN')
(NE IIT/NNP Kharagpur/NNP)
('in', 'IN')
('metallurgical', 'JJ')
('engineering', 'NN')
('.', '.')


In [7]:
entities =[]
labels =[]
for chunk in chunks:
    if hasattr(chunk,'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,India,NE
1,Alphabet,NE
2,Pichai Sundararajan,NE
3,Pichai,NE
4,Sundar,NE
5,Madurai,NE
6,IIT Kharagpur,NE


In [8]:
chunks = nltk.ne_chunk(pos_tags, binary=False) #either NE or not NE
for chunk in chunks:
    print(chunk)
    
entities =[]
labels =[]
for chunk in chunks:
    if hasattr(chunk,'label'):
        #print(chunk)
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities, labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

(PERSON Pichai/NNP)
(ORGANIZATION Sundararajan/NNP)
(',', ',')
('better', 'RB')
('known', 'VBN')
('as', 'IN')
(ORGANIZATION Sundar/NNP)
('Pichai.is', 'NNP')
('an', 'DT')
('Indian-American', 'JJ')
('business', 'NN')
('executive', 'NN')
('.', '.')
('He', 'PRP')
('is', 'VBZ')
('the', 'DT')
('chief', 'JJ')
('executive', 'JJ')
('officer', 'NN')
('of', 'IN')
(ORGANIZATION Alphabet/NNP)
('Inc.and', 'NNP')
('its', 'PRP$')
('subsidiary', 'NN')
('Google.Born', 'NNP')
('in', 'IN')
(GPE Madurai/NNP)
(',', ',')
(GPE India/NNP)
(',', ',')
(PERSON Pichai/NNP)
('earned', 'VBD')
('his', 'PRP$')
('degree', 'NN')
('from', 'IN')
(ORGANIZATION IIT/NNP Kharagpur/NNP)
('in', 'IN')
('metallurgical', 'JJ')
('engineering', 'NN')
('.', '.')


Unnamed: 0,Entities,Labels
0,IIT Kharagpur,ORGANIZATION
1,Sundar,ORGANIZATION
2,Pichai,PERSON
3,Sundararajan,ORGANIZATION
4,India,GPE
5,Alphabet,ORGANIZATION
6,Madurai,GPE


In [9]:
#Basic Named Entity (NE) tagging using NLTK - Sentence based
entities = []
labels = []

sentence = nltk.sent_tokenize(text)
for sent in sentence:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)),binary=False):
        if hasattr(chunk,'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())
            
entities_labels = list(set(zip(entities,labels)))

entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities","Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,IIT Kharagpur,ORGANIZATION
1,Sundar,ORGANIZATION
2,Pichai,PERSON
3,Sundararajan,ORGANIZATION
4,India,GPE
5,Alphabet,ORGANIZATION
6,Madurai,GPE


In [10]:
import spacy 
from spacy import displacy
#SpaCy 2.x brough significant speed and accuracy improvements
spacy.__version__

'3.4.3'

In [11]:
# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

In [12]:
doc = nlp(text)

entities = []
labels = []
position_start = []
position_end = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
    
df = pd.DataFrame({'Entities':entities,'Labels':labels,'Position_Start':position_start, 'Position_End':position_end})

df

Unnamed: 0,Entities,Labels,Position_Start,Position_End
0,"(Pichai, Sundararajan)",PERSON,0,19
1,"(Sundar, Pichai.is)",LOC,37,53
2,"(Indian, -, American)",NORP,57,72
3,"(Alphabet, Inc.and)",ORG,130,146
4,(Google),ORG,162,168
5,(Madurai),GPE,177,184
6,(India),GPE,186,191
7,(Pichai),GPE,193,199
8,"(IIT, Kharagpur)",ORG,223,236
