### Using spaCy

In [2]:
import spacy



In [3]:
# Load spaCy's small English pipeline; the model package must already be installed.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Run the pipeline on a sample sentence; the entities it finds are read from doc.ents.
doc = nlp('Tesla Inc is going to acquire Twitter Inc for $ billion ')

In [6]:
# Print each detected entity with its label and spaCy's description of that label.
for ent in doc.ents:
    print(f"{ent.text} | {ent.label_} | {spacy.explain(ent.label_)}")

Tesla Inc | ORG | Companies, agencies, institutions, etc.
Twitter Inc | ORG | Companies, agencies, institutions, etc.
$ billion | MONEY | Monetary values, including unit


#### Visualize
The displacy.render function from spaCy is used to visualize the named entities in a text. It generates a visual representation with colored highlights indicating the recognized entities and their respective categories.

In [9]:

from spacy import displacy
# Render the document's named entities as highlighted text (the 'ent' style).
displacy.render(doc, style='ent')

In [10]:
# List all entity labels of the pipeline's 'ner' component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [11]:
# Sentence containing only a first name; in the recorded output only 'Google' is tagged.
doc2 = nlp('Sundar  is manager at Google.')
for ent in doc2.ents:
    print(ent)  # the Span itself prints as its text
    print(f"{ent.text} | {ent.label_} | {spacy.explain(ent.label_)}")

Google
Google | ORG | Companies, agencies, institutions, etc.


In [12]:
# Same sentence with the full name; the recorded output now also shows a PERSON entity.
doc2 = nlp('Sundar Pichai  is manager at Google.')
for ent in doc2.ents:
    print(f"{ent.text} | {ent.label_} | {spacy.explain(ent.label_)}")

Sundar Pichai | PERSON | People, including fictional
Google | ORG | Companies, agencies, institutions, etc.


In [13]:
# Customize entities: start from the model's own predictions for a new sentence.
doc = nlp('Tesla is going to acquire Twitter for $ billion ')
for ent in doc.ents:
    print(f"{ent.text} | {ent.label_}")

Twitter | PERSON
$ billion | MONEY


In [14]:
from spacy.tokens import Span

# Build ORG spans by token index (the end index is exclusive):
# tokens [0, 1) cover "Tesla" and tokens [5, 6) cover "Twitter".
s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

In [15]:
# Apply the two ORG spans; default='unmodified' leaves the other tokens' entity
# annotations as they were (the MONEY entity is still present in the output below).
doc.set_ents([s1, s2], default='unmodified')

In [16]:
# Print the entities again to confirm the manually set ORG labels.
for entity in doc.ents:
    print(f"{entity.text} | {entity.label_}")

Tesla | ORG
Twitter | ORG
$ billion | MONEY


### Using NLTK

In [17]:
import nltk

# Fetch the NLTK resources the cells below rely on. nltk.download() reports
# packages that are already up to date instead of fetching them again.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# The stop-word list loaded via nltk.corpus.stopwords needs its own corpus;
# without it stopwords.words('english') raises LookupError on a fresh machine.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [19]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Fetch the stop words
stop_words = set(stopwords.words('english'))

text = "Apple Inc. is an American multinational technology company headquartered in Cupertino, California."

# Keep only runs of word characters, which drops punctuation such as '.' and ','
tokenizer = RegexpTokenizer(r'\w+')
word_tokens = tokenizer.tokenize(text)

# Remove stop words (compared case-insensitively)
filtered_words = [token for token in word_tokens if token.lower() not in stop_words]
print(filtered_words)

['Apple', 'Inc', 'American', 'multinational', 'technology', 'company', 'headquartered', 'Cupertino', 'California']


In [21]:
## Part-of-speech (POS) tagging
from nltk import pos_tag
# NOTE(review): the tagger receives the stop-word-filtered list rather than the
# full sentence, so it sees less context — confirm this is intended.
pos_tags = pos_tag(filtered_words)
print(pos_tags)

[('Apple', 'NNP'), ('Inc', 'NNP'), ('American', 'NNP'), ('multinational', 'NNP'), ('technology', 'NN'), ('company', 'NN'), ('headquartered', 'VBD'), ('Cupertino', 'NNP'), ('California', 'NNP')]


In [22]:
## NER (named-entity recognition)
from nltk import ne_chunk
# Chunk the POS-tagged tokens into a tree whose labelled subtrees are the named entities.
named_entities = ne_chunk(pos_tags)
print(named_entities)

(S
  (PERSON Apple/NNP)
  (ORGANIZATION Inc/NNP American/NNP)
  multinational/NNP
  technology/NN
  company/NN
  headquartered/VBD
  (PERSON Cupertino/NNP California/NNP))
