In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; every later cell reuses `nlp`.
nlp = spacy.load('en_core_web_sm')

In [2]:
#write a function
def show_ents(doc):
    """Print the named entities of ``doc``, one per line.

    Each line has the form ``<entity text>-<label>-<explanation>``, where the
    explanation is whatever ``spacy.explain`` returns for the label (printed
    as ``None`` when it returns ``None``). If ``doc.ents`` is empty, a single
    'No named entities found' line is printed instead.

    Args:
        doc: An object exposing ``ents``, an iterable of entities that each
            have ``text`` and ``label_`` string attributes.
    """
    if not doc.ents:
        print('No named entities found')
        return
    for ent in doc.ents:
        explanation = spacy.explain(ent.label_)
        print(f'{ent.text}-{ent.label_}-{explanation}')

In [3]:
# 'May' occurs twice in this sentence; list which spans the model tags.
doc = nlp('May I go to Washington, DC next May to see the washington Mounment ?')
show_ents(doc)

Washington, DC-GPE-Countries, cities, states
next May-DATE-Absolute or relative dates or periods
the washington Mounment-ORG-Companies, agencies, institutions, etc.


In [4]:
# Parse a second sentence and list its entities.
doc = nlp('Do you know the Capital of India is Delhi')
show_ents(doc)

the Capital of India-ORG-Companies, agencies, institutions, etc.
Delhi-GPE-Countries, cities, states


In [5]:
# A sentence with a monetary amount and a company name.
doc = nlp('Can I please borrow 500 dollars from you to buy some Microsoft stock?')
show_ents(doc)

500 dollars-MONEY-Monetary values, including unit
Microsoft-ORG-Companies, agencies, institutions, etc.


In [6]:
# Per entity: text, token start/end, character start/end, label.
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [7]:
# List the entities, then their token and character offsets.
doc = nlp('Tesla to build a U.K. factory for $6 Million')
show_ents(doc)
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

U.K.-GPE-Countries, cities, states
$6 Million-MONEY-Monetary values, including unit
U.K. 4 5 17 21 GPE
$6 Million 7 10 34 44 MONEY


In [8]:
# Adding a named entity to a Span (manually tagging a token the model did not label)

In [9]:
# In the recorded output 'Tesla' is not tagged; it is added by hand in the next cells.
doc = nlp('Tesla to build a U.k. factory for $6 million')
show_ents(doc)

U.k.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [10]:
from spacy.tokens import Span

In [11]:
# Look up the hash value stored for the 'ORG' label in the doc's vocab
# string store; it is used as the label of the new span below.
ORG = doc.vocab.strings['ORG']

In [12]:
# Span over token indices 0 up to (not including) 1 of `doc`, labelled ORG.
new_ent = Span(doc, 0, 1, label=ORG)

In [13]:
# Reassign doc.ents as the existing entities followed by the new span.
doc.ents = [*doc.ents, new_ent]

In [14]:
# Arguments of Span(doc, 0, 1, label=ORG):
# doc   = the Doc object the span is created on
# 0     = the start token index of the span
# 1     = the stop token index of the span (exclusive)
# label = ORG, the label assigned to our entity

In [15]:
# List the entities again now that the hand-made span has been added to doc.ents.
show_ents(doc)

Tesla-ORG-Companies, agencies, institutions, etc.
U.k.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [16]:
# A fresh parse: the span added by hand earlier belonged to the previous doc only.
doc = nlp('Tesla to build  a U.k. factory for $6 million.Tesla is a AI car')
show_ents(doc)

U.k.-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit
Tesla-ORG-Companies, agencies, institutions, etc.
AI-ORG-Companies, agencies, institutions, etc.


In [17]:
# Adding a named entity for every occurrence of a phrase
doc = nlp('Our company plans to intoduce a new vacuum cleaner. If successful, the vacuum cleaner will be our first product.')

In [18]:
from spacy.matcher import PhraseMatcher

In [19]:
# The matcher is built on the pipeline's vocab.
matcher = PhraseMatcher(nlp.vocab)

In [20]:
# Create the desired phrase patterns: the spaced and the hyphenated spelling.
# (The hyphenated variant was previously misspelled 'vacumm-cleaner', so it
# could never match the intended text.)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [21]:
# Phrase patterns only need to be tokenized, so use nlp.make_doc rather than
# running the full pipeline (tagger, parser, NER) on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [22]:
# spaCy v3 call form: add(key, patterns) with the patterns passed as one list.
# The earlier add(key, None, *patterns) form is the legacy v2 signature.
matcher.add('newproduct', phrase_patterns)

In [23]:
# Run the matcher on the doc; each result is a (match_id, start, end) tuple.
matches = matcher(doc)

In [24]:
# Show the raw match tuples; positions 1 and 2 are used as span bounds below.
print(matches)

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]


In [25]:
# Here we create a Span for each match and add those spans to the doc as named entities
from spacy.tokens import Span

In [26]:
# Value stored for the 'PRODUCT' label in the doc's vocab string store.
PROD = doc.vocab.strings['PRODUCT']

In [27]:
# One PRODUCT span per match; the match id (first tuple item) is not needed.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

In [28]:
# Reassign doc.ents as the existing entities followed by the matched spans.
doc.ents = [*doc.ents, *new_ents]

In [29]:
# List the entities again now that the matched spans have been added to doc.ents.
show_ents(doc)

vacuum cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
vacuum cleaner-PRODUCT-Objects, vehicles, foods, etc. (not services)
first-ORDINAL-"first", "second", etc.


In [30]:
# Counting entities of a given type
doc = nlp('Originally priced at $29.50, the seater was marked down to five dollars.')
show_ents(doc)

29.50-MONEY-Monetary values, including unit
five dollars-MONEY-Monetary values, including unit


In [31]:
# Number of entities labelled MONEY, counted without building a temporary list.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

In [33]:
# Same sentence with a line break after the comma.
doc = nlp('Originally priced at $29.50,\nthe seater was marked down to five dollars.')
show_ents(doc)

29.50-MONEY-Monetary values, including unit
five dollars-MONEY-Monetary values, including unit


In [35]:
# Record the installed spaCy version alongside the outputs in this notebook.
spacy.__version__

'3.7.6'

In [36]:
# Re-run the line-break example (same input as the earlier cell).
doc = nlp('Originally priced at $29.50,\nthe seater was marked down to five dollars.')
show_ents(doc)

29.50-MONEY-Monetary values, including unit
five dollars-MONEY-Monetary values, including unit


In [37]:
# Noun chunks: for each chunk print its text, its root token, the root's
# dependency label and the text of the root's head, joined by '-'.
doc = nlp('Autonomous cars shift insurance liability toward manufacturers.')
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text, sep='-')

Autonomous cars-cars-nsubj-shift
insurance liability-liability-dobj-shift
manufacturers-manufacturers-pobj-toward


In [42]:
# doc.noun_chunks is a generator, so len() cannot be applied to it directly.
# The error is the point of this cell, so it is caught and printed; that way
# the notebook still runs top to bottom.
try:
    len(doc.noun_chunks)
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: object of type 'generator' has no len()

In [45]:
# Materialise the noun chunks into a list so they can be displayed.
print(list(doc.noun_chunks))

[Autonomous cars, insurance liability, manufacturers]


In [47]:
# Count the noun chunks by consuming the generator.
sum(1 for _ in doc.noun_chunks)

3

In [49]:
#import the displacy library
from spacy import displacy

In [54]:
# NOTE(review): the two literals are concatenated as-is, so there is no space
# between 'million.' and 'By' in the parsed text — confirm this is intended.
doc = nlp(
    'Over the last quater Apple sold nearly 20 tousand iPods for a profit of $6 million.'
    'By contrast,Sony sold only 7 thousand Walkman music players.'
)
displacy.render(doc, style='ent', jupyter=True)

In [52]:
# Viewing the entities sentence by sentence: each sentence's text is parsed
# again on its own and rendered separately.
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

In [55]:
# NOTE(review): the two literals are concatenated as-is, so there is no space
# between 'million.' and 'By' in the parsed text — confirm this is intended.
doc2 = nlp(
    'Over the last quater Apple sold nearly 20 tousand iPods for a profit of $6 million.'
    'By contrast,my kids sold a lot of lemonade.'
)
for sent in doc2.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)



In [56]:
# Render a sentence only when it has entities; otherwise print its plain text.
for sent in doc2.sents:
    docx = nlp(sent.text)
    if not docx.ents:
        print(docx.text)
        continue
    displacy.render(docx, style='ent', jupyter=True)

By contrast,my kids sold a lot of lemonade.


In [60]:
# NOTE(review): the two literals are concatenated as-is, so there is no space
# between 'million.' and 'By' in the parsed text — confirm this is intended.
doc2 = nlp(
    'Over the last quater Apple sold nearly 20 tousand iPods for a profit of $6 million.'
    'By contrast,Sony sold only 7 thousand Walkman music players.'
)
# Render a sentence only when it has entities; otherwise print its plain text.
for sent in doc2.sents:
    docx = nlp(sent.text)
    if not docx.ents:
        print(docx.text)
    else:
        displacy.render(docx, style='ent', jupyter=True)

In [62]:
# Pass an 'ents' option listing only the ORG and PRODUCT labels to the renderer.
options = {'ents': ['ORG', 'PRODUCT']}
displacy.render(doc2, style='ent', jupyter=True, options=options)