In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Write a function to display basic entity info:
def show_ents(doc):
    """Print one line per named entity found in ``doc``.

    Each line has the form ``<text> - <label> - <description>``, where the
    description is looked up with ``spacy.explain``. If ``doc`` has no
    entities, a single "not found" message is printed instead.

    Args:
        doc: An object exposing ``ents``, a sequence of entities that each
            provide ``text`` and ``label_`` string attributes.

    Returns:
        None. All output is written to stdout.
    """
    if not doc.ents:
        print("No Named entity found.")
        return

    for ent in doc.ents:
        # spacy.explain returns None for labels it has no description for
        # (e.g. custom labels); concatenating None would raise a TypeError.
        description = spacy.explain(ent.label_) or "No description available"
        print(f"{ent.text} - {ent.label_} - {description}")

In [3]:
# "May" occurs twice in this sentence: once as a modal verb, once as a month.
doc = nlp("May I go to Washington, DC next May to see the Washington Monument?")
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [4]:
# Sentence containing an amount of money and a company name.
doc = nlp("Can I please borrow 500 dollars from you to buy some Microsoft stock?")

In [5]:
# For every entity: text, token start/end, character start/end, label.
for ent in doc.ents:
    token_span = (ent.start, ent.end)
    char_span = (ent.start_char, ent.end_char)
    print(ent.text, *token_span, *char_span, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [6]:
# Sentence used below to demonstrate adding a missing entity by hand.
doc = nlp("Tesla to build a U.K. factory for $6 million")

In [7]:
show_ents(doc) # ---> Tesla is not identified as an organization

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [8]:
from spacy.tokens import Span
from spacy.util import filter_spans

# Look up the ID of the built-in "ORG" label in the vocab's string store.
ORG = doc.vocab.strings["ORG"]

# Tokens [0, 1) cover the single token "Tesla".
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing doc object.
# doc.ents rejects overlapping spans, so re-running this cell would fail on
# the duplicate "Tesla" span. filter_spans removes duplicates and overlaps;
# the new span is listed first so it wins over an identical existing one.
doc.ents = filter_spans([new_ent] + list(doc.ents))

In [9]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [10]:
# Neither mention of "vacuum cleaner" is tagged by the stock NER model.
doc = nlp(
    "Our company plans to introduce a new vacuum cleaner. "
    "If successful, the vacuum cleaner will be our first product."
)
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [11]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [12]:
# Create the desired phrase patterns:
# Create the desired phrase patterns.
# The matcher was built with its default (token text) attribute, which only
# needs tokenization, so nlp.make_doc is used instead of running the whole
# pipeline on every phrase.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [13]:
# Apply the patterns to our matcher object:
# Register both pattern Docs under one rule name.
matcher.add("newproduct", phrase_patterns)

# Run the matcher over the Doc; every hit is a (match_id, start, end) tuple.
matches = matcher(doc)
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [14]:
# Same two sentences as in In [10], parsed again into a fresh Doc.
doc = nlp(
    "Our company plans to introduce a new vacuum cleaner. "
    "If successful, the vacuum cleaner will be our first product."
)
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [15]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [16]:
# Create the desired phrase patterns:
# Create the desired phrase patterns.
# The matcher was built with its default (token text) attribute, which only
# needs tokenization, so nlp.make_doc is used instead of running the whole
# pipeline on every phrase.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [17]:
matcher.add("newproduct",phrase_patterns)

In [18]:
matches = matcher(doc)
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [19]:
from spacy.tokens import Span
from spacy.util import filter_spans

# Look up the ID of the "PRODUCT" label in the vocab's string store.
PROD = doc.vocab.strings["PRODUCT"]

# Each match is a (match_id, start, end) tuple; turn every hit into a
# PRODUCT span over the matched tokens.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

# doc.ents rejects overlapping spans, so re-running this cell (or a match that
# overlaps an existing entity) would raise. filter_spans removes duplicates
# and overlaps; the new spans come first so they win over identical ones.
doc.ents = filter_spans(new_ents + list(doc.ents))

In [20]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


In [21]:
# Sentence with two monetary amounts, one numeric and one spelled out.
doc = nlp("Originally priced at $29.50, the sweater was marked down to five dollars.")
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [22]:
# Number of entities in the Doc that carry the MONEY label.
[ent.label_ for ent in doc.ents].count("MONEY")

2

In [23]:
# Same sentence as before, but with a newline embedded after the comma.
doc = nlp("Originally priced at $29.50,\nthe sweater was marked down to five dollars.")
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [24]:
spacy.__version__

'3.7.4'

In [25]:
# NOTE(review): the text passed to nlp() begins and ends with a literal single
# quote nested inside the double-quoted string, so those quote characters are
# part of the parsed text. Presumably a leftover from re-quoting the earlier
# u'...' literal; confirm whether this is intended.
doc = nlp("'Originally priced at $29.50,\nthe sweater was marked down to five dollars.'")

# Print the entities found in the text that contains the embedded newline.
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [26]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
from spacy.language import Language


@Language.component("remove_whitespace")
def remove_whitespace_entities(doc):
    """Drop every entity whose text consists solely of whitespace.

    Args:
        doc: The Doc being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` reassigned to the entities whose
        text is not whitespace-only.
    """
    doc.ents = [e for e in doc.ents if not e.text.isspace()]
    return doc


# add_pipe raises if a component with this name is already in the pipeline,
# so guard the call to keep this cell safe to re-run.
if "remove_whitespace" not in nlp.pipe_names:
    nlp.add_pipe("remove_whitespace", after="ner")

# Display the component that is now part of the pipeline.
nlp.get_pipe("remove_whitespace")

<function __main__.remove_whitespace_entities(doc)>

The cell above simply demonstrates how to add a custom component to a spaCy pipeline.

The component defined above removes any entities that consist only of whitespace.

In [2]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

# One line per noun chunk: chunk text, root token, root dependency label,
# and the text of the root's head token.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(" - ".join([chunk.text, root.text, root.dep_, root.head.text]))

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufacturers - manufacturers - pobj - toward


In [31]:
# doc.sents is a generator, so len() cannot be applied to it directly.
# The expected error is caught and printed so the notebook still runs
# from top to bottom.
try:
    len(doc.sents)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: object of type 'generator' has no len()

In [32]:
# doc.noun_chunks is a generator, so len() cannot be applied to it directly.
# The expected error is caught and printed so the notebook still runs
# from top to bottom.
try:
    len(doc.noun_chunks)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: object of type 'generator' has no len()

In [8]:
len(doc)

8

In [33]:
len(list(doc.noun_chunks))

3

In [34]:
len(list(doc.sents))

1