In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


In [3]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
doc = nlp("fb is hiring a new vice president of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# The model didn't recognize "fb" as an entity :(

# Create a span for the new entity
fb_ent = Span(doc, 0, 1, label="ORG")
orig_ents = list(doc.ents)

# Option 1: Modify the provided entity spans, leaving the rest unmodified
doc.set_ents([fb_ent], default="unmodified")

# Option 2: Assign a complete list of ents to doc.ents
doc.ents = orig_ents + [fb_ent]

ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
print('After', ents)
# [('fb', 0, 1, 'ORG')] 

Before []
After [('fb', 0, 1, 'ORG')]


In [4]:
fb_ent = doc.char_span(0, 2, label="ORG")

In [5]:
import numpy
import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents)  # []

header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
attr_array[0, 0] = 3  # B
attr_array[0, 1] = doc.vocab.strings["GPE"]
doc.from_array(header, attr_array)
print("After", doc.ents)  # [London]


Before ()
After (London,)


In [10]:
import spacy
from spacy import displacy

text = "When Ruhi Sharma started working on web development at Google in 2007, few people outside of the company took him seriously."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [12]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x2a578f94d50>

In [14]:
introduction_doc = nlp(
...     "This tutorial is about Natural Language Processing in spaCy."
... )
type(introduction_doc)
spacy.tokens.doc.Doc

[token.text for token in introduction_doc]
['This', 'tutorial', 'is', 'about', 'Natural', 'Language',
'Processing', 'in', 'spaCy', '.']

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.']

In [16]:
file_name = "C:/path_to_your_file/introduction.txt"

In [18]:
file_name = "../folder_name/introduction.txt"  # One directory up


In [19]:
import os
print(os.getcwd())  # This will show the current working directory


c:\Users\HP\AppData\Local\Programs\Microsoft VS Code


In [20]:
from pathlib import Path

file_name = "introduction.txt"
file_path = Path(file_name)

if file_path.exists():
    introduction_doc = nlp(file_path.read_text(encoding="utf-8"))
    print([token.text for token in introduction_doc])
else:
    print(f"File {file_name} does not exist.")


File introduction.txt does not exist.


In [23]:
# Sample text
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

# Processing the text with spaCy
about_doc = nlp(about_text)

# Extracting sentences
sentences = list(about_doc.sents)

# Checking the number of sentences
print(len(sentences))>>> ellipsis_text = (
...     "Gus, can you, ... never mind, I forgot"
...     " what I was saying. So, do you think"
...     " we should ..."
... )

from spacy.language import Language
@Language.component("set_custom_boundaries")
... def set_custom_boundaries(doc):
...     """Add support to use `...` as a delimiter for sentence detection"""
...     for token in doc[:-1]:
...         if token.text == "...":
...             doc[token.i + 1].is_sent_start = True
...     return doc
...

custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
...     print(sentence)
...
Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...

# Printing the first 5 tokens of each sentence
for sentence in sentences:
    print(f"{sentence[:5]}...")

2
Gus Proto is a Python...
He is interested in learning...


In [25]:
# Multi-line string without using ellipses or commas incorrectly
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

# Process text using spaCy
about_doc = nlp(about_text)

# Get sentences
sentences = list(about_doc.sents)

# Check the number of sentences
print(len(sentences))

# Print first 5 tokens of each sentence
for sentence in sentences:
    print(f"{sentence[:5]}...")


2
Gus Proto is a Python...
He is interested in learning...


In [28]:
f"{'Text with Whitespace':<22}"


'Text with Whitespace  '

In [29]:
# Align the text within 22 characters
formatted_text = f"{'Text with Whitespace':<22}"
print(formatted_text)


Text with Whitespace  


In [32]:
custom_about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London@based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )

print([token.text for token in nlp(custom_about_text)[8:15]])
['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']

['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']


['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']

In [40]:
import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Input text about the piano academy
piano_class_text = (
    "Abacuss Academy is situated"
    " in USA or the City of America and has"
    " world-class Abacuss instructors."
)

# Process the text with spaCy to extract named entities
Abacuss_class_doc = nlp(piano_class_text)

# Loop through detected entities and print details
for ent in Abacuss_class_doc.ents:
    print(
        f"""
ent.text = '{ent.text}'
ent.start_char = {ent.start_char}
ent.end_char = {ent.end_char}
ent.label_ = '{ent.label_}'
spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
    )



ent.text = 'Abacuss Academy'
ent.start_char = 0
ent.end_char = 15
ent.label_ = 'PERSON'
spacy.explain('PERSON') = People, including fictional

ent.text = 'USA'
ent.start_char = 31
ent.end_char = 34
ent.label_ = 'GPE'
spacy.explain('GPE') = Countries, cities, states

ent.text = 'the City of America'
ent.start_char = 38
ent.end_char = 57
ent.label_ = 'GPE'
spacy.explain('GPE') = Countries, cities, states

ent.text = 'Abacuss'
ent.start_char = 78
ent.end_char = 85
ent.label_ = 'PERSON'
spacy.explain('PERSON') = People, including fictional


In [42]:
displacy.serve(Abacuss_class_doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [44]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Input text for the survey
survey_text = (
    "Out of 5 people surveyed, James Robert,"
    " Julie Fuller and Benjamin Brooks like"
    " apples. Kelly Cox and Matthew Evans"
    " like oranges."
)

# Function to replace person names with "[REDACTED]"
def replace_person_names(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    return token.text_with_ws

# Function to redact names
def redact_names(nlp_doc):
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_person_names, nlp_doc)
    return "".join(tokens)

# Process the text
survey_doc = nlp(survey_text)

# Redact names and print the result
print(redact_names(survey_doc))


Out of 5 people surveyed, [REDACTED] , [REDACTED] and [REDACTED] like apples. [REDACTED] and [REDACTED] like oranges.
