In [61]:
import spacy

In [62]:
nlp = spacy.load("en_core_web_sm")

In [63]:
nlp

<spacy.lang.en.English at 0x1f73db3a060>

In [64]:
introduction_doc = nlp(
...     "This tutorial is about Natural Language Processing in spaCy."
... )

In [65]:
type(introduction_doc)

spacy.tokens.doc.Doc

In [66]:
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.']

In [67]:
import pathlib

In [68]:
file_name = "introduction.txt"

In [69]:
import os
file_name = "introduction.txt"
if os.path.exists(file_name):
    print("File exists")
else:
    print("File not found")

File exists


In [70]:
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))

In [71]:
print ([token.text for token in introduction_doc])

[]


In [72]:
about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )

In [73]:
about_doc = nlp(about_text)

In [74]:
sentences = list(about_doc.sents)


In [75]:
len(sentences)

2

In [76]:
for sentence in sentences:print(f"{sentence[:5]}...")

Gus Proto is a Python...
He is interested in learning...


In [82]:
ellipsis_text = (
...     "Gus, can you, ... never mind, I forgot"
...     " what I was saying. So, do you think"
...     " we should ..."
... )

In [78]:
from spacy.language import Language

In [79]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
     """Add support to use `...` as a delimiter for sentence detection"""
     for token in doc[:-1]:
         if token.text == "...":
             doc[token.i + 1].is_sent_start = True
     return doc


In [80]:
custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
     print(sentence)



Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


In [83]:
import spacy
nlp = spacy.load("en_core_web_sm")



In [84]:
about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )


In [85]:
about_doc = nlp(about_text)
for token in about_doc:
     print (token, token.idx)

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [86]:
print(
...     f"{"Text with Whitespace":22}"
...     f"{"Is Alphanumeric?":15}"
...     f"{"Is Punctuation?":18}"
...     f"{"Is Stop Word?"}"
... )

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?


In [92]:
for token in about_doc:
    print(
        f"{str(token.text_with_ws):22}"
        f"{str(token.is_alpha):15}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
    )


Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                     False          True              False
He                    True  

In [93]:
custom_about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London@based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )

In [94]:
print([token.text for token in nlp(custom_about_text)[8:15]])

['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']


In [95]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [102]:
piano_class_text = (
...     "York University is situated in Russia"
...     " I went to Russia on 20th june 2024"
...     " We lived in their campus in Ekertinburk city."
...     " It was very great experience"
...     " we first went to Mubai to fly to Russia"
...     " after that we were having connecting flight from Dubai Saharja Airport."
... )

In [103]:
piano_class_doc = nlp(piano_class_text)

In [106]:
for ent in piano_class_doc.ents:
    print(
        f"""
ent.text = {ent.text}
ent.start_char = {ent.start_char}
ent.end_char = {ent.end_char}
ent.label_ = {ent.label_}
spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}
        """
    )



ent.text = York University
ent.start_char = 0
ent.end_char = 15
ent.label_ = ORG
spacy.explain('ORG') = Companies, agencies, institutions, etc.
        

ent.text = Russia
ent.start_char = 31
ent.end_char = 37
ent.label_ = GPE
spacy.explain('GPE') = Countries, cities, states
        

ent.text = Russia
ent.start_char = 48
ent.end_char = 54
ent.label_ = GPE
spacy.explain('GPE') = Countries, cities, states
        

ent.text = 20th june 2024
ent.start_char = 58
ent.end_char = 72
ent.label_ = DATE
spacy.explain('DATE') = Absolute or relative dates or periods
        

ent.text = Ekertinburk city
ent.start_char = 101
ent.end_char = 117
ent.label_ = GPE
spacy.explain('GPE') = Countries, cities, states
        

ent.text = first
ent.start_char = 151
ent.end_char = 156
ent.label_ = ORDINAL
spacy.explain('ORDINAL') = "first", "second", etc.
        

ent.text = Mubai
ent.start_char = 165
ent.end_char = 170
ent.label_ = GPE
spacy.explain('GPE') = Countries, cities, states
        

ent.text = 

In [107]:
import spacy
from spacy import displacy

In [108]:
from spacy import displacy

displacy.serve(piano_class_doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
