In [2]:
import spacy
from spacy.tokens import Span

# Load the English pipeline package used by the parsing cell below.
MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(MODEL_NAME)

In [3]:
# Run the pipeline on a short two-sentence sample; the resulting `doc`
# is the object inspected by the cells that follow.
sample_text = "The rain is over. He's happy."
doc = nlp(sample_text)

In [4]:
# Parsing is lossless: doc.text gives back the exact string that was passed
# to nlp(), including spacing and punctuation.
doc.text

"The rain is over. He's happy."

In [5]:
# Iterating over the doc yields its tokens in order, so no separate
# tokenization step is needed. Show each token next to its index.
numbered_tokens = [f'{index}:{word}' for index, word in enumerate(doc)]
for line in numbered_tokens:
    print(line)

0:The
1:rain
2:is
3:over
4:.
5:He
6:'s
7:happy
8:.


In [8]:
# Tokens expose linguistic annotations as attributes; here we show the
# part-of-speech label (`pos_`) next to each token's text.
for token in doc:
    pos_label = token.pos_
    print(f'{token}:{pos_label}')

The:DET
rain:NOUN
is:AUX
over:ADV
.:PUNCT
He:PRON
's:AUX
happy:ADJ
.:PUNCT


In [9]:
# Sentence boundaries are already available: doc.sents yields one span per
# sentence. Show each sentence next to its index.
sentence_spans = doc.sents
for i, sentence in enumerate(sentence_spans):
    numbered_sentence = f'{i}:{sentence}'
    print(numbered_sentence)

0:The rain is over.
1:He's happy.


In [10]:
# Custom attributes can be attached to spans through the `._` namespace.

# Declare the `speaker` extension on Span with a default of None.
# force=True overwrites any existing registration, so re-running this cell
# does not fail.
Span.set_extension('speaker', default=None, force=True)

# Tag every sentence span with its speaker.
speaker_name = 'Yu'
for span in doc.sents:
    span._.speaker = speaker_name

# Read the attribute back from a fresh iteration over the sentences.
for sentence in doc.sents:
    who = sentence._.speaker
    print(f'speaker:{who}, sentence:{sentence}')

speaker:Yu, sentence:The rain is over.
speaker:Yu, sentence:He's happy.
