In [58]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [59]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [60]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [61]:
# we cannot access sentences using index form a doc
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [None]:
# putting sentences in a list
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [None]:
# Now you can access individual sentences
list(doc.sents)[1]

This is another sentence.

In [None]:
type(list(doc.sents)[1])

spacy.tokens.span.Span

In [None]:
doc.sents[1].start()

TypeError: 'generator' object is not subscriptable

In [None]:
doc_sents = [sent for sent in doc.sents]
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [None]:
doc_sents[1]

This is another sentence.

In [None]:
# print start and end of spans
doc_sents[1].start, doc_sents[1].end

(6, 11)

In [None]:
# SPACY'S DEFAULT BEHAVIOR
doc = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [None]:
from spacy.language import Language
@Language.component("set_custom_rules")
# ADD A NEW RULE TO THE PIPELINE
def set_custom_rules(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe("set_custom_rules", before="parser")

nlp.pipe_names

ValueError: [E007] 'set_custom_rules' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'set_custom_rrules', 'set_custom_rules', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Re-run the Doc object creation:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker


In [62]:
# change segmentation rule
nlp = spacy.load('en_core_web_sm')

In [64]:
myString = u"This is a sentence. This is another.\n\nThis is a\nthird sentence."
print(myString)

This is a sentence. This is another.

This is a
third sentence.


In [66]:
doc = nlp(myString)
for sent in doc.sents:
    print(sent)

This is a sentence.
This is another.


This is a
third sentence.


In [70]:
# CHANGING THE RULES
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens


sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)

In [71]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_new_lines)

NameError: name 'SentenceSegmenter' is not defined