In [1]:
import spacy
# Load the 'en' model once; `nlp` is the callable used to process the example
# texts in the cells that follow.
nlp = spacy.load('en')

In [2]:
# Three short sentences in a single string, used to look at the default
# sentence splitting.
doc = nlp(u'This is the first sentance. This is another sentance. This is the last sentance.')

In [3]:
# doc.sents yields one item per detected sentence; print each on its own line.
for sent in doc.sents:
    print(sent)

This is the first sentance.
This is another sentance.
This is the last sentance.


In [5]:
# Inspect the type of the first sentence without building the full list.
type(next(iter(doc.sents)))

spacy.tokens.span.Span

In [8]:
# A quotation with a ';' in the middle, followed by an attribution. With the
# default rules it is not split at the semicolon (see the printed sentences
# in the next cell).
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [9]:
# Print every sentence followed by two blank lines so the sentence boundaries
# stand out in the output.
for sent in doc.sents:
    print(sent, end='\n\n\n')

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [13]:
# How to add custom segmentation rules

In [20]:
def set_custom_boundaries(doc):
    """Flag the token that follows each ';' as the start of a new sentence.

    Written to be used as a pipeline component: it receives the document,
    sets ``is_sent_start`` on the relevant tokens in place, and hands the same
    document back.

    Args:
        doc: The document to annotate. It must support ``len()`` and integer
            indexing, and its tokens must expose ``text`` and a writable
            ``is_sent_start``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # The last position is excluded because no token follows it.
    for idx in range(len(doc) - 1):
        if doc[idx].text == ';':
            doc[idx + 1].is_sent_start = True
    return doc

In [22]:
# Register the custom boundary rule ahead of the parser. The component is
# registered under the function's name, so drop any copy left over from an
# earlier run of this cell first: re-running the cell then succeeds and always
# installs the current definition of set_custom_boundaries.
if 'set_custom_boundaries' in nlp.pipe_names:
    nlp.remove_pipe('set_custom_boundaries')
nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [23]:
# Re-process the same quotation now that set_custom_boundaries has been added
# to the pipeline before the parser.
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [24]:
# The quotation is now also split right after the ';'.
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [25]:
# Change segmentation rules.
# Reload the model to start again from a fresh `nlp` object.
# NOTE(review): presumably the reloaded pipeline no longer contains the
# set_custom_boundaries component added earlier; confirm with nlp.pipe_names.
nlp = spacy.load('en')

In [28]:
# Sample text that mixes sentence-ending periods with line breaks: a blank
# line after the second sentence and a single newline inside the third.
mystring = u"This is a sentance. This is another. \n\n This is a \nthird sentance."

In [29]:
# Print the raw string so the embedded line breaks are visible.
print(mystring)

This is a sentance. This is another. 

 This is a 
third sentance.


In [30]:
# Process the sample text before any newline-based segmenter has been added.
doc = nlp(mystring)

In [31]:
# Default segmentation: sentences end at the periods, and the newline inside
# the third sentence does not start a new one.
for sentence in doc.sents:
    print(sentence)

This is a sentance.
This is another. 

 
This is a 
third sentance.


In [33]:
from spacy.pipeline import SentenceSegmenter

In [34]:
def split_on_newlines(doc):
    """Yield sentence spans of ``doc``, breaking after newline tokens.

    A token whose text starts with a newline character ends the current
    sentence. The next sentence begins at the first following token that is
    not whitespace, so a run of consecutive newline/whitespace tokens stays
    attached to the end of the sentence before it instead of producing a
    whitespace-only sentence or a sentence that starts with whitespace.

    Args:
        doc: The document to segment. It must support ``len()`` and slicing,
            and its tokens must expose ``i`` (position in ``doc``), ``text``
            and ``is_space``.

    Yields:
        Consecutive slices ``doc[start:end]`` that together cover every token
        of ``doc``. Nothing is yielded for an empty document.
    """
    start = 0
    seen_newline = False

    for word in doc:
        # Only open a new sentence on real content; whitespace tokens that
        # follow a newline keep the split pending.
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True

    # Emit whatever remains; the guard avoids an empty span for an empty doc.
    if start < len(doc):
        yield doc[start:]

In [35]:
# Wrap split_on_newlines in a SentenceSegmenter built on the pipeline's vocab.
sbd = SentenceSegmenter(nlp.vocab, strategy = split_on_newlines)

In [36]:
# Register the newline-based segmenter as a pipeline component.
nlp.add_pipe(sbd)

In [37]:
# Re-process the sample text now that the segmenter is part of the pipeline.
doc = nlp(mystring)

In [39]:
# Sentences now break after each newline token instead of at the periods.
for sentence in doc.sents:
    print(sentence)

This is a sentance. This is another. 

 
This is a 

third sentance.
