# Sentence Segmentation

In [9]:
s1 = 'This is a sentence. This is second sentence. This is last sentence.'
s2 = 'This is a sentence; This is second sentence; This is last sentence.'

In [2]:
import spacy
nlp = spacy.load(name='en_core_web_sm')

In [3]:
doc1 = nlp(s1)

In [5]:
# Identify sentence in paragraph

for sent in doc1.sents:
    print(sent.text)

This is a sentence.
This is second sentence.
This is last sentence.


In [6]:
#Example 2
#got special character .

s3 = 'This is a sentence. This is second U.K. sentence. This is last sentence.'

In [7]:
doc3 = nlp(s3)

In [8]:
for sent in doc3.sents:
    print(sent.text)

This is a sentence.
This is second U.K. sentence.
This is last sentence.


In [10]:
#Example 3 - paragrah with ; to separate sentence
print(s2)
doc2 = nlp(s2)
for sent in doc2.sents:
    print(sent.text)

This is a sentence; This is second sentence; This is last sentence.
This is a sentence; This is second sentence; This is last sentence.


## how to make it recognize semicolon to separate sentence
#### notice that in s2, first 2 sentences end with semicolon, last one end with full stop

### Function: set_custom_boundaries 
#### remove last token
#### whenever encounter ;, the token afterward is the starting point of new sent.


In [42]:
# register a custom pipeline component under a given name
# allows initializing the component by name using Language.add_pipe

from spacy.language import Language

@Language.component('set_custom_boundaries')
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            print(token.i)
            doc[token.i+1].is_sent_start = True
    return doc

Language.component('set_custom_boundaries',func=set_custom_boundaries)

<function __main__.set_custom_boundaries(doc)>

## check current pipeline componenst

#### text > NLP(tokenizer,tagger,parser,ner...) > Doc

In [37]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [38]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1c3efa61d08>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1c3efa61c48>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1c3ee867978>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1c3ee867588>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1c3efc542c8>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1c3efc453c8>)]

In [44]:
# add custom function to pipeline so that before parser, will perform the defined step first
#nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('set_custom_boundaries',before='parser')

<function __main__.set_custom_boundaries(doc)>

In [45]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [46]:
print(s2)
doc2 = nlp(s2)
for sent in doc2.sents:
    print(sent.text)

This is a sentence; This is second sentence; This is last sentence.
4
9
This is a sentence;
This is second sentence;
This is last sentence.
