# The purpose of this notebook is to demonstrate the value of modular pipelines and to show how to interact with them

In [1]:
import spacy
from spacy import displacy

# First we will set up a default pipeline -- notice that nothing other than the model name is passed to load().

In [2]:
# Baseline pipeline: load the en_core_web_sm model without disabling anything.
default_nlp = spacy.load("en_core_web_sm")

In [3]:
# List the entries of the default pipeline, one per line.
for component in default_nlp.pipeline:
    print(component)

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7f922590bef0>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7f921e8cb408>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f921e8cb468>)


# Now set up a pipeline where some steps are not enabled

In [4]:
# Components to leave out when loading the model.
disabled_components = ["parser", "tagger"]
simpler_nlp = spacy.load("en_core_web_sm", disable=disabled_components)

In [5]:
# List the entries that remain in the reduced pipeline, one per line.
for component in simpler_nlp.pipeline:
    print(component)

('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f9219e9e0a8>)


# Which will be faster?  Let's experiment...

In [6]:
# Number of copies of the example text used for the timing comparison.
MAX_DOCUMENTS = 1000

example_text = "There is slight enlargement of the spleen. No history of a heart murmur."

# Repeat the same text MAX_DOCUMENTS times to get a workload of known size.
document_set = [example_text] * MAX_DOCUMENTS

In [7]:
# Sanity check: confirm how many documents the timing runs will process.
document_count = len(document_set)
print('Size of our document set : {}'.format(document_count))

Size of our document set : 1000


In [8]:
%%time

for text in document_set:
    default_nlp(text)

CPU times: user 18.5 s, sys: 1.37 ms, total: 18.5 s
Wall time: 18.5 s


In [9]:
%%time

for text in document_set:
    simpler_nlp(text)

CPU times: user 6.55 s, sys: 1.43 ms, total: 6.55 s
Wall time: 6.54 s


# Since the steps of a pipeline are modular, let's add our own component and change where it runs in the order

In [10]:
# Short sample sentence used to exercise the custom pipeline component.
simple_text = "This is a sentence."

In [11]:
def my_component(doc):
    """Print a short summary of ``doc`` and hand it back unchanged.

    Reports the number of tokens and each token's ``pos_`` value, and adds
    a note when the document has fewer than 10 tokens.

    Args:
        doc: The document being processed. It must support ``len()`` and
            iteration over tokens that expose a ``pos_`` attribute.

    Returns:
        The same ``doc`` object that was passed in.
    """
    token_count = len(doc)
    print("After tokenization, this doc has {} tokens.".format(token_count))

    pos_tags = [token.pos_ for token in doc]
    print("The part-of-speech tags are:", pos_tags)

    if token_count < 10:
        print("This is a pretty short document.")
    return doc

In [12]:
# Load a separate pipeline object for this experiment.
custom_pipeline = spacy.load("en_core_web_sm")

# Attach the reporting component at the end of the pipeline.
custom_pipeline.add_pipe(
    my_component,
    name="print_info",
    last=True,
)

print(custom_pipeline.pipe_names)

# Processing the text runs every component, including my_component's report.
doc = custom_pipeline(simple_text)

['tagger', 'parser', 'ner', 'print_info']
After tokenization, this doc has 5 tokens.
The part-of-speech tags are: ['DET', 'VERB', 'DET', 'NOUN', 'PUNCT']
This is a pretty short document.


# What happens when we try to print part-of-speech tags as the first step in the pipeline? (The tagger has not run yet at that point, so the tags come back empty.)

In [13]:
# Load a separate pipeline object for this experiment.
print_first_pipeline = spacy.load("en_core_web_sm")

# Attach the reporting component at the front, ahead of the model's own components.
print_first_pipeline.add_pipe(
    my_component,
    name="print_info",
    first=True,
)

print(print_first_pipeline.pipe_names)

# Processing the text runs my_component before any other component.
doc = print_first_pipeline(simple_text)

['print_info', 'tagger', 'parser', 'ner']
After tokenization, this doc has 5 tokens.
The part-of-speech tags are: ['', '', '', '', '']
This is a pretty short document.
