# The purpose of this notebook is to demonstrate the value of modular pipelines and to interact with them

In [1]:
import spacy
from spacy import displacy

# Much of spaCy's functionality is modular: it is built from loadable, configurable modules. This includes languages, models, alternative processors, etc.

# Why do we care about modular text processing?

Legos | Brick Wall
- | - 
![alt](images/legos.jpg) | ![alt](images/brickwall.jpg)

# There are many languages besides English supported by spacy:
## Including German, French, Spanish, and more -- see the full list:
## https://spacy.io/usage/models#languages

# There's also an entire ecosystem of additional models and functionality that plugs into spaCy as modules. Let's look at a few here:
## https://spacy.io/universe

# Let's start playing with NLP legos... First we will set up a default pipeline -- notice that only the model name is passed to load(), with no other arguments...

In [2]:
# Load the en_core_web_sm model with no components disabled (the default pipeline).
default_nlp = spacy.load("en_core_web_sm")

In [3]:
# Each pipeline entry is a (name, component) pair; print them in pipeline order.
for name, component in default_nlp.pipeline:
    print((name, component))

('tagger', <spacy.pipeline.pipes.Tagger object at 0x7f7330e1ce48>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7f7303893828>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f7303893888>)


# Now set up a pipeline where some steps are not enabled

In [4]:
# Same model, but with the parser and tagger components disabled at load time.
simpler_nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])

In [5]:
# Each pipeline entry is a (name, component) pair; print whatever remains enabled.
for name, component in simpler_nlp.pipeline:
    print((name, component))

('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f72fee644c8>)


# Which will be faster?  Let's experiment...

In [6]:
# Two-sentence sample text; it is repeated to build the document sets used in the timing runs.
example_text = "There is slight enlargement of the spleen. No history of a heart murmur."

In [16]:
MAX_DOCUMENTS = 10

# Build the corpus by repeating the same example text MAX_DOCUMENTS times.
document_set = [example_text for _ in range(MAX_DOCUMENTS)]

print('Size of our document set : {}'.format(len(document_set)))

Size of our document set : 10


In [17]:
%%time

# Run the default pipeline over every document. The returned Doc objects are
# discarded because this cell exists only to measure elapsed time.
for text in document_set:
    default_nlp(text)

CPU times: user 179 ms, sys: 9.48 ms, total: 189 ms
Wall time: 186 ms


In [18]:
%%time

# Same loop as the default-pipeline timing, but using the pipeline that was
# loaded with the parser and tagger disabled, so the two timings can be compared.
for text in document_set:
    simpler_nlp(text)

CPU times: user 63.7 ms, sys: 3.75 ms, total: 67.5 ms
Wall time: 65 ms


# Let's pause for a moment and try what we did above but instead of running the pipeline over 10 documents, let's run it over 1000 documents

In [None]:
# Exercise: replace CHANGE_ME with the number of documents to generate
# (the instructions above ask for 1000). CHANGE_ME is not a defined name, so
# until it is replaced this cell raises NameError and larger_document_set is
# never created, which in turn breaks the timing cells that follow.
MAX_DOCUMENTS = CHANGE_ME

larger_document_set = []
for i in range(MAX_DOCUMENTS):
    # add this document in N times...
    larger_document_set.append(example_text)
    
print('Size of our larger document set : {}'.format(len(larger_document_set)))

In [None]:
%%time

# Time the default pipeline on the larger document set. Requires
# larger_document_set to have been built by the exercise cell first.
for text in larger_document_set:
    default_nlp(text)

In [19]:
%%time

# Time the reduced pipeline on the larger document set. A NameError for
# larger_document_set here means the exercise cell that builds it has not
# been completed and run.
for text in larger_document_set:
    simpler_nlp(text)

NameError: name 'larger_document_set' is not defined

# Since the steps of a pipeline are modular, let's change the order

In [20]:
# Short one-sentence input passed to the custom pipelines.
simple_text = u"This is a sentence."

In [21]:
def my_component(doc):
    """Print a short report about ``doc`` and hand it back unchanged.

    The report contains the number of tokens, the ``pos_`` value of every
    token, and an extra remark when the document has fewer than 10 tokens.

    Args:
        doc: A sized, iterable collection of tokens, each exposing ``pos_``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    token_count = len(doc)
    print("After tokenization, this doc has {} tokens.".format(token_count))

    # Tags are collected only after the first line is printed, so the report
    # order stays: count, then tags, then the optional remark.
    pos_tags = [tok.pos_ for tok in doc]
    print("The part-of-speech tags are:", pos_tags)

    is_short = token_count < 10
    if is_short:
        print("This is a pretty short document.")

    return doc

In [22]:
# Load a separate pipeline object to customize.
custom_pipeline = spacy.load("en_core_web_sm")

# Register my_component under the name "print_info" as the final step.
custom_pipeline.add_pipe(my_component, name="print_info", last=True)

# Show the component names in pipeline order.
print(custom_pipeline.pipe_names)

# Running the pipeline on the text also runs my_component, which prints its report.
doc = custom_pipeline(simple_text)

['tagger', 'parser', 'ner', 'print_info']
After tokenization, this doc has 5 tokens.
The part-of-speech tags are: ['DET', 'VERB', 'DET', 'NOUN', 'PUNCT']
This is a pretty short document.


# What happens when we try to print part of speech tags as the first step in the pipeline?  Change the code below to run the "print_info" component as the first component instead of the last

In [31]:
# Load a separate pipeline object to customize.
print_first_pipeline = spacy.load("en_core_web_sm")

# Add the same function twice under different names: once at the very start of
# the pipeline and once at the very end, so the two printed reports can be compared.
print_first_pipeline.add_pipe(my_component, name="print_info", first = True)
print_first_pipeline.add_pipe(my_component, name="print_info_last", last = True)

# Show the component names in pipeline order.
print(print_first_pipeline.pipe_names)

# Running the pipeline prints one report per registered copy of my_component.
doc = print_first_pipeline(simple_text)

['print_info', 'tagger', 'parser', 'ner', 'print_info_last']
After tokenization, this doc has 5 tokens.
The part-of-speech tags are: ['', '', '', '', '']
This is a pretty short document.
After tokenization, this doc has 5 tokens.
The part-of-speech tags are: ['DET', 'VERB', 'DET', 'NOUN', 'PUNCT']
This is a pretty short document.


# Let's look at the components in this pipeline again.  Change the code below so that for each of the pipelines above (custom_pipeline and print_first_pipeline) we write out their steps and the order they are executed

In [33]:
# List every (name, component) entry of print_first_pipeline in pipeline order.
for pipe in print_first_pipeline.pipeline:
    print(pipe)
    
# Exercise: uncomment the loop below to list custom_pipeline's steps as well.
#for pipe in custom_pipeline.pipeline:
#    print(pipe)

('print_info', <function my_component at 0x7f72fc422ae8>)
('tagger', <spacy.pipeline.pipes.Tagger object at 0x7f72f9ad0240>)
('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7f72f9e46fa8>)
('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f72f9fcc048>)
('print_info_last', <function my_component at 0x7f72fc422ae8>)


# How many steps do they each have?  What is the difference between them?