In [1]:
# Import the spaCy library and the displacy module
from spacy import displacy  # displacy module is used for drawing dependency trees
import spacy

In [2]:
# Load the large English language model 'en_core_web_lg' and assign the
# resulting Language object to the variable 'nlp'
nlp = spacy.load('en_core_web_lg')

# Call the variable as the last expression of the cell to display the object
nlp

<spacy.lang.en.English at 0x25527e94f40>

MODIFYING SPACY PIPELINES

In [3]:
# Examine the 'pipeline' attribute of the Language object to list its components
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x25527f779a0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x25528120ee0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x255281540b0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x255281bcec0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2552816ea80>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x25528154200>)]

This returns a spaCy SimpleFrozenList object, which consists of Python tuples with two items:

1. component names, e.g. tagger, parser, ner, lemmatizer

tagger - signifies whether the word is a noun, adjective, verb, and so on

parser - the dependency parser, which analyses the syntactic structure of a sentence by assigning dependency relations between its words

Named Entity Recognition (NER) - It helps to easily identify the key elements in a text, like names of people, places, brands, monetary values, and more.

Lemmatizer - It gives the base form (lemma) of each word as the output.

2. the actual components that perform different tasks, e.g. spacy.pipeline.tok2vec.Tok2Vec.

Here, tok2vec -> maps Tokens to their numerical representations

and attribute_ruler -> applies user-defined rules to Tokens, such as matches for a given linguistic pattern, and adds this information to the Token as an attribute if requested.

In [4]:
# Load the large English language model again, but exclude named entity
# recognition ('ner') and syntactic dependency parsing ('parser') from the pipeline.
nlp = spacy.load('en_core_web_lg', exclude=['ner', 'parser'])

In [5]:
# Examine the active components under the Language object 'nlp';
# the excluded 'ner' and 'parser' components should no longer be listed
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x255665fc0a0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x255665fc100>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x255679a5f00>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x255679b3200>)]

analyze_pipes() method provides an overview of the pipeline components and their interactions. 
By setting the argument pretty to True, spaCy prints out a table that lists the components and the annotations they produce.

In [6]:
# Analyse the pipeline and store the analysis under 'pipe_analysis'.
# The analyze_pipes() method returns a Python dictionary; with pretty=True
# the analysis is additionally printed as a table.
pipe_analysis = nlp.analyze_pipes(pretty=True)

[1m

#   Component         Assigns       Requires   Scores      Retokenizes
-   ---------------   -----------   --------   ---------   -----------
0   tok2vec           doc.tensor                           False      
                                                                      
1   tagger            token.tag                tag_acc     False      
                                                                      
2   attribute_ruler                                        False      
                                                                      
3   lemmatizer        token.lemma              lemma_acc   False      

✔ No problems found.


In [7]:
# Examine the value stored under the key 'problems'.
# Problem reports are stored in a dictionary under this key; the value is
# accessed by placing the name of the key in brackets [ ].
pipe_analysis['problems']

{'tok2vec': [], 'tagger': [], 'attribute_ruler': [], 'lemmatizer': []}

In this case, the lists are empty, because no problems exist.

We use the assert statement with the len() function and the comparison operator == to check that the length of the list is 0.

If this assertion is not true, that is, if the length of problem_list is more than 0, which would indicate the presence of a problem, Python will raise an AssertionError and stop.

In [8]:
# Loop over the key/value pairs in the 'problems' dictionary. Each key is assigned
# to the variable 'component_name' and each value (a list) to 'problem_list'.
for component_name, problem_list in pipe_analysis['problems'].items():
    
    # Use the assert statement to check that the list of problems is empty;
    # otherwise an AssertionError with the message below is raised.
    assert len(problem_list) == 0, f"There is a problem with {component_name}: {problem_list}!"
    
    # The quotation marks are preceded by the character f, which declares that the string can be formatted.
    # The variable names surrounded by curly braces {} are replaced with the values currently stored under
    # 'component_name' and 'problem_list' when the error message is produced.
    # If no problems are encountered, the loop will pass silently.

PROCESSING TEXTS EFFICIENTLY

In [9]:
# Load the language model once more without excluding any components,
# because the following sections need dependency parsing.
nlp = spacy.load('en_core_web_lg')

# Three example sentences. Each long sentence is written as two adjacent
# string literals, which Python joins into a single string.
sents = [
    ("On October 1, 2009, the Obama administration went ahead with "
     "a Bush administration program, increasing nuclear weapons production."),
    ("The 'Complex Modernization' initiative expanded two existing nuclear sites "
     "to produce new bomb parts."),
    ("The administration built new plutonium pits at the Los Alamos lab in New Mexico "
     "and expanded enriched uranium processing at the Y-12 facility in Oak Ridge, Tennessee."),
]

# Display the list to examine its contents
sents

['On October 1, 2009, the Obama administration went ahead with a Bush administration program, increasing nuclear weapons production.',
 "The 'Complex Modernization' initiative expanded two existing nuclear sites to produce new bomb parts.",
 'The administration built new plutonium pits at the Los Alamos lab in New Mexico and expanded enriched uranium processing at the Y-12 facility in Oak Ridge, Tennessee.']

spaCy Language objects have a specific method, pipe(), for processing texts stored in a Python list.

The pipe() method has been optimised for this purpose, processing texts in batches rather than individually, which makes this method faster than processing each list item separately using a for loop.

The pipe() method takes a list as input and returns a Python generator object.

In [10]:
# Feed the list of sentences to the pipe() method for processing text.
# The result assigned to 'docs' is a generator, not yet a list of Doc objects.
docs = nlp.pipe(sents)

# Call the variable to examine the output
docs

<generator object Language.pipe at 0x000002552A986EB0>

Generators are Python objects that produce other objects one at a time. When iterated over, a generator object will yield these objects one by one.

To retrieve all objects in a generator, we must cast the output into another object type, such as a list.
List basically collects the generator output for examination.

In [11]:
# Cast the pipe generator into a list, which collects the Doc objects it yields
docs = list(docs)

# Call the variable to examine the output
docs

[On October 1, 2009, the Obama administration went ahead with a Bush administration program, increasing nuclear weapons production.,
 The 'Complex Modernization' initiative expanded two existing nuclear sites to produce new bomb parts.,
 The administration built new plutonium pits at the Los Alamos lab in New Mexico and expanded enriched uranium processing at the Y-12 facility in Oak Ridge, Tennessee.]

This gives us a list of spaCy Doc objects for further processing.

In [12]:
# spaCy allows setting custom attributes to Doc, Span and Token objects.
# Import the Doc object from the 'tokens' module in spaCy
from spacy.tokens import Doc  #  A Doc is a sequence of Token objects

In [13]:
# Add two custom attributes, 'age' and 'location', to the Doc object using the
# set_extension() method. Both attributes default to None until a value is assigned.
# force=True overwrites an existing registration of the same name, so this cell
# can be re-run without raising an error about an already-defined extension.
Doc.set_extension("age", default=None, force=True)
Doc.set_extension("location", default=None, force=True)

# The age and location attributes are now added to the Doc object.

In [14]:
# Map a running integer key to a nested dictionary with three keys:
# 'age', 'location' and 'text'.
sents_dict = {
    0: dict(age=23,
            location="Helsinki",
            text="The Senate Square is by far the most important landmark in Helsinki."),
    1: dict(age=35,
            location="Tallinn",
            text="The Old Town, for sure."),
    2: dict(age=58,
            location="Stockholm",
            text="Södermalm is interesting!"),
}

In [15]:
# Process the examples in 'sents_dict' and add the custom attributes to the
# resulting Doc objects.
# Set up a placeholder list to hold the processed texts
docs = []

# Loop over pairs of keys and values in the 'sents_dict' dictionary, which are
# available under the items() method. The variable 'key' holds the integer key
# and 'data' holds the nested dictionary stored under that key.
for key, data in sents_dict.items():
    
    # Retrieve the value under the key 'text' from the nested dictionary,
    # feed it to the language model under 'nlp' and assign the result to 'doc'.
    doc = nlp(data['text'])
    
    # Retrieve the values for 'age' and 'location' from the nested dictionary
    # and assign them to the custom attributes defined for the Doc object.
    # Note that custom attributes reside under a pseudo attribute consisting of
    # an underscore '_'!
    doc._.age = data['age']
    doc._.location = data['location']
    
    # Append the current Doc object under 'doc' to the list 'docs'
    docs.append(doc)
    
# This provides a list of Doc objects, which is assigned under the variable docs.

Loop over the docs list and print out the Doc and its custom attributes

In [16]:
# Loop over each Doc object in the list 'docs', referring to the current item
# using the variable 'doc'
for doc in docs:
    
    # Print each Doc and the 'age' and 'location' attributes
    print(doc, doc._.age, doc._.location)

The Senate Square is by far the most important landmark in Helsinki. 23 Helsinki
The Old Town, for sure. 35 Tallinn
Södermalm is interesting! 58 Stockholm


The custom attributes can be used to filter the data.

One efficient way to filter the data is to use a Python list comprehension.

A list comprehension is like a for loop that is declared on the fly using brackets [], which are used to designate lists in Python.

In [17]:
# Use a list comprehension to filter the Docs for those whose
# 'age' attribute has a value under 40.
# The attribute defaults to None for any Doc that was never assigned an age,
# and comparing None with a number raises a TypeError, so such Docs are
# skipped explicitly instead of crashing the filter.
under_forty = [doc for doc in docs if doc._.age is not None and doc._.age < 40]

# Call the variable to examine the output
under_forty

[The Senate Square is by far the most important landmark in Helsinki.,
 The Old Town, for sure.]

This returns a list with only two Doc objects that fill the designated criteria, that is, their age attribute has a value below 40.

WRITING PROCESSED TEXTS TO DISK

When working with high volumes of texts, we first need to ensure that the pipeline produces the desired results in smaller texts. 

If it works then we should process all the texts and save the result, because processing large volumes of text takes time and resources.

In [18]:
# spaCy provides a special object type named DocBin for storing Doc objects that contain linguistic annotations from spaCy.
# Import the DocBin object from the 'tokens' module in spacy
from spacy.tokens import DocBin

In [19]:
# Initialize a DocBin object and populate it upon creation: the 'docs' argument
# takes a Python generator or list that contains Doc objects.
# In this case, the three Docs stored under the variable 'docs' are added to the DocBin.
docbin = DocBin(docs=docs)

To store the custom attributes of Docs, Spans, or Tokens in the DocBin as well, set the store_user_data argument to True, e.g. DocBin(docs=docs, store_user_data=True).

In [20]:
# Verify that all three Docs made it into the DocBin.
# Get the number of Docs in the DocBin with the built-in len() function,
# which is the idiomatic way to invoke the object's __len__() method.
len(docbin)

3

In [21]:
# Feed a string to the language model under 'nlp' and add the resulting Doc to the DocBin object 'docbin'
docbin.add(nlp("Yet another Doc object."))

# Verify that the Doc was added; the length should now be 4.
# The built-in len() is used instead of calling __len__() directly.
len(docbin)

4

Write the object to a disk for storage.
This can be achieved using the to_disk() method of the DocBin.

In [22]:
# Write the DocBin object to disk
docbin.to_disk(path='‪‪') # The to_disk() method takes a single argument, path, which defines a path to the file in which the DocBin object should be written.

In [23]:
# Initialise a new DocBin object and use the 'from_disk' method to load the data from the disk. Assign the result to the variable 'docbin_loaded'.
docbin_loaded = DocBin().from_disk(path='C:\\Users\\PALAK BHATT\\Downloads\\‪‪')

# Call the variable to examine the output
docbin_loaded

<spacy.tokens._serialize.DocBin at 0x2556aa2d5b0>

In [24]:
# Use the 'get_docs' method to retrieve Doc objects from the DocBin, passing the
# vocabulary under 'nlp.vocab' to reconstruct the data.
# get_docs() yields the Docs through a generator, so cast it into a list for examination.
docs_loaded = list(docbin_loaded.get_docs(nlp.vocab))

# Call the variable to examine the output
docs_loaded

[The Senate Square is by far the most important landmark in Helsinki.,
 The Old Town, for sure.,
 Södermalm is interesting!,
 Yet another Doc object.]

SIMPLIFYING OUTPUT FOR NOUN PHRASES AND NAMED ENTITIES

Print out the noun chunks in each Doc object contained in the list docs.

In [25]:
# spaCy provides access to noun phrases via the attribute noun_chunks of a Doc object.
# Define the outer for-loop over the list 'docs';
# the variable 'doc' refers to the current item in the list
for doc in docs:
    
    # Inner loop: go through each noun chunk in the current Doc object
    for noun_chunk in doc.noun_chunks:
        
        # Print noun chunk
        print(noun_chunk)

The Senate Square
the most important landmark
Helsinki
The Old Town
Södermalm


In [26]:
# For merging noun phrases into a single Token, spaCy provides a function named
# merge_noun_chunks that can be added to the pipeline stored in a Language object
# using the add_pipe method.
# Add the component that merges noun phrases into single Tokens
nlp.add_pipe('merge_noun_chunks')

<function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>

In [27]:
# Verify that the component was added successfully by examining the pipeline
# attribute under the Language object 'nlp'; 'merge_noun_chunks' should be the
# last entry in the list of pipeline components
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2552a93a940>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2556b133c40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2556aa8acf0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x255a7313a40>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x255a731cdc0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2556aa8ad60>),
 ('merge_noun_chunks',
  <function spacy.pipeline.functions.merge_noun_chunks(doc: spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc>)]

In [28]:
# Apply the Language object 'nlp' to the list of sentences under 'sents'
# using pipe(), and cast the resulting generator into a list of Docs
docs = list(nlp.pipe(sents))

# Call the variable to examine the output
docs

[On October 1, 2009, the Obama administration went ahead with a Bush administration program, increasing nuclear weapons production.,
 The 'Complex Modernization' initiative expanded two existing nuclear sites to produce new bomb parts.,
 The administration built new plutonium pits at the Los Alamos lab in New Mexico and expanded enriched uranium processing at the Y-12 facility in Oak Ridge, Tennessee.]

If we loop over the Tokens in the first Doc object in the list, which can be accessed using brackets at position zero, e.g. [0], we can see that the noun phrases are now merged and tagged using the label NOUN.

In [29]:
# Loop over Tokens in the first Doc object in the list (position zero)
for token in docs[0]:
    
    # Print out the Token and its part-of-speech tag, separated by a space
    print(token, token.pos_)

On ADP
October PROPN
1 NUM
, PUNCT
2009 NUM
, PUNCT
the Obama administration NOUN
went VERB
ahead ADV
with ADP
a Bush administration program NOUN
, PUNCT
increasing VERB
nuclear weapons production NOUN
. PUNCT


In [30]:
# Draw the dependency tree (style='dep') of the first Doc object using displacy
displacy.render(docs[0], style='dep')

In [31]:
# spaCy stores noun chunks as Spans, whose start attribute determines the index of
# the Token where the Span starts, while the end attribute determines where the
# Span has ended.
# Loop over the noun chunks in the first Doc object [0] in the list 'docs'
for noun_chunk in docs[0].noun_chunks:
    
    # Print out the noun chunk, its type, the Token index where the chunk starts and where it ends
    print(noun_chunk, type(noun_chunk), noun_chunk.start, noun_chunk.end)

October <class 'spacy.tokens.span.Span'> 1 2
the Obama administration <class 'spacy.tokens.span.Span'> 6 7
a Bush administration program <class 'spacy.tokens.span.Span'> 10 11
nuclear weapons production <class 'spacy.tokens.span.Span'> 13 14


MERGING NAMED ENTITIES

In [32]:
# Remove the 'merge_noun_chunks' function from the pipeline under 'nlp'
nlp.remove_pipe('merge_noun_chunks')

# Process the original sentences again with the component removed
docs = list(nlp.pipe(sents))

In [33]:
# List the pipeline components to verify that 'merge_noun_chunks' was removed
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2552a93a940>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2556b133c40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2556aa8acf0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x255a7313a40>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x255a731cdc0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2556aa8ad60>)]

In [34]:
# Add the 'merge_entities' function to the end of the pipeline under 'nlp'
nlp.add_pipe('merge_entities')

# Process the data again so that the Docs are produced by the updated pipeline
docs = list(nlp.pipe(sents))

In [35]:
# Loop over Tokens in the third Doc object in the list (position two)
for token in docs[2]:
    
    # Print out the Token and its part-of-speech tag, separated by a space
    print(token, token.pos_)

The DET
administration NOUN
built VERB
new ADJ
plutonium NOUN
pits NOUN
at ADP
the DET
Los Alamos PROPN
lab NOUN
in ADP
New Mexico PROPN
and CCONJ
expanded VERB
enriched ADJ
uranium NOUN
processing NOUN
at ADP
the Y-12 DET
facility NOUN
in ADP
Oak Ridge PROPN
, PUNCT
Tennessee PROPN
. PUNCT


Named entities that consist of multiple Tokens, as exemplified by place names such as “Los Alamos” and “New Mexico”, have been merged into single Tokens.