In [86]:
from IPython.display import Image
from IPython.core.display import HTML 
import random 

In [2]:
# Import the English language class
from spacy.lang.en import English
from spacy.lang.es import Spanish

# object nlp

nlp = English()#  object containing the processing pipelineç

nlp1 = Spanish() 

### The Doct object

The Doc behaves like a normal Python sequence by the way and lets you iterate over its tokens, or get a token by its index.

* It's created automatically when you process a text with the nlp object.


In [3]:
# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

Hello
world
!


### Token objects

Represent the tokens in a document – for example, a word or a punctuation character.

When you call nlp on a string, spaCy first tokenizes the text and creates a document object. 

In [4]:
doc = nlp("Hello world!")

# Index into the Doc to get a single Token
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

world


### The Span object

A Span object is a slice of the document consisting of one or more tokens. It's only a view of the Doc and doesn't contain any data itself.

In [5]:
doc = nlp("Hello world!")

# A slice from the Doc is a Span object
span = doc[1:3]

# Get the span text via the .text attribute
print(span.text)

world!


### Lexical attribute

In [6]:
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


### Statistical models: spaCy’s pre-trained model packages 

In [7]:
!python -m spacy download de_core_news_sm

[+] Download and installation successful
You can now load the model via spacy.load('de_core_news_sm')


In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")

The package enable spaCy to make predictions:
* Binary weights
* Vocabulary
* Meta information (language, pipeline)

####  .pos_ attribute, part-of-speech tag.

In [9]:
# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_) #.pos_ attribute, the predicted part-of-speech tag.

She PRON
ate VERB
the DET
pizza NOUN


####  .dep_ : predicted dependency label

#### .head : syntactic head token: parent token this word is attached to.

In [10]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [11]:
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_head = token.head.text
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}{token_head:<10}")

She         PRON      nsubj     ate       
ate         VERB      ROOT      ate       
the         DET       det       pizza     
pizza       NOUN      dobj      ate       


#### Predicting Named Entities

Named entities are "real world objects" that are assigned a name – for example, a person, an organization or a country

* **doc.ents** property lets you access the named entities predicted

In [12]:
# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


* #### spacy.explain() : get definitions for the most common tags and labels,

In [13]:
spacy.explain("GPE")

'Countries, cities, states'

In [14]:
spacy.explain("MONEY")

'Monetary values, including unit'

### Predicting Named Entities in context

In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X" because is not in the en_core
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


### Rule-based matching

Compared to regular expressions, the matcher works with Doc and Token objects instead of only strings.

* It's also more flexible: you can search for texts but also other lexical attributes.

* You can even write rules that use the model's predictions. For example, find the word "duck" only if it's a verb, not a noun.

#### Match patterns

Match patterns are lists of dictionaries. Each dictionary describes one token. The keys are the names of token attributes, mapped to their expected values.

In [16]:
[{"TEXT": "iPhone"}, {"TEXT": "X"}]

[{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

In [17]:
[{"LOWER": "iphone"}, {"LOWER": "x"}]

[{'LOWER': 'iphone'}, {'LOWER': 'x'}]

In [18]:
import spacy

# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", None, pattern)

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

When you call the matcher on a doc, it returns a list of tuples.

Each tuple consists of three values: the match ID, the start index and the end index of the matched span.

In [19]:
# Call the matcher on the doc
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(match_id,start,end)
    print(matched_span.text)

9528407286733565721 1 3
iPhone X


#### Matching lexical attributes

In [20]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

In [21]:
doc = nlp("2018 FIFA World Cup: France won!")

#### Matching other token attributes

In [22]:
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"}
]

doc = nlp("I loved dogs but now I love cats more.")

In [23]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


#### Using operators and quantifiers

Operators and quantifiers let you define how often a token should be matched. They can be added using the "OP" key.ç

Here, the "?" operator makes the determiner token optional, so it will match a token with the lemma "buy", an optional article and a noun.

In [24]:
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]

doc = nlp("I bought a smartphone. Now I'm buying apps.")

In [25]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


* {"OP": "!"}	Negation: match 0 times
* {"OP": "?"}	Optional: match 0 or 1 times
* {"OP": "+"}	Match 1 or more times
* {"OP": "*"}	Match 0 or more times

In [26]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP":"?"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


## Large-scale data analysus

### Data structures

#### Vocab

* Stores data shared across multiple documents, the Vocab. This includes words, but also the labels schemes for tags and entities.
* To save memory, spaCy encodes all strings to hash values (IDs).
* Strings are only stored once in the StringStore via nlp.vocab.strings
* String store: lookup table in both directions


In [27]:
doc = nlp("I love coffee")
print("hash value:", nlp.vocab.strings["coffee"])
print("string value:", nlp.vocab.strings[3197928453018144401])

hash value: 3197928453018144401
string value: coffee


In [28]:
doc = nlp("I love coffee")
print("hash value:", doc.vocab.strings["coffee"])

hash value: 3197928453018144401


In [29]:
from spacy.lang.en import English

nlp = English()
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

5439657043933447811
cat


#### Lexeme

* Contains the context-independent information about a word.
* You can get a lexeme by looking up a string or a hash ID in the vocab.
* Word text: lexeme.text and lexeme.orth (the hash).
* Lexemes expose attributes, just like tokens.
* Lexemes don't have part-of-speech tags, dependencies or entity labels. Those depend on the context.

In [30]:
doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


In [31]:
Image(url= "https://course.spacy.io/vocab_stringstore.png")

#### Doc

In [32]:
Image(url= "https://course.spacy.io/span_indices.png")

You can create a Doc manually.

* The Doc class takes three arguments: the shared vocab, the words and the spaces.

*  The spaces are a list of boolean values indicating whether the word is followed by a space. 

In [33]:
# Create an nlp object
from spacy.lang.en import English
nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)

#### Span

A Span is a slice of a doc consisting of one or more tokens. 

* The Span takes at least three arguments: the doc it refers to, and the start and end index of the span. 
    
* Remember that the end index is exclusive.

* doc.ents are writable, so we can add entities manually by overwriting it with a list of spans.

In [34]:
from spacy.lang.en import English

nlp = English()

# Import the Doc and Span classes
from spacy.tokens import Doc, Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


### Tips

* The Doc and Span are very powerful and optimized for performance. They give you access to all references and relationships of the words and sentences.

* If your application needs to output strings, make sure to convert the doc as late as possible. If you do it too early, you'll lose all relationships between the tokens.

* To keep things consistent, try to use built-in token attributes wherever possible. For example, token.i for the token index.

* Don't forget to always pass in the shared vocab.

#### Example: not using native token attributes

In [35]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin is a nice city")

# Get all tokens and part-of-speech tags
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]

for index, pos in enumerate(pos_tags):
    # Check if the current token is a proper noun
    if pos == "PROPN":
        # Check if the next token is a verb
        if pos_tags[index + 1] == "VERB":
            result = token_texts[index]
            print("Found proper noun before a verb:", result)

#### Using native token attributes

In [36]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin is a nice city")

# Iterate over the tokens
for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == "PROPN":
        # Check if the next token is a verb
        if token.i + 1 < len(doc):
            if doc[token.i + 1].pos_ == "VERB":
                print("Found proper noun before a verb:", token.text)

### Word vectors and semantic similarity

* spaCy can compare two objects and predict how similar they are – for example, documents, spans or single tokens.

* The Doc, Token and Span objects have a .similarity method that takes another object and returns a floating point number between 0 and 1, indicating how similar they are.

* In order to use similarity, you need a larger spaCy model that has word vectors included. So if you want to use vectors, always go with a model that ends in "md" or "lg": **NOT en_core_web_sm (small model)**

#### Similarity:

* Similarity is determined using word vectors, Multi-dimensional meaning representations of words.
* Generated using an algorithm like Word2Vec and lots of text.
* Vectors can be added to spaCy's statistical models.
* By default, the similarity returned by spaCy is the cosine similarity between two vectors – but this can be adjusted if necessary.
* Vectors for objects consisting of several tokens, like the Doc and Span, default to the average of their token vectors.
* Short phrases are better than long documents with many irrelevant words

* **Useful for many applications: recommendation systems, flagging duplicates etc.**
* **There's no objective definition of "similarity: It always depends on the context and what your application needs to do.**

In [37]:
# Load a larger model with vectors
nlp = spacy.load("en_core_web_md")

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

0.8627204117787385


In [38]:
# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

0.73695457


In [39]:
# Compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]

print(doc.similarity(token))

0.32531983166759537


In [40]:
# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.6199091710787739


#### Words Vectors

* First, we load the medium model again, which ships with word vectors.
* Next, we can process a text and look up a token's vector using the .vector attribute.

In [41]:
# Load a larger model with vectors
nlp = spacy.load("en_core_web_md")

doc = nlp("I have a banana")
# Access the vector via the token.vector attribute
print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

### Combining statistical models with rule-based systems

* Statistical models are useful if your application needs to be able to generalize based on a few examples.

* Example: Instead of providing a list of all person names ever, your application will be able to predict whether a span of tokens is a person name.

* To do this, you would use spaCy's entity recognizer, dependency parser or part-of-speech tagger.


* **Rule-based approaches on the other hand come in handy if there's a more or less finite number of instances you want to find**. For example, all countries or cities of the world, drug names or even dog breeds.ç

* In spaCy, you can achieve this with **custom tokenization rules**, as well as the matcher and phrase matcher.

#### Matcher recap

In [42]:
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", None, pattern)

# Operators can specify how often a token should be matched
pattern = [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}]
matcher.add("VERY_HAPPY", None, pattern)

# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

In [43]:
matches

[(9137535031263442622, 1, 3),
 (2447047934687575526, 7, 9),
 (2447047934687575526, 6, 9)]

In [44]:
matcher = Matcher(nlp.vocab)
matcher.add("DOG", None, [{"LOWER": "golden"}, {"LOWER": "retriever"}])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    # Get the span's root token and root head token
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    # Get the previous token and its POS tag
    print("Previous token:", doc[start - 1].text, doc[start - 1].pos_)
    
    print(doc.vocab.strings[match_id], doc[start:end].text)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET
DOG Golden Retriever


#### Phrase matcher

* It performs a keyword search on the document, but instead of only finding strings, it gives you direct access to the tokens in context.

* It takes Doc objects as patterns.

* This makes it very useful for matching large dictionaries and word lists on large volumes of text.


In [45]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever")
matcher.add("DOG", None, pattern) #Instead of a list of dictionaries, we pass in a Doc object as the pattern.
doc = nlp("I have a Golden Retriever")

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Get the matched span
    span = doc[start:end]
    print("Matched span:", span.text)

Matched span: Golden Retriever


## Processing Pipelines

In [46]:
Image(url= "https://course.spacy.io/pipeline.png")

First, the tokenizer is applied to turn the string of text into a Doc object. Next, a series of pipeline components is applied to the doc in order. In this case, the tagger, then the parser, then the entity recognizer. Finally, the processed doc is returned, so you can work with it.

* The **part-of-speech** tagger sets the **token.tag** and **token.pos** attributes.
* The **dependency parser** adds the **token.dep and token.head** attributes and is also responsible for detecting sentences and base noun phrases, also known as noun chunks.
* The **named entity recognizer** adds the **detected entities to the doc.ents property**. It also sets entity type attributes on the tokens that indicate if a token is part of an entity or not.
* The **text classifier sets category labels** that apply to the whole text, and **adds them to the doc.cats property**.

### Pipeline attributes

To see the names of the pipeline components present in the current nlp object, you can use the nlp.pipe_names attribute.

In [47]:
print(nlp.pipe_names) #nlp.pipe_names: list of pipeline component names

['tagger', 'parser', 'ner']


In [48]:
print(nlp.pipeline) #nlp.pipeline: list of (name, component) tuples

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x0000020D841117C8>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x0000020D80296348>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000020D802968E8>)]


### Custom pipeline components

After the text is tokenized and a Doc object has been created, pipeline components are applied in order. spaCy supports a range of built-in components, but also lets you define your own.

Custom components are executed automatically when you call the nlp object on a text.

Fundamentally, a pipeline component is a function or callable that takes a doc, modifies it and returns it, so it can be processed by the next component in the pipeline.

In [49]:
def custom_component(doc):
    # Do something to the doc here
    return doc

nlp.add_pipe(custom_component) #Components can be added to the pipeline using the nlp.add_pipe method.

To specify where to add the component in the pipeline, you can use the following keyword arguments:

* Setting last to True will add the component last in the pipeline. This is the default behavior.

* Setting first to True will add the component first in the pipeline, right after the tokenizer.

* The before and after arguments let you define the name of an existing component to add the new component before or after. For example, before="ner" will add it before the named entity recognizer.

* The other component to add the new component before or after needs to exist, though – otherwise, spaCy will raise an error.

In [50]:
# Create the nlp object
nlp = spacy.load("en_core_web_sm")

# Define a custom component
def custom_component(doc):
    # Print the doc's length
    print("Doc length:", len(doc))
    # Return the doc object
    return doc

# Add the component first in the pipeline
nlp.add_pipe(custom_component, first=True)

# Print the pipeline component names
print("Pipeline:", nlp.pipe_names)

# Process a text
doc = nlp("Hello world!")

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']
Doc length: 3


In [51]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


### Extension attributes

Custom attributes let you add any metadata to docs, tokens and spans. The data can be added once, or it can be computed dynamically.

* **Custom attributes are available via the ._ (dot underscore) property**. This makes it clear that they were added by the user, and not built into spaCy, like token.text.

* Attributes need to be registered on the global Doc, Token and Span classes you can import from spacy.tokens. To **register a custom attribute on the Doc, Token and Span**, you can use the **set_extension method**.

In [52]:
# Import global classes
from spacy.tokens import Doc, Token, Span

# Set extensions on the Doc, Token and Span
Doc.set_extension("title", default=None)
Token.set_extension("is_color", default=False)
Span.set_extension("has_color", default=False)

* **Attribute extensions**: set a default value that can be overwritten.

In [53]:
from spacy.tokens import Token

# Set extension on the Token with default value
Token.set_extension("is_color", default=False, force = True)

doc = nlp("The sky is blue.")

# Overwrite extension attribute value
doc[3]._.is_color = True

In [54]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Register the Token extension attribute "is_country" with the default value False
Token.set_extension("is_country", default=False)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


* **Property extensions**: hey can define a getter function and an optional setter.

* The getter function is only called when you retrieve the attribute. 

* Getter functions take one argument: the object, in this case, the token. In this example, the function returns whether the token text is in our list of colors.


In [55]:
from spacy.tokens import Token

# Define getter function
def get_is_color(token):
    colors = ["red", "yellow", "blue"]
    return token.text in colors

# Set extension on the Token with getter
Token.set_extension("is_color", getter=get_is_color, force = True)

doc = nlp("The sky is blue.")
print(doc[3]._.is_color, "-", doc[3].text)

True - blue


In [56]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]


# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


If you want to set extension attributes on a span, you almost always want to use a property extension with a getter. Otherwise, you'd have to update every possible span ever by hand to set all the values.

In [57]:
from spacy.tokens import Span

# Define getter function
def get_has_color(span):
    colors = ["red", "yellow", "blue"]
    return any(token.text in colors for token in span)

# Set extension on the Span with getter
Span.set_extension("has_color", getter=get_has_color, force = True)

doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, "-", doc[1:4].text)
print(doc[0:2]._.has_color, "-", doc[0:2].text)

True - sky is blue
False - The sky


In this example, the get_has_color function takes the span and returns whether the text of any of the tokens is in the list of colors.

After we've processed the doc, we can check different slices of the doc and the custom ._.has_color property returns whether the span contains a color token or not.

* **Method extensions**:  make the extension attribute a callable method.

* You can then pass one or more arguments to it, and compute attribute values dynamically – for example, based on a certain argument or setting.

In [58]:
from spacy.tokens import Doc

# Define method with arguments
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

# Set extension on the Doc with method
Doc.set_extension("has_token", method=has_token, force = True)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")

True - blue
False - cloud


In this example, the method function checks whether the doc contains a token with a given text. The first argument of the method is always the object itself – in this case, the doc. It's passed in automatically when the method is called. All other function arguments will be arguments on the method extension. In this case, token_text.

In [59]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using get getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

fifty years None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


### Components with extensions

 In this exercise, a pipeline component finds country names and a custom extension attribute that returns a country’s capital, if available.

In [None]:
import json
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/en/countries.json") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/en/capitals.json") as f:
    CAPITALS = json.loads(f.read())

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with the getter get_capital
Span.set_extension("capital", getter = get_capital)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

### Scaling and performance

* If you need to process a lot of texts and create a lot of Doc objects in a row, the nlp.pipe method can speed this up significantly.


In [60]:
# BAD: docs = [nlp(text) for text in LOTS_OF_TEXTS]

docs = list(nlp.pipe(LOTS_OF_TEXTS)) #Good

NameError: name 'LOTS_OF_TEXTS' is not defined

In [71]:
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/en/tweets.json") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == "ADJ"])

FileNotFoundError: [Errno 2] No such file or directory: 'exercises/en/tweets.json'

* Setting as_tuples=True on nlp.pipe lets you pass in (text, context) tuples
* Yields (doc, context) tuples
* Useful for associating metadata with the doc

In [63]:
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])

This is a text 15
And another text 16


In [65]:
from spacy.tokens import Doc

Doc.set_extension("id", default=None, force = True)
Doc.set_extension("page_number", default=None, force = True)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]

In [75]:
import json
from spacy.lang.en import English
from spacy.tokens import Doc

with open("exercises/en/bookquotes.json") as f:
    DATA = json.loads(f.read())

nlp = English()

# Register the Doc extension "author" (default None)
Doc.set_extension("author", default=None)

# Register the Doc extension "book" (default None)
Doc.set_extension("book", default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context["book"]
    doc._.author = context["author"]

    # Print the text and custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")

FileNotFoundError: [Errno 2] No such file or directory: 'exercises/en/bookquotes.json'

* Another common scenario: Sometimes you already have a model loaded to do other processing, but you only need the tokenizer for one particular text.

* If you only need a tokenized Doc object, you can use the nlp.make_doc method instead, which takes a text and returns a doc.

* nlp.make_doc turns the text into a doc before the pipeline components are called.

In [68]:
doc = nlp.make_doc("Hello world!")

* spaCy also allows you to temporarily disable pipeline components using the nlp.disable_pipes context manager.
* Restores them after the with block
* Only runs the remaining components

In [69]:
# Disable tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)

(Apple,)


## Training a neural network model

### Training and updating models

#### Why update a  model?

* Better results on your specific domain
* Learn classification schemes specifically for your problem
* Essential for text classification
* Very useful for named entity recognition
* Less critical for part-of-speech tagging and dependency parsing

#### How to train 

* 1.Initialize the model weights randomly with nlp.begin_training
* 2.Predict a few examples with the current weights by calling nlp.update
* 3.Compare prediction with true labels
* 4.Calculate how to change weights to improve predictions
* 5.Update weights slightly
* 6.Go back to 2.

In [76]:
Image(url= "https://course.spacy.io/training.png")

* Training data: Examples and their annotations.
* Text: The input text the model should predict a label for.
* Label: The label the model should predict.
* Gradient: How to change the weights.

#### Training data

* Examples of what we want the model to predict in context
* Update an existing model: a few hundred to a few thousand examples
* Train a new category: a few thousand to a million examples
* spaCy's English models: 2 million words
* Usually created manually by human annotators
* Can be semi-automated – for example, using spaCy's Matcher!

In [81]:
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

TEXTS = [
  "How to preorder the iPhone X",
  "iPhone X is coming",
  "Should I pay $1,000 for the iPhone X?",
  "The iPhone 8 reviews are here",
  "iPhone 11 vs iPhone 8: What's the difference?",
  "I need a new phone! Any tips?"
]

nlp = English()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{"LOWER":  "iphone"}, {"IS_DIGIT": True}]

# Add patterns to the matcher and check the result
matcher.add("GADGET", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])

[iPhone X]
[iPhone X]
[iPhone X]
[iPhone 8]
[iPhone 11, iPhone 8]
[]


In [83]:
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})
("iPhone 11 vs iPhone 8: What's the difference?", {'entities': [(0, 9, 'GADGET'), (13, 21, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})


### The Training loop

#### The steps of a training loop
* Loop for a number of times.
* Shuffle the training data.
* Divide the data into batches.
* Update the model for each batch.
* Save the updated model.

In [84]:
TRAINING_DATA = [
    ("How to preorder the iPhone X", {"entities": [(20, 28, "GADGET")]})
    # And many more examples...
]

In [89]:
# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
#nlp.to_disk(path_to_model)

#### Updating an existing model
* Improve the predictions on new data
* Especially useful to improve existing categories, like "PERSON"
* Also possible to add new categories
* Be careful and make sure the model doesn't "forget" the old ones

In [92]:
# Start with blank English model
nlp = spacy.blank("en")
# Create blank entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
# Add a new label
ner.add_label("GADGET")

# Start the training
nlp.begin_training()
# Train for 10 iterations
for itn in range(10):
    random.shuffle(TRAINING_DATA)
    # Divide examples into batches
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

In [94]:
import spacy
import random


TRAINING_DATA = [
    ["How to preorder the iPhone X", { "entities": [[20, 28, "GADGET"]] }],
    ["iPhone X is coming", { "entities": [[0, 8, "GADGET"]] }],
    ["Should I pay $1,000 for the iPhone X?", { "entities": [[28, 36, "GADGET"]] }],
    ["The iPhone 8 reviews are here", { "entities": [[4, 12, "GADGET"]] }],
    ["Your iPhone goes up to 11 today", { "entities": [[5, 11, "GADGET"]] }],
    ["I need a new phone! Any tips?", { "entities": [] }]
]

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)

{'ner': 33.439899265766144}
{'ner': 21.628397047519684}
{'ner': 8.306586449965835}
{'ner': 9.809614906203933}
{'ner': 7.671824979726807}
{'ner': 6.110952659400027}
{'ner': 2.7680821026052627}
{'ner': 1.0693252180375623}
{'ner': 2.3070501229153564}
{'ner': 2.01578828188625}
