In [1]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp =English()

# Process a text
doc = nlp("This is a sentence.")

# Print the document text
print(doc.text)

This is a sentence.


In [2]:
# Import the English language class and create the nlp object
# from ____ import ____

# nlp = ____

# Process the text
doc =nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
tree_kangaroos =doc[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:-1]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos
tree kangaroos and narwhals


In [3]:
############ to find % ############
# from spacy.lang.en import English

# nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i +1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


## spacy Statistical models

In [4]:
import en_core_web_sm

In [5]:
import spacy

# Load the "en_core_web_sm" model
nlp = en_core_web_sm.load()

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc =nlp(text)
# Print the document text
print(doc.text)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value


In [16]:
# import spacy

# nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12} {token_pos:<10} {token_dep:<10}")

It           PRON       nsubj     
’s           VERB       punct     
official     NOUN       ROOT      
:            PUNCT      punct     
Apple        PROPN      nsubj     
is           AUX        ROOT      
the          DET        det       
first        ADJ        amod      
U.S.         PROPN      nmod      
public       ADJ        amod      
company      NOUN       attr      
to           PART       aux       
reach        VERB       relcl     
a            DET        det       
$            SYM        quantmod  
1            NUM        compound  
trillion     NUM        nummod    
market       NOUN       compound  
value        NOUN       dobj      


In [23]:
# import spacy

# nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc =nlp(text)

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


### using Matcher

In [25]:
# import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = en_core_web_sm.load()
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{'TEXT':'iPhone'},{'TEXT':'X'}]

# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", None, pattern)

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [26]:
# import spacy
# from spacy.matcher import Matcher

# nlp = spacy.load("en_core_web_sm")
# matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": 'iOS'}, {"IS_DIGIT":True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


Write one pattern that only matches forms of “download” (tokens with the lemma “download”), followed by a token with the part-of-speech tag "PROPN" (proper noun).

In [28]:
# import spacy
# from spacy.matcher import Matcher

# nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": 'download'}, {"POS":'PROPN'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 2
Match found: downloaded Fortnite
Match found: downloading Minecraft


Write one pattern that matches adjectives ("ADJ") followed by one or two "NOUN"s (one noun and one optional noun).

In [29]:
# import spacy
# from spacy.matcher import Matcher

# nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Write a pattern for adjective plus one or two nouns
pattern = [{"POS":'ADJ'}, {"POS": 'NOUN'}, {"POS": 'NOUN', "OP": '?'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


# Data structures

### Strings to Hashes

In [30]:
#string to hash  and hash to string
### looking for CAT
from spacy.lang.en import English

nlp = English()
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings['cat']
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

5439657043933447811
cat


In [31]:
#looking for Person
# from spacy.lang.en import English

# nlp = English()
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash =nlp.vocab.strings['PERSON']
print(person_hash)

# Look up the person_hash to get the string
person_string = nlp.vocab.strings[person_hash]
print(person_string)

380
PERSON


### Vocab,hashes and lexemes

In [33]:
from spacy.lang.en import English
from spacy.lang.de import German

# Create an English and German nlp object
nlp = English()
nlp_de = German()

# Get the ID for the string 'Bowie'
bowie_id = nlp.vocab.strings["Bowie"]
print(bowie_id)



2644858412616767388


In [34]:
# Look up the ID for "Bowie" in the vocab
print(nlp_de.vocab.strings[bowie_id])

KeyError: "[E018] Can't retrieve string for hash '2644858412616767388'. This usually refers to an issue with the `Vocab` or `StringStore`."

Hash cant be reversed...

### Docs and Spans

In [36]:
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!


In [37]:
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Go, get started!"
words = ["Go", ",", "get", "started", "!"]
spaces = [False, True, True, False,False]

# Create a Doc from the words and spaces
doc=Doc(nlp.vocab,words=words,spaces=spaces)

print(doc.text)

Go, get started!


In [39]:
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# Desired text: "Oh, really?!"
words = ['Oh', ',', 'really', '?', '!']
spaces = [False,True,False,False,False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab,words,spaces)
print(doc.text)

Oh, really?!


In [40]:
from spacy.lang.en import English

nlp = English()

# Import the Doc and Span classes
from spacy.tokens import Doc,Span

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = Doc(nlp.vocab,words,spaces)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc,2,4,label="PERSON")
print(span.text, span.label_)

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


The code in this example is trying to analyze a text and collect all proper nouns that are followed by a verb.

In [45]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

for token in doc:
    if token.pos_ =='PROPN':
        if doc[token.i+1].pos_=='VERB':
            print(f'Found proper noun : {token.text},before verb: {doc[token.i+1].text}')
            

Found proper noun : Berlin,before verb: looks


# Word Vetors and Semantic Similarities

Similarities works only for medium and large models

In [47]:
#!python -m spacy download en_core_web_md

In [51]:
import en_core_web_md


*simarilities can work for doc and doc, span and doc, token and span*

In [52]:
import spacy

# Load the en_core_web_md model
nlp=en_core_web_md.load()

# Process a text
doc = nlp("Two bananas in pyjamas")

# Get the vector for the token "bananas"
bananas_vector = doc[1].vector
print(bananas_vector)

[-2.2009e-01 -3.0322e-02 -7.9859e-02 -4.6279e-01 -3.8600e-01  3.6962e-01
 -7.7178e-01 -1.1529e-01  3.3601e-02  5.6573e-01 -2.4001e-01  4.1833e-01
  1.5049e-01  3.5621e-01 -2.1508e-01 -4.2743e-01  8.1400e-02  3.3916e-01
  2.1637e-01  1.4792e-01  4.5811e-01  2.0966e-01 -3.5706e-01  2.3800e-01
  2.7971e-02 -8.4538e-01  4.1917e-01 -3.9181e-01  4.0434e-04 -1.0662e+00
  1.4591e-01  1.4643e-03  5.1277e-01  2.6072e-01  8.3785e-02  3.0340e-01
  1.8579e-01  5.9999e-02 -4.0270e-01  5.0888e-01 -1.1358e-01 -2.8854e-01
 -2.7068e-01  1.1017e-02 -2.2217e-01  6.9076e-01  3.6459e-02  3.0394e-01
  5.6989e-02  2.2733e-01 -9.9473e-02  1.5165e-01  1.3540e-01 -2.4965e-01
  9.8078e-01 -8.0492e-01  1.9326e-01  3.1128e-01  5.5390e-02 -4.2423e-01
 -1.4082e-02  1.2708e-01  1.8868e-01  5.9777e-02 -2.2215e-01 -8.3950e-01
  9.1987e-02  1.0180e-01 -3.1299e-01  5.5083e-01 -3.0717e-01  4.4201e-01
  1.2666e-01  3.7643e-01  3.2333e-01  9.5673e-02  2.5083e-01 -6.4049e-02
  4.2143e-01 -1.9375e-01  3.8026e-01  7.0883e-03 -2

In [53]:
bananas_vector.shape

(300,)

In [54]:
# import spacy

# nlp = spacy.load("en_core_web_md")

doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

# Get the similarity of doc1 and doc2
similarity = doc1.similarity(doc2)
print(similarity)

0.8789265574516525


In [55]:
# import spacy

# nlp = spacy.load("en_core_web_md")

doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]

# Get the similarity of the tokens "TV" and "books"
similarity = token1.similarity(token2)
print(similarity)

0.22325331


In [59]:
# import spacy

# nlp = spacy.load("en_core_web_md")

doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 =doc[3:5]
span2 =doc[-4:-1]

# Get the similarity of the spans
similarity = span1.similarity(span2)
print(similarity)

0.75173926


# Combining models and tools

In [74]:
# import spacy
from spacy.matcher import Matcher

nlp = en_core_web_sm.load()
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
pattern1 = [{"LOWER": "amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"LOWER": "ad"},{'TEXT':'-'}, {'LOWER':'free'},{"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add("PATTERN1", None, pattern1)
matcher.add("PATTERN2", None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [90]:
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", None, pattern)

# Operators can specify how often a token should be matched
pattern = [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}]
matcher.add("VERY_HAPPY", None, pattern)

# # Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I loved cats and I'm very very happy")
matches = matcher(doc)
for i,start,end in matches:
    print(doc.vocab.strings[i],doc[start:end])

LOVE_CATS loved cats
VERY_HAPPY very happy
VERY_HAPPY very very happy


In [86]:
import spacy
from spacy.matcher import Matcher

nlp = en_core_web_sm.load()
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world


##### Phrase matching

In [92]:
# import json
# from spacy.lang.en import English

# with open("exercises/en/countries.json") as f:
#     COUNTRIES = json.loads(f.read())

# nlp = English()
# doc = nlp("Czech Republic may help Slovakia protect its airspace")

# # Import the PhraseMatcher and initialize it
# from spacy.matcher import PhraseMatcher

# matcher = PhraseMatcher(nlp.vocab)

# # Create pattern Doc objects and add them to the matcher
# # This is the faster version of: [nlp(country) for country in COUNTRIES]
# patterns = list(nlp.pipe(COUNTRIES))
# matcher.add("COUNTRY", None, *patterns)

# # Call the matcher on the test document and print the result
# matches = matcher(doc)
# print([doc[start:end] for match_id, start, end in matches])

##### Extracting countries and relationships
*Iterate over the matches and create a Span with the label "GPE" (geopolitical entity).
*Overwrite the entities in doc.ents and add the matched span.
*Get the matched span’s root head token.
*Print the text of the head token and the span.

In [94]:
# import spacy
# from spacy.matcher import PhraseMatcher
# from spacy.tokens import Span
# import json

# with open("exercises/en/countries.json") as f:
#     COUNTRIES = json.loads(f.read())
# with open("exercises/en/country_text.txt") as f:
#     TEXT = f.read()

# nlp = spacy.load("en_core_web_sm")
# matcher = PhraseMatcher(nlp.vocab)
# patterns = list(nlp.pipe(COUNTRIES))
# matcher.add("COUNTRY", None, *patterns)

# # Create a doc and reset existing entities
# doc = nlp(TEXT)
# doc.ents = []

# # Iterate over the matches
# for match_id, start, end in matcher(doc):
#     # Create a Span with the label for "GPE"
#     span = Span(doc,start,end,label='GPE')

#     # Overwrite the doc.ents and add the span
#     doc.ents = list(doc.ents) + [span]

#     # Get the span's root head token
#     span_root_head = span.root.head
#     # Print the text of the span root's head token and the span text
#     print(span_root_head.text, "-->", span.text)

# # Print the entities in the document
# print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])

# Processing Pipelines

In [96]:
# import spacy

# Load the en_core_web_sm model
nlp = en_core_web_sm.load()

# Print the names of the pipeline components
print(nlp.pipe_names)

# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)

['tagger', 'parser', 'ner']
[('tagger', <spacy.pipeline.pipes.Tagger object at 0x0000026D19BA4108>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x0000026D01642B28>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x0000026D19AA9648>)]


*The example shows a custom component that prints the number of tokens in a document.*

In [101]:

# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc


# Load the small English model
nlp = en_core_web_sm.load()

# Add the component first in the pipeline and print the pipe names
nlp.add_pipe(length_component,first=True)
print(nlp.pipe_names)

# Process a text
doc =nlp('This is a sentence.')

['length_component', 'tagger', 'parser', 'ner']
This document is 5 tokens long.


###### In this exercise, you’ll be writing a custom component that uses the PhraseMatcher to find animal names in the document and adds the matched spans to the doc.ents. A PhraseMatcher with the animal patterns has already been created as the variable matcher.

In [102]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

In [110]:
nlp=en_core_web_sm.load()
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns=list(nlp.pipe(animals))
print('Animal Patterns:', animal_patterns)
matcher=PhraseMatcher(nlp.vocab)
matcher.add('ANIMAL',None,*animal_patterns)

# Define the custom component
def animal_component(doc):
    matches=matcher(doc)
    spans=[Span(doc,start,end,label='Animal') for match_id,start,end in matches]
    # Overwrite the doc.ents with the matched spans
    print(doc.ents)
    doc.ents=spans
    return doc

# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component,after='ner')
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
  
doc=nlp('I have a cat and a Golden Retriever')
print(doc.ents)
print([(ent.text,ent.label_) for ent in doc.ents])
    
    
    
    
    

Animal Patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
()
(cat, Golden Retriever)
[('cat', 'Animal'), ('Golden Retriever', 'Animal')]


##### Custom extensions

In [113]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Register the Token extension attribute "is_country" with the default value False
Token.set_extension('is_Country', default=False)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_Country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_Country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [119]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]


# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension('reversed', getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


*Complete the get_has_number function .
Use Doc.set_extension to register "has_number" (getter get_has_number) and print its value.*

In [120]:
from spacy.lang.en import English
from spacy.tokens import Doc

In [121]:
nlp=English()
#define the getter fucntion
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension "has_number" with the getter get_has_number
Doc.set_extension('has_number', getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:",doc._.has_number)

has_number: True


*Use Span.set_extension to register "to_html" (method to_html).
Call it on doc[0:2] with the tag "strong".*

In [129]:
from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()

# Define the method
def to_html(span, tag):
    # Wrap the span text in a HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", method=to_html,force=True)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html('Strong'))

<Strong>Hello world</Strong>


*In this exercise, you’ll combine custom extension attributes with the model’s predictions and create an attribute getter that returns a Wikipedia search URL if the span is a person, organization, or location.*

In [134]:
# import spacy
from spacy.tokens import Span

nlp = en_core_web_sm.load()


def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using get getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter=get_wikipedia_url,force=True)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent.label_,ent._.wikipedia_url)

fifty years DATE None
first ORDINAL None
David Bowie PERSON https://en.wikipedia.org/w/index.php?search=David_Bowie


*write a pipeline component that finds country names and a custom extension attribute that returns a country’s capital, if available.*

In [135]:
# import json
# from spacy.lang.en import English
# from spacy.tokens import Span
# from spacy.matcher import PhraseMatcher

# with open("exercises/en/countries.json") as f:
#     COUNTRIES = json.loads(f.read())

# with open("exercises/en/capitals.json") as f:
#     CAPITALS = json.loads(f.read())

# nlp = English()
# matcher = PhraseMatcher(nlp.vocab)
# matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


# def countries_component(doc):
#     # Create an entity Span with the label "GPE" for all matches
#     matches = matcher(doc)
#     doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
#     return doc


# # Add the component to the pipeline
# nlp.add_pipe(countries_component)
# print(nlp.pipe_names)

# # Getter that looks up the span text in the dictionary of country capitals
# get_capital = lambda span: CAPITALS.get(span.text)

# # Register the Span extension attribute "capital" with the getter get_capital
# Span.set_extension("capital", getter=get_capital)

# # Process the text and print the entity text, label and capital attributes
# doc = nlp("Czech Republic may help Slovakia protect its airspace")
# print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

#### Processing Pipeline

In [136]:
# import json
# import spacy

# nlp = spacy.load("en_core_web_sm")

# with open("exercises/en/tweets.json") as f:
#     TEXTS = json.loads(f.read())

# # Process the texts and print the adjectives
# for doc in nlp.pipe(TEXTS):
#     print([token.text for token in doc if token.pos_ == "ADJ"])

In [137]:
# import json
# from spacy.lang.en import English
# from spacy.tokens import Doc

# with open("exercises/en/bookquotes.json") as f:
#     DATA = json.loads(f.read())

# nlp = English()

# # Register the Doc extension "author" (default None)
# Doc.set_extension("author",default=None)

# # Register the Doc extension "book" (default None)
# Doc.set_extension("book",default=None)

# for doc, context in nlp.pipe(DATA,as_tuples=True):
#     # Set the doc._.book and doc._.author attributes from the context
#     doc._.book = context['book']
#     doc._.author = context['author']

#     # Print the text and custom attribute data
#     print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")

#### Selective processing

*In this exercise, you’ll use the nlp.make_doc and nlp.disable_pipes methods to only run selected components when processing a text.*

In [138]:
# import spacy

nlp = en_core_web_sm.load()
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Only tokenize the text
doc = nlp.make_doc(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [139]:
# import spacy

# nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Disable the tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents,)

(Chick-fil-A, American, College Park, Georgia)


# Training a Neural Network Model

#### Creating Training Data

*spaCy’s rule-based Matcher is a great way to quickly create training data for named entity models. A list of sentences is available as the variable TEXTS. You can print it to inspect it. We want to find all mentions of different iPhone models, so we can create training data to teach a model to recognize them as "GADGET".*

In [140]:
# import json
# from spacy.matcher import Matcher
# from spacy.lang.en import English

# with open("exercises/en/iphone.json") as f:
#     TEXTS = json.loads(f.read())

# nlp = English()
# matcher = Matcher(nlp.vocab)

# # Two tokens whose lowercase forms match "iphone" and "x"
# pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# # Token whose lowercase form matches "iphone" and a digit
# pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# # Add patterns to the matcher and check the result
# matcher.add("GADGET", None, pattern1, pattern2)
# for doc in nlp.pipe(TEXTS):
#     print([doc[start:end] for match_id, start, end in matcher(doc)])

*Let’s use the match patterns we’ve created in the previous exercise to bootstrap a set of training examples. A list of sentences is available as the variable TEXTS.*

In [141]:
# import json
# from spacy.matcher import Matcher
# from spacy.lang.en import English

# with open("exercises/en/iphone.json") as f:
#     TEXTS = json.loads(f.read())

# nlp = English()
# matcher = Matcher(nlp.vocab)
# pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
# matcher.add("GADGET", None, pattern1, pattern2)

# TRAINING_DATA = []

# # Create a Doc object for each text in TEXTS
# for doc in nlp.pipe(TEXTS):
#     # Match on the doc and create a list of matched spans
#     spans = [doc[start:end] for match_id, start, end in matcher(doc)]
#     # Get (start character, end character, label) tuples of matches
#     entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
#     print(entities)
#     # Format the matches as a (doc.text, entities) tuple
#     training_example = (doc.text, {"entities": entities})
#     # Append the example to the training data
#     TRAINING_DATA.append(training_example)

# print(*TRAINING_DATA, sep="\n")

*In this exercise, you’ll prepare a spaCy pipeline to train the entity recognizer to recognize "GADGET" entities in a text – for example, “iPhone X”.*

In [143]:
# import spacy
# import random
# import json

# with open("exercises/en/gadgets.json") as f:
#     TRAINING_DATA = json.loads(f.read())

# nlp = spacy.blank("en")
# ner = nlp.create_pipe("ner")
# nlp.add_pipe(ner)
# ner.add_label("GADGET")

# # Start the training
# nlp.begin_training()

# # Loop for 10 iterations
# for itn in range(10):
#     # Shuffle the training data
#     random.shuffle(TRAINING_DATA)
#     losses = {}

#     # Batch the examples and iterate over them
#     for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
#         texts = [text for text, entities in batch]
#         annotations = [entities for text, entities in batch]

#         # Update the model
#         nlp.update(texts, annotations, losses=losses)
#     print(losses)