In [1]:
import spacy
from IPython.display import display
import json



In [None]:
#2
nlp = spacy.blank("en")
doc = nlp("This is a sentence.")
print(doc.text)

This is a sentence.


In [23]:
#3
nlp = spacy.blank("en")
doc = nlp("I like tree kangaroos and narwhals.")
first_token = doc[0]
print(first_token.text)

tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

I
tree kangaroos
tree kangaroos and narwhals


In [25]:
#4
nlp = spacy.blank("en")
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are.")

for token in doc:
    if token.like_num:
        next_token = doc[token.i + 1]
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


In [32]:
#7-8
nlp = spacy.load("en_core_web_sm")
text = "It's official: Apple is the first U.S. public company to reach a $1 trillion market value"
doc = nlp(text)

for token in doc:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

print("\n")

for ent in doc.ents:
    print(ent.text, ent.label_)

spacy.explain("GPE")

It          PRON      nsubj     
's          AUX       ccomp     
official    ADJ       acomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


'Countries, cities, states'

In [None]:
#9
nlp = spacy.load("en_core_web_sm")
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

iphone_x = doc[1:3]
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


In [None]:
#10
nlp = spacy.load("en_core_web_sm")
matcher = spacy.matcher.Matcher(nlp.vocab)
pattarn = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattarn])

doc = nlp("Upcoming iPhone X release date leaked")

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

print("")

pattern = [{"IS_DIGIT": True}, {"LOWER": "fifa"}, {"LOWER": "world"}, {"LOWER": "cup"}, {"IS_PUNCT": True}]
matcher.add("FIFA_PATTERN", [pattern])

doc = nlp("2018 FIFA World Cup: France won!")

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

print("")

pattern = [{"LEMMA": "love", "POS": "VERB"}, {"POS": "NOUN"}]
matcher.add("LV_PATTERN", [pattern])

doc = nlp("I loved dogs but now I love cats more.")

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

print("")

pattern = [{"LEMMA": "buy"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]
matcher.add("BUY_PATTERN", [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps.")

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

iPhone X

2018 FIFA World Cup:

loved dogs
love cats

bought a smartphone
buying apps


In [51]:
#11
nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

matcher = spacy.matcher.Matcher(nlp.vocab)

pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

matches = matcher(doc)

print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [56]:
#12
nlp = spacy.load("en_core_web_sm")
matcher = spacy.matcher.Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

matcher.add("IOS_VERSION_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

print("\nPart 2\n")

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
spacy.explain("PROPN")

matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

print("\nPart 3\n")

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "*"}]

matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10

Part 2

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip

Part 3

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


In [None]:
#Chapter 2
#1
nlp.vocab.strings.add("coffee")
coffee_hash = nlp.vocab.strings["coffee"]
coffee_string = nlp.vocab.strings[coffee_hash]

display(nlp.vocab.strings[3197928453018144401])
nlp.vocab.strings["coffee"]

lexeme = nlp.vocab["coffee"]
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

'coffee'

coffee 3197928453018144401 True


In [None]:
#2
nlp = spacy.blank("en")
doc = nlp("I have a cat")

cat_hash = nlp.vocab.strings["cat"]
display(cat_hash)
display(nlp.vocab.strings[cat_hash])

print("\nPart 2\n")

doc = nlp("David Bowie is a PERSON")

person_hash = nlp.vocab.strings["PERSON"]
display(person_hash)
nlp.vocab.strings[person_hash]

5439657043933447811

'cat'


Part 2



380

'PERSON'

In [None]:
#4
nlp = spacy.blank("en")

words = ["Hello", "world", "!"]
spaces = [True, False, False]

doc = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)

display(doc)

span = spacy.tokens.Span(doc, 0, 2)

span_with_label = spacy.tokens.Span(doc, 0, 2, label="GREETING")

doc.ents = [span_with_label]

display(doc.ents)

Hello world!

(Hello world,)

In [None]:
#5
nlp = spacy.blank("en")

words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

doc = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

print("\nPart 2\n")

words = ["Go", ",", "get", "started", "!"]
spaces = [False, True, True, False, False]

doc = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

print("\nPart 3\n")

words = ["Oh", ",", "really", "?"]
spaces = [False, True, False, False]

doc = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

spaCy is cool!

Part 2

Go, get started!

Part 3

Oh, really?


In [76]:
#6
nlp = spacy.blank("en")

words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

doc = spacy.tokens.Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

span = spacy.tokens.Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

doc.ents = [span]

print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie PERSON
[('David Bowie', 'PERSON')]


In [78]:
#7
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

for token in doc:
    if token.pos_ == "PROPN":
        if doc[token.i + 1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


In [None]:
#8
nlp = spacy.load("en_core_web_md")

doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

doc = nlp("I like pizza")
token = nlp("soap")[0]

print(doc.similarity(token))

span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

print(doc[2].vector)

doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

print(doc1.similarity(doc2))

0.8382381200790405
1.0
0.2274085134267807
0.5528545379638672
[-9.9847e-01  1.5255e-01  9.0905e-01  5.0226e-01  3.9668e-01  1.1185e+00
 -6.8643e-02 -2.0645e-01  1.6348e-01  1.4820e+00 -3.7295e-01  7.4472e-01
  1.3312e-01 -3.9140e-01  3.8056e-01 -6.2452e-01  5.6385e-01  9.2489e-01
  1.5690e-02  4.6075e-01 -2.4564e-01  4.2107e-01  4.3966e-02 -9.3159e-02
 -3.7071e-01 -4.8146e-01 -6.4922e-01  3.9712e-01 -2.0001e-01 -7.6328e-01
 -1.5941e-01  6.8756e-01  7.0181e-01 -7.5998e-01  2.7529e-01  4.6337e-02
 -1.5418e-02  1.8180e-01  4.5510e-01  1.0663e+00 -2.9151e-01 -2.1489e-01
  1.2203e-01  4.2934e-02  8.3894e-02  4.4703e-01 -1.0444e-01  1.1424e-01
 -4.7985e-01  5.6945e-01 -1.6874e-01  2.2438e-01  7.0262e-02 -3.7424e-01
  3.7648e-01  1.1642e-01  1.9766e-01  3.0820e-01 -2.2964e-01  2.1171e-01
  3.1852e-01 -1.0459e-02  4.6802e-01  3.5646e-01  2.2602e-01 -7.3830e-01
  3.4500e-01 -1.2327e-01 -1.5291e-01  3.0218e-01  5.3695e-02  7.8704e-01
  6.0712e-02  4.7265e-01 -7.4620e-01 -1.9323e-01  3.0870e-01  3

In [88]:
#9
nlp = spacy.load("en_core_web_md")

doc = nlp("Two bananas in pyjamas")

bananas_vector = doc[1].vector

print(bananas_vector)

[-0.6334     0.18981   -0.53544   -0.52658   -0.30001    0.30559
 -0.49303    0.14636    0.012273   0.96802    0.0040354  0.25234
 -0.29864   -0.014646  -0.24905   -0.67125   -0.053366   0.59426
 -0.068034   0.10315    0.66759    0.024617  -0.37548    0.52557
  0.054449  -0.36748   -0.28013    0.090898  -0.025687  -0.5947
 -0.24269    0.28603    0.686      0.29737    0.30422    0.69032
  0.042784   0.023701  -0.57165    0.70581   -0.20813   -0.03204
 -0.12494   -0.42933    0.31271    0.30352    0.09421   -0.15493
  0.071356   0.15022   -0.41792    0.066394  -0.034546  -0.45772
  0.57177   -0.82755   -0.27885    0.71801   -0.12425    0.18551
  0.41342   -0.53997    0.55864   -0.015805  -0.1074    -0.29981
 -0.17271    0.27066    0.043996   0.60107   -0.353      0.6831
  0.20703    0.12068    0.24852   -0.15605    0.25812    0.007004
 -0.10741   -0.097053   0.085628   0.096307   0.20857   -0.23338
 -0.077905  -0.030906   1.0494     0.55368   -0.10703    0.052234
  0.43407   -0.13926    0

In [92]:
#10
nlp = spacy.load("en_core_web_md")

doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

similarity = doc1.similarity(doc2)
print(similarity)

print("\nPart 2\n")

doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]

similarity = token1.similarity(token2)
print(similarity)

print("\nPart 3\n")

doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

span1 = doc[3:5]
span2 = doc[12:15]

similarity = span1.similarity(span2)
print(similarity)

0.8456854224205017

Part 2

0.18317238986492157

Part 3

0.7541285157203674


In [99]:
#11
matcher = spacy.matcher.Matcher(nlp.vocab)

pattern = [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}]
matcher.add("LOVE_CATS", [pattern])

pattern = [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}]
matcher.add("VERY_HAPPY", [pattern])

doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

matcher = spacy.matcher.Matcher(nlp.vocab)
matcher.add("DOG", [[{"LOWER": "golden"}, {"LOWER": "retriever"}]])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)
    print("Root token:", span.root.text)
    print("Root head token:", span.root.head.text)
    print("Previous token:", doc[start - 1].text, doc[start - 1].pos_)

matcher = spacy.matcher.PhraseMatcher(nlp.vocab)
pattern = nlp("Golden Retriever")
matcher.add("DOG", [pattern])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Matched span:", span.text)

Match found: love cats
Match found: very happy
Match found: very very happy
Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET
Matched span: Golden Retriever


In [11]:
#13
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Create the match patterns
pattern1 = [{"TEXT": "Amazon"}, {"IS_TITLE": True, "POS": "PROPN"}]
pattern2 = [{"TEXT": "ad"}, {"TEXT": "-"}, {"TEXT": "free"}, {"POS": "NOUN"}]

# Initialize the Matcher and add the patterns
matcher = spacy.matcher.Matcher(nlp.vocab)
matcher.add("PATTERN1", [pattern1])
matcher.add("PATTERN2", [pattern2])

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [9]:
#14
with open("countries.json", encoding="utf8") as file:
    COUNTRIES = json.loads(file.read())

nlp = spacy.blank("en")
doc = nlp("Czech Republic may help Slovakia protect its airspace")

matcher = spacy.matcher.PhraseMatcher(nlp.vocab)

patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

[Czech Republic, Slovakia]


In [5]:
#15
with open("countries.json", encoding="utf8") as file:
    COUNTRIES = json.loads(file.read())

with open("country_text.txt", encoding="utf8") as file:
    TEXT = file.read()

nlp = spacy.blank("en")
matcher = spacy.matcher.PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

doc = nlp(TEXT)
doc.ents = []

for match_id, start, end in matcher(doc):
    span = spacy.tokens.Span(doc, start, end, label="GPE")
    doc.ents = list(doc.ents) + [span]
    span_root_head = span.root.head
    print(span_root_head.text, "-->", span.text)

print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == "GPE"])

Namibia --> Namibia
South --> South Africa
Cambodia --> Cambodia
Kuwait --> Kuwait
Somalia --> Somalia
Haiti --> Haiti
Mozambique --> Mozambique
Somalia --> Somalia
Rwanda --> Rwanda
Singapore --> Singapore
Sierra --> Sierra Leone
Afghanistan --> Afghanistan
Iraq --> Iraq
Sudan --> Sudan
Congo --> Congo
Haiti --> Haiti
[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('Cambodia', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Somalia', 'GPE'), ('Rwanda', 'GPE'), ('Singapore', 'GPE'), ('Sierra Leone', 'GPE'), ('Afghanistan', 'GPE'), ('Iraq', 'GPE'), ('Sudan', 'GPE'), ('Congo', 'GPE'), ('Haiti', 'GPE')]


In [None]:
#Rozdział 3 
#3
nlp = spacy.load("en_core_web_sm")
display(nlp.pipeline)
nlp.pipe_names

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f847f2eab70>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f8480a7c890>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f847d264040>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f847e3d2fd0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f847d282210>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f847f30b990>)]

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [22]:
#6
@spacy.language.Language.component("Length component")
def length_component_function(doc):
    doc_length = len(doc)
    print(f'This document is {doc_length} tokens long.')
    return doc

nlp = spacy.load("en_core_web_sm")

nlp.add_pipe("Length component", name="Length component", first=True)
print(nlp.pipe_names)

doc = nlp("Some random text")

['Length component', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
This document is 3 tokens long.


In [23]:
#7
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = spacy.matcher.PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", animal_patterns)

# Define the custom component
@spacy.language.Language.component("animal_component")
def animal_component_function(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [spacy.tokens.Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe("animal_component", name="animal_component", after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'animal_component']
[('cat', 6303828839600189595), ('Golden Retriever', 6303828839600189595)]


In [17]:
#9
nlp = spacy.blank("en")
# Register the Token extension attribute "is_country" with the default value False
spacy.tokens.Token.set_extension("is_country", default=False, force=True)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])


#Part 2
nlp = spacy.blank("en")

# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]


# Register the Token property extension "reversed" with the getter get_reversed
spacy.tokens.Token.set_extension("reversed", getter=get_reversed, force=True)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]
reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [18]:
#10
nlp = spacy.blank("en")

if spacy.tokens.Doc.has_extension("has_number"):
    spacy.tokens.Doc.remove_extension("has_number")
# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension "has_number" with the getter get_has_number
spacy.tokens.Doc.set_extension("has_number", getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)


#Part 2
nlp = spacy.blank("en")

# Define the method
def to_html(span, tag):
    # Wrap the span text in a HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register the Span method extension "to_html" with the method to_html
spacy.tokens.Span.set_extension("to_html", method=to_html, force=True)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html("strong"))

has_number: True
<strong>Hello world</strong>


In [None]:
#11
nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using the getter get_wikipedia_url
spacy.tokens.Span.set_extension("wikipedia_url", getter=get_wikipedia_url, force=True)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [30]:
#12
with open("countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = spacy.blank("en")
matcher = spacy.matcher.PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", list(nlp.pipe(COUNTRIES)))


@spacy.language.Language.component("countries_component")
def countries_component_function(doc):
    # Create an entity Span with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [spacy.tokens.Span(doc, start, end, label="GPE") for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe("countries_component", name="countries_component")
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with the getter get_capital
spacy.tokens.Span.set_extension("capital", getter=get_capital)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

['countries_component']
[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]


In [47]:
#14
nlp = spacy.load("en_core_web_sm")

with open("tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
doc = nlp.pipe(TEXTS)
for text in doc:
    print([token.text for token in text if token.pos_ == "ADJ"])

#Part 2
nlp = spacy.load("en_core_web_sm")

with open("tweets.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the entities
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)

#Part 3
nlp = spacy.blank("en")

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
['SANDWICH']
['terrible', 'gettin']
(McDonalds,) () (McDonalds,) (McDonalds, Spain) (The Arch Deluxe :P,) () (This morning,)


In [52]:
#15
with open("bookquotes.json", encoding="utf8") as f:
    DATA = json.loads(f.read())

nlp = spacy.blank("en")

# Register the Doc extension "author" (default None)
spacy.tokens.Doc.set_extension("author", default=None, force=True)

# Register the Doc extension "book" (default None)
spacy.tokens.Doc.set_extension("book", default=None, force=True)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Set the doc._.book and doc._.author attributes from the context
    doc._.book = context['book']
    doc._.author = context['author']

    # Print the text and custom attribute data
    print(f"{doc.text}\n — '{doc._.book}' by {doc._.author}\n")

One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.
 — 'Metamorphosis' by Franz Kafka

I know not all that may be coming, but be it what it will, I'll go to it laughing.
 — 'Moby-Dick or, The Whale' by Herman Melville

It was the best of times, it was the worst of times.
 — 'A Tale of Two Cities' by Charles Dickens

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars.
 — 'On the Road' by Jack Kerouac

It was a bright cold day in April, and the clocks were striking thirteen.
 — '1984' by George Orwell

Nowadays people know the price of everything and the value of nothing.
 — 'The Picture Of Dorian Gray' by Oscar Wilde



In [59]:
#16
nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

# Only tokenize the text
doc = nlp.make_doc(text)
print([token.text for token in doc])

#Part 2
nlp = spacy.load("en_core_web_sm")

# Disable the tagger and lemmatizer
with nlp.select_pipes(disable=["tagger", "lemmatizer"]):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents)

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']
(American, College Park, Georgia)
