Learning resource: https://course.spacy.io/en/

In [1]:
import spacy

In [2]:
nlp = spacy.blank("en")  # create a blank English nlp object

In [3]:
nlp

<spacy.lang.en.English at 0x212bddd4790>

In [4]:
doc=nlp("Hello world!")  # process a string with the nlp object

In [5]:
doc

Hello world!

In [6]:
# Iterate over the tokens in the Doc, showing each token's text and the type of that text
for token in doc:
    token_text = token.text
    print(token_text)
    print(type(token_text))

Hello
<class 'str'>
world
<class 'str'>
!
<class 'str'>


In [7]:
# Iterating over a Doc yields its tokens; print each one together with its type
for token in doc:
    print(token)
    print(type(token))

Hello
<class 'spacy.tokens.token.Token'>
world
<class 'spacy.tokens.token.Token'>
!
<class 'spacy.tokens.token.Token'>


In [8]:
doc[1]

world

In [9]:
doc[1].text

'world'

In [10]:
doc[1:3]

world!

In [11]:
type(doc[1:3])

spacy.tokens.span.Span

In [12]:
doc[1:3].text

'world!'

In [13]:
type(doc[1:3].text)

str

In [14]:
doc=nlp("the distance is 5m from the hospital.")

In [15]:
# (label, token attribute name) pairs, printed in this order
attribute_rows = (
    ("Index:   ", "i"),
    ("Text:    ", "text"),
    ("is_alpha:", "is_alpha"),
    ("is_punct:", "is_punct"),
    ("like_num:", "like_num"),
)
for label, attr_name in attribute_rows:
    # One list per attribute, holding one entry for every token in the doc
    print(label, [getattr(token, attr_name) for token in doc])

Index:    [0, 1, 2, 3, 4, 5, 6, 7, 8]
Text:     ['the', 'distance', 'is', '5', 'm', 'from', 'the', 'hospital', '.']
is_alpha: [True, True, True, False, True, True, True, True, False]
is_punct: [False, False, False, False, False, False, False, False, True]
like_num: [False, False, False, True, False, False, False, False, False]


In [16]:
import spacy

# Blank German pipeline
nlp = spacy.blank("de")

# "Liebe Grüße!" is German for "Kind regards!"
doc = nlp("Liebe Grüße!")

# Show the text of the processed document
print(doc.text)

Liebe Grüße!


In [17]:
import spacy

# Blank Spanish pipeline
nlp = spacy.blank("es")

# "¿Cómo estás?" is Spanish for "How are you?"
doc = nlp("¿Cómo estás?")

# Show the text of the processed document
print(doc.text)

¿Cómo estás?


In [18]:
# Import spaCy and create the English nlp object
import spacy

nlp = spacy.blank("en")

# Run the pipeline on the sentence
doc = nlp("I like tree kangaroos and narwhals.")

# Tokens 2 and 3 form the span "tree kangaroos"
tree_kangaroos = doc[2:4]
# From token 2 up to, but excluding, the final token (the ".")
tree_kangaroos_and_narwhals = doc[2:-1]

# Print both spans, one per line
for span in (tree_kangaroos, tree_kangaroos_and_narwhals):
    print(span.text)


tree kangaroos
tree kangaroos and narwhals


In [19]:
import spacy

nlp = spacy.blank("en")

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
# Iterate over the tokens in the doc and report numbers that are followed by "%"
for token in doc:
    # Only tokens that resemble a number are of interest
    if not token.like_num:
        continue
    # A number at the very end of the doc has no following token;
    # indexing doc[token.i + 1] there would raise IndexError
    if token.i + 1 >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[token.i + 1]
    print(next_token)
    # Check if the next token's text equals "%"
    if next_token.text == "%":
        print("Percentage found:", token.text)

,
%
Percentage found: 60
%
Percentage found: 4


spaCy provides a number of trained pipeline packages you can download using the spacy download command. For example, the "en_core_web_sm" package is a small English pipeline that supports all core capabilities and is trained on web text.

The spacy.load method loads a pipeline package by name and returns an nlp object.

The package provides the binary weights that enable spaCy to make predictions.

It also includes the vocabulary, meta information about the pipeline and the configuration file used to train it. It tells spaCy which language class to use and how to configure the processing pipeline.

In [20]:
nlp = spacy.load('en_core_web_sm')



In [21]:
nlp

<spacy.lang.en.English at 0x212c0060670>

In [22]:
doc=nlp("she ate pizza")

In [23]:
# Show each token's text next to its predicted part-of-speech tag
for token in doc:
    print(token.text, token.pos_)

she PRON
ate VERB
pizza NOUN


In [24]:
doc=nlp("she ate pizza!")

In [25]:
# Text and part-of-speech tag for every token, punctuation included
for token in doc:
    print(token.text, token.pos_)

she PRON
ate VERB
pizza NOUN
! PUNCT


In [26]:
doc=nlp("sharan nair ate pizza!")

In [27]:
# Text and part-of-speech tag for every token in the doc
for token in doc:
    print(token.text, token.pos_)

sharan PROPN
nair PROPN
ate VERB
pizza NOUN
! PUNCT


Predicting Syntactic Dependencies

In [28]:
doc=nlp("she ate the pizza")

In addition to the part-of-speech tags, we can also predict how the words are related. For example, whether a word is the subject of the sentence or an object.

The .dep_ attribute returns the predicted dependency label.

The .head attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to.

In [29]:
for token in doc:
    # text, part-of-speech tag, dependency label, text of the syntactic head
    columns = (token.text, token.pos_, token.dep_, token.head.text)
    print(*columns)

she PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


Predicting Named Entities

Named entities are "real world objects" that are assigned a name – for example, a person, an organization or a country.

In [30]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [31]:
doc.ents

(Apple, U.K., $1 billion)

In [32]:
# Print every predicted entity together with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In this case, the model correctly predicts "Apple" as an organization, "U.K." as a geopolitical entity, and "$1 billion" as money.

In [33]:
spacy.explain('GPE')

'Countries, cities, states'

In [34]:
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

Trained pipelines allow you to generalize based on a set of training examples. Once they’re trained, they use binary weights to make predictions. That’s why it’s not necessary to ship them with their training data.

In [35]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Left-align the token text, part-of-speech tag and dependency label
    # in fixed-width columns (formatting only)
    print(f"{token.text:<12}{token.pos_:<10}{token.dep_:<10}")

It          PRON      nsubj     
’s          VERB      compound  
official    NOUN      ROOT      
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [36]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Iterate over the predicted entities
# Walk the predicted entities, printing each entity's text followed by its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [37]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# List the entities the pipeline predicted, with their labels
for entity in doc.ents:
    print(entity.text, entity.label_)
# Note: "iPhone X" is not among the predicted entities

# Tokens 1 and 2 make up "iPhone X"; slice them out as a span by hand
iphone_x = doc[1:3]

# Report the span that was missed
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


#### *Rule-based matching*

In [38]:
import spacy
#import re

# Import the Matcher
from spacy.matcher import Matcher

# Trained English pipeline; its vocab is shared with the matcher below
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Two consecutive tokens whose exact text is "iPhone" and then "X"
pattern = [
    {"TEXT": "iPhone"},
    {"TEXT": "X"},
]
matcher.add("IPHONE_PATTERN", [pattern])

# Run the pipeline on the text, then run the matcher over the resulting doc
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

In [39]:
# Each match is a (match_id, start, end) triple; start/end slice the doc into the matched span
for _match_id, start, end in matches:
    print(doc[start:end].text)

iPhone X


In [40]:
matches

[(9528407286733565721, 1, 3)]

In [41]:
# A digit token, then "fifa", "world", "cup" (compared in lowercase), then a punctuation token
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True},
]

In [42]:
doc = nlp("2018 FIFA World Cup: France won!")
matcher.add("fifa_pattern", [pattern])

In [43]:
matches = matcher(doc)

In [44]:
matches

[(16558152967840530489, 0, 5)]

In [45]:
# Print the text of every matched span
for _match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

2018 FIFA World Cup:


In [46]:
# A verb whose lemma is "love", immediately followed by a noun
pattern = [{"LEMMA": "love", "POS": "VERB"}, {"POS": "NOUN"}]
# NOTE(review): this reuses the existing matcher, so any patterns added to it
# earlier are still active alongside this one
matcher.add("dogs_cats_PATTERN", [pattern])

doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)

In [47]:
# Slice the doc with each match's start/end and print the resulting span text
for _match_id, start, end in matches:
    print(doc[start:end].text)

loved dogs
love cats


In [48]:
matches

[(13772491510779729277, 1, 3), (13772491510779729277, 6, 8)]

In [50]:
# A token with lemma "buy", an optional determiner, then a noun
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # "?" makes this token optional: match 0 or 1 times
    {"POS": "NOUN"},
]
# NOTE(review): this reuses the existing matcher, so any patterns added to it
# earlier are still active alongside this one
matcher.add("optional match smartphone", [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)

# Print the text of every matched span
for _match_id, start, end in matches:
    print(doc[start:end].text)

bought a smartphone
buying apps


In [54]:
import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Matcher built on the pipeline's shared vocabulary
matcher = Matcher(nlp.vocab)

# Two tokens in a row: exact text "iPhone" followed by exact text "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("apple_iphone", [pattern])

# Apply the matcher and collect the text of each matched span
matches = matcher(doc)
matched_texts = []
for _match_id, start, end in matches:
    matched_texts.append(doc[start:end].text)
print("Matches:", matched_texts)

Matches: ['iPhone X']


In [55]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Full iOS versions ("iOS 7", "iOS 11", "iOS 10"): the exact text "iOS"
# followed by a token made of digits
pattern = [
    {"TEXT": "iOS"},
    {"IS_DIGIT": True},
]
matcher.add("IOS_VERSION_PATTERN", [pattern])

# Apply the matcher to the doc and report how many matches it produced
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for _match_id, start, end in matches:
    version_span = doc[start:end]
    print("Match found:", version_span.text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [56]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Any form of "download" (matched via its lemma) followed by a proper noun
pattern = [
    {"LEMMA": "download"},
    {"POS": "PROPN"},
]
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])

# Apply the matcher to the doc and report how many matches it produced
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for _match_id, start, end in matches:
    download_span = doc[start:end]
    print("Match found:", download_span.text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [57]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# An adjective followed by one or two nouns; the second noun is optional
# ("?"), so a phrase with two nouns is reported both with and without it
pattern = [
    {"POS": "ADJ"},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"},
]
matcher.add("ADJ_NOUN_PATTERN", [pattern])

# Apply the matcher to the doc and report how many matches it produced
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for _match_id, start, end in matches:
    phrase = doc[start:end]
    print("Match found:", phrase.text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses
