<a href="https://colab.research.google.com/github/XinyueChen-Flora/NLP-Learn/blob/main/spaCy_Learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# import spacy and create a blanc english nlp object
import spacy
nlp = spacy.blank("en")

In [25]:
# Created by processing a string of text with the nlp object
doc = nlp("I like tree kangaroos and narwhals.")

In [16]:
# Iterate over tokens in a Doc
for token in doc:
  print(token.text)

I
like
kangaroos
and
narwhals
.


In [18]:
# Index into the Doc to get a single Token
first_token = doc[0]
# Get the token via the .text attribute
print(first_token.text)

I


In [27]:
# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

tree kangaroos


In [21]:
# i is the index of the token within the parent document.
# text returns the token text.
# is_alpha, is_punct and like_num return boolean values indicating whether the token consists of alphabetic characters, whether it's punctuation or whether it resembles a number. 
# For example, a token "10" – one, zero – or the word "ten" – T, E, N.
# These attributes are also called lexical attributes: they refer to the entry in the vocabulary and don't depend on the token's context.
print("Index:   ", [token.i for token in doc])
print("Text:    ", [token.text for token in doc])
print("is_alpha:", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num:", [token.like_num for token in doc])

Index:    [0, 1, 2, 3, 4, 5]
Text:     ['I', 'like', 'kangaroos', 'and', 'narwhals', '.']
is_alpha: [True, True, True, True, True, False]
is_punct: [False, False, False, False, False, True]
like_num: [False, False, False, False, False, False]


In [29]:
# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals)

tree kangaroos and narwhals


# Pipeline

Predicting Part-of-speech tags

In [36]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza")
for token in doc:
  # For each token in the doc, we can print the text and the .pos_ attribute, the predicted part-of-speech tag.
  print(token.text, token.pos_)

She PRON
ate VERB
the DET
pizza NOUN


Predicting Syntactic Dependencies

In [38]:
for token in doc:
  print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


Predicting Named Entities

In [39]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
  print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [40]:
spacy.explain("GPE")
spacy.explain("det")

'determiner'

# Rule-based matching

Using the Matcher

In [48]:
from spacy.matcher import Matcher
# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)
# Add the pattern to the matcher
# The matcher.add method lets you add a pattern. 
# The first argument is a unique ID to identify which pattern was matched. 
# The second argument is a list of patterns.
pattern = [{"TEXT":"iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])
# Process some text
doc = nlp("Upcoming iPhone X release data leaked")
# Call the matcher on the doc
matches = matcher(doc)
# When you call the matcher on a doc, it returns a list of tuples.
# Each tuple consists of three values: the match ID, the start index and the end index of the matched span.
print(matches)
# we can iterate over the matches and create a Span object: a slice of the doc at the start and end index.
# match_id: hash value of the pattern name
# start: start index of matched span
# end: end index of matched span
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

[(9528407286733565721, 1, 3)]
iPhone X


Matching lexical attributes


In [51]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]
doc = nlp("2018 FIFA World Cup: France won!")
matcher.add("FIFA_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)


2018 FIFA World Cup:


Matching other token attributes

In [53]:
pattern = [
           {"LEMMA": "love", "POS": "VERB"},
           {"POS": "NOUN"}
]
doc = nlp("I loved dogs but now I love cats more")
matcher.add("LOVE_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

loved dogs
love cats


Using operators and quantifiers

In [54]:
pattern = [
           {"LEMMA": "buy"},
           {"POS": "DET","OP":"?"}, # optional: match 0 or 1 times
           {"POS": "NOUN"}
]
doc = nlp("I bought a smartphone. Now I'm buying apps.")
matcher.add("buy_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

bought a smartphone
buying apps
