<a href="https://colab.research.google.com/github/XinyueChen-Flora/NLP-Learn/blob/main/spaCy_Learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# import spacy and create a blanc english nlp object
import spacy
nlp = spacy.blank("en")

In [25]:
# Created by processing a string of text with the nlp object
doc = nlp("I like tree kangaroos and narwhals.")

In [16]:
# Iterate over tokens in a Doc
for token in doc:
  print(token.text)

I
like
kangaroos
and
narwhals
.


In [18]:
# Index into the Doc to get a single Token
first_token = doc[0]
# Get the token via the .text attribute
print(first_token.text)

I


In [27]:
# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

tree kangaroos


In [21]:
# i is the index of the token within the parent document.
# text returns the token text.
# is_alpha, is_punct and like_num return boolean values indicating whether the token consists of alphabetic characters, whether it's punctuation or whether it resembles a number. 
# For example, a token "10" – one, zero – or the word "ten" – T, E, N.
# These attributes are also called lexical attributes: they refer to the entry in the vocabulary and don't depend on the token's context.
print("Index:   ", [token.i for token in doc])
print("Text:    ", [token.text for token in doc])
print("is_alpha:", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num:", [token.like_num for token in doc])

Index:    [0, 1, 2, 3, 4, 5]
Text:     ['I', 'like', 'kangaroos', 'and', 'narwhals', '.']
is_alpha: [True, True, True, True, True, False]
is_punct: [False, False, False, False, False, True]
like_num: [False, False, False, False, False, False]


In [29]:
# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals)

tree kangaroos and narwhals


# Pipeline

Predicting Part-of-speech tags

In [36]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("She ate the pizza")
for token in doc:
  # For each token in the doc, we can print the text and the .pos_ attribute, the predicted part-of-speech tag.
  print(token.text, token.pos_)

She PRON
ate VERB
the DET
pizza NOUN


Predicting Syntactic Dependencies

In [38]:
for token in doc:
  print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


Predicting Named Entities

In [39]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
  print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [40]:
spacy.explain("GPE")
spacy.explain("det")

'determiner'

# Rule-based matching

Using the Matcher

In [48]:
from spacy.matcher import Matcher
# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)
# Add the pattern to the matcher
# The matcher.add method lets you add a pattern. 
# The first argument is a unique ID to identify which pattern was matched. 
# The second argument is a list of patterns.
pattern = [{"TEXT":"iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])
# Process some text
doc = nlp("Upcoming iPhone X release data leaked")
# Call the matcher on the doc
matches = matcher(doc)
# When you call the matcher on a doc, it returns a list of tuples.
# Each tuple consists of three values: the match ID, the start index and the end index of the matched span.
print(matches)
# we can iterate over the matches and create a Span object: a slice of the doc at the start and end index.
# match_id: hash value of the pattern name
# start: start index of matched span
# end: end index of matched span
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

[(9528407286733565721, 1, 3)]
iPhone X


Matching lexical attributes


In [51]:
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]
doc = nlp("2018 FIFA World Cup: France won!")
matcher.add("FIFA_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)


2018 FIFA World Cup:


Matching other token attributes

In [53]:
pattern = [
           {"LEMMA": "love", "POS": "VERB"},
           {"POS": "NOUN"}
]
doc = nlp("I loved dogs but now I love cats more")
matcher.add("LOVE_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

loved dogs
love cats


Using operators and quantifiers

In [54]:
pattern = [
           {"LEMMA": "buy"},
           {"POS": "DET","OP":"?"}, # optional: match 0 or 1 times
           {"POS": "NOUN"}
]
doc = nlp("I bought a smartphone. Now I'm buying apps.")
matcher.add("buy_PATTERN", [pattern])
matches = matcher(doc)
for match_id, start, end in matches:
  matched_span = doc[start:end]
  print(matched_span.text)

bought a smartphone
buying apps


# Large-scale data analysis with spaCy

Data Structures

In [58]:
# Strings to hashes
doc = nlp("I have a cat")
# Look up the hash for the work "cat"
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

5439657043933447811
cat


In [62]:
# Doc, Span and Token
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc)

# Create a span manually
span = Span(doc, 0, 2)

# Create a span with a label
span_with_label = Span(doc, 0, 2, label="GREETING")

# Add span to the doc.ents
doc.ents = [span_with_label]

Hello world!
(Hello world,)


Word vectors and semantic similarity

In [76]:
# solve the problem of cannot find the model
import spacy.cli
spacy.cli.download("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [77]:

import en_core_web_md
nlp = en_core_web_md.load()

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

0.8627204117787385


In [78]:
# compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]

print(doc.similarity(token))

0.32531983166759537


In [79]:
# compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")
print(span.similarity(doc))

0.6199092090831612


In [80]:
# word vectors in spaCy
doc = nlp("I have a bananna")
# Access the vector via the token.vector attribute
print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [81]:
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")
print(doc1.similarity(doc2))

0.9501447503553421


Combining predictions and rules

In [83]:
# adding statistical predictions
matcher = Matcher(nlp.vocab)
matcher.add("DOG", [[{"LOWER": "golden"},{"LOWER": "retriever"}]])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
  span = doc[start:end]
  print("Matched span:", span.text)
  print("Root token:", span.root.text)
  print("Root head token:", span.root.head.text)
  print("Previous token:", doc[start - 1].text, doc[start-1].pos_)

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET


In [84]:
# efficient phrase matching
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever")
matcher.add("DOG", [pattern])
doc = nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
  span = doc[start:end]
  print("Matched span:", span.text)

Matched span: Golden Retriever


# Processing Pipelines