# Official Training Course from spaCy - Notes

- [Advanced NLP with spaCy](https://course.spacy.io/en)

# Chapter 1 - Finding words, phrases, names and concepts

In [1]:
# `language` & `doc`
# `language` is the class to create an object usually named "nlp"
# When a `nlp` object was called on a string, spaCy tokenize the text and create a `doc` object
# A `doc` object contains a series of `token` objects, which can be indexed.

 
from spacy.lang.en import English
from spacy.lang.zh import Chinese

# create doc object and serialize the string
nlp_en = English()
doc_en = nlp_en("This is a sentence with ten words & 2 punctuation!")
print(doc_en.text)

print()

# create doc object and serialize the string
nlp_zh = Chinese()
doc_zh = nlp_zh("这是一句十四个字&2个标点的话！")
print(doc_zh.text)

This is a sentence with ten words & 2 punctuation!

这是一句十四个字&2个标点的话！


In [2]:
# A `token` object contains a word, punctuations, etc.
# A `span` is a collection of `token` objects and can be indexed as well.

# return token by the index
first_token_en = doc_en[0]
print(first_token_en.text)

# return span using index
sec_thrd_token_en = doc_en[1:3]
print(sec_thrd_token_en.text)

print()

# return token by the index
first_token_zh = doc_zh[0]
print(first_token_zh.text)

# return span using index
sec_thrd_token_zh = doc_zh[1:3]
print(sec_thrd_token_zh.text)

This
is a

这
是一


In [3]:
# return and use the index of tokens (to find the next token)

some_token_en = doc_en[9]
print(some_token_en.text)
next_token_en = doc_en[some_token_en.i + 1]
print(next_token_en.text)

print()

some_token_zh = doc_zh[9]
print(some_token_zh.text)
next_token_zh = doc_zh[some_token_zh.i + 1]
print(next_token_zh.text)

punctuation
!

2
个


In [4]:
# return index, and lexical attributes of a token
word_token_en = doc_en[4]
print(word_token_en.text, word_token_en.is_alpha)
punc_token_en = doc_en[7]
print(punc_token_en.text, punc_token_en.is_punct)
num_token_en = doc_en[8]
print(num_token_en.text, num_token_en.like_num)
tnum_token_en = doc_en[5]
print(tnum_token_en.text, tnum_token_en.like_num)

print()

# return index, and lexical attributes of a token
word_token_zh = doc_zh[3]
print(word_token_zh.text, word_token_zh.is_alpha)
punc_token_zh = doc_zh[8]
print(punc_token_zh.text, punc_token_zh.is_punct)
num_token_zh = doc_zh[9]
print(num_token_zh.text, num_token_zh.like_num)
tnum_token_zh = doc_zh[4]
print(tnum_token_zh.text, tnum_token_zh.like_num)

with True
& True
2 True
ten True

句 True
& True
2 True
十 True


In [5]:
# load pre-trained statistical models
import spacy

nlp_en = spacy.load("en_core_web_sm")

nlp_zh = spacy.load('zh_core_web_sm')

In [6]:
# print part of speech using pre-trained model
doc_en = nlp_en("Apple is looking to hire a lawyer from Ireland for 1 million dollars.")
for token in doc_en:
    print(token.text, token.pos_)

print()

doc_zh = nlp_zh("四川大学的董佳明教授在峨眉山创办了青年科幻文学论坛.")
for token in doc_zh:
    print(token.text, token.pos_)

Apple PROPN
is AUX
looking VERB
to PART
hire VERB
a DET
lawyer NOUN
from ADP
Ireland PROPN
for ADP
1 NUM
million NUM
dollars NOUN
. PUNCT

四川 PROPN
大学 NOUN
的 PART
董佳明 PROPN
教授 NOUN
在 ADP
峨眉山 PROPN
创办 VERB
了 PART
青年 NOUN
科幻 ADJ
文学 NOUN
论坛 NOUN
. PUNCT


In [7]:
# syntactic dependencies
for token in doc_en:
    print(token.text, token.pos_, token.dep_, token.head.text)

print()

for token in doc_zh:
    print(token.text, token.pos_, token.dep_, token.head.text)

Apple PROPN nsubj looking
is AUX aux looking
looking VERB ROOT looking
to PART aux hire
hire VERB xcomp looking
a DET det lawyer
lawyer NOUN dobj hire
from ADP prep lawyer
Ireland PROPN pobj from
for ADP prep hire
1 NUM compound million
million NUM nummod dollars
dollars NOUN pobj for
. PUNCT punct looking

四川 PROPN compound:nn 大学
大学 NOUN nmod:assmod 教授
的 PART case 大学
董佳明 PROPN compound:nn 教授
教授 NOUN nsubj 创办
在 ADP case 峨眉山
峨眉山 PROPN nmod:prep 创办
创办 VERB ROOT 创办
了 PART aux:asp 创办
青年 NOUN compound:nn 论坛
科幻 ADJ amod 论坛
文学 NOUN compound:nn 论坛
论坛 NOUN dobj 创办
. PUNCT punct 创办


In [8]:
# named entities

for ent in doc_en.ents:
    print(ent.text, ent.label_)

print()

for ent in doc_zh.ents:
    print(ent.text, ent.label_)

Apple ORG
Ireland GPE
1 million dollars MONEY

四川大学 ORG
董佳明 PERSON


In [9]:
# Explain terminologies spacy.explain()
print(spacy.explain('det'))
print(spacy.explain('PART'))

determiner
particle


In [10]:
# pattern matching
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

pattern = [ [{'TEXT': 'iOS'}, {'IS_DIGIT': True}], [{'LEMMA': 'download'}, {"POS":'PROPN'}] ] 

matcher.add('IOS_DOWNLOAD_PATTERN', pattern)

matches = matcher(doc)
print("Total matches found:", len(matches))

print(matches)

for match_id, start, end in matches:
    print("Match found:", match_id, doc[start:end].text)

Total matches found: 6
[(13073357917303605362, 24, 26), (13073357917303605362, 29, 31), (13073357917303605362, 38, 40), (13073357917303605362, 53, 55), (13073357917303605362, 73, 75), (13073357917303605362, 104, 106)]
Match found: 13073357917303605362 iOS 7
Match found: 13073357917303605362 iOS 11
Match found: 13073357917303605362 iOS 10
Match found: 13073357917303605362 downloaded Fortnite
Match found: 13073357917303605362 downloading Minecraft
Match found: 13073357917303605362 download Winzip


# Chapter 2 - Large-scale data analysis with spaCy

## vocab & string store

### Data Structure

A `doc` object contains `token` objects.

A `vocab` object (nlp.vocab) contains `lexeme` objects.

A `StringStore` object (`nlp.vocab.strings`) contains a lookup table between hashes (`lexeme`s) and corresponding strings.

In [11]:
# string to hashes
from spacy.lang.en import English

nlp = English()
doc = nlp("I have a cat")

cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)

cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

5439657043933447811
cat


### Attributes of lexeme
- `lexeme` objects contains **context-independent** information about a word like `orth`(hash), `text`, `is_alpha`
- `lexeme` refer to the exact word, not the root of the word

In [12]:
from spacy.lang.en import English

nlp = English()
doc = nlp("I have has had a cat")

for word in ['have', 'has', 'had']:
    word_hash = nlp.vocab.strings[word]
    print(word_hash)
    print(nlp.vocab.strings[word_hash])

print()

lexeme = nlp.vocab['had']
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

14692702688101715474
have
1248239241591158246
has
12960022596163002503
had

had 12960022596163002503 True


### What if there was no doc created?

- Hashes can always be returned, but cannot be reversed to strings if no actual text was processed.

In [13]:
# hash can always be returned, but cannot be reversed to a string if no actual text was processed
nlp = English()

cat_hash = nlp.vocab.strings['cat']
print(cat_hash)

zh_hash = nlp.vocab.strings['好']
print(zh_hash)

print(nlp.vocab.strings[cat_hash])

5439657043933447811
4630374899898420537


KeyError: "[E018] Can't retrieve string for hash '5439657043933447811'. This usually refers to an issue with the `Vocab` or `StringStore`."

## doc, token & span

### `doc` object constructor

- 3 arguemnts: `nlp.vocab`, words, spaces

### `span` object constuctor

- 3 arguments: `doc`, start_index, end_index+1
- optional arguments: label (to name a span)

In [14]:
from spacy.lang.en import English
from spacy.tokens import Doc, Span

nlp = English()

# words and spaces to create a doc from
words = ["Hello", "world", "!"]
spaces = [True, False, False] # whether the word is followed by a space

# create a doc manually
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# create a span manually
span = Span(doc, 0, 2, label="GREETING")
print(span.text, span.label_)

# Add span to the doc.ents
doc.ents = [span]

for ent in doc.ents:
    print(ent.text)

Hello world!
Hello world GREETING
Hello world


### Best practices
- convert the data to strings as late as possible to fully take advantage of `doc` and `span` data structure
- use toekn attributes if possible, for example `token.i` for token index

In [15]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("Berlin looks like a nice city!")

for token in doc:
    if token.pos_ == "PROPN":
        if doc[token.i+1].pos_ == "VERB":
            print("Found proper noun before a verb:", token.text)

Found proper noun before a verb: Berlin


## Word Vectors & Similarities

- `doc`, `span`, and `token` all have `similarity` method to generate a similarity score between 0 and 1 comparing with another object
- The default similarity score is cosine between word vectors
- `doc` and `span` use the average of token vetors by default.
- The score was generated using an algorithm like word2vec and a lot of text.
- It requires `md` or `lg` model or above
- Short phrases are better than long documents with a lot of irrelevant words.


### Word vectors are generated by pre-trained models

In [16]:
import spacy

nlp = spacy.load('en_core_web_md')

doc = nlp("Two bananas in pyjamas")

print(doc[1].vector)
print()
print(len(doc[1].vector))

[-2.2009e-01 -3.0322e-02 -7.9859e-02 -4.6279e-01 -3.8600e-01  3.6962e-01
 -7.7178e-01 -1.1529e-01  3.3601e-02  5.6573e-01 -2.4001e-01  4.1833e-01
  1.5049e-01  3.5621e-01 -2.1508e-01 -4.2743e-01  8.1400e-02  3.3916e-01
  2.1637e-01  1.4792e-01  4.5811e-01  2.0966e-01 -3.5706e-01  2.3800e-01
  2.7971e-02 -8.4538e-01  4.1917e-01 -3.9181e-01  4.0434e-04 -1.0662e+00
  1.4591e-01  1.4643e-03  5.1277e-01  2.6072e-01  8.3785e-02  3.0340e-01
  1.8579e-01  5.9999e-02 -4.0270e-01  5.0888e-01 -1.1358e-01 -2.8854e-01
 -2.7068e-01  1.1017e-02 -2.2217e-01  6.9076e-01  3.6459e-02  3.0394e-01
  5.6989e-02  2.2733e-01 -9.9473e-02  1.5165e-01  1.3540e-01 -2.4965e-01
  9.8078e-01 -8.0492e-01  1.9326e-01  3.1128e-01  5.5390e-02 -4.2423e-01
 -1.4082e-02  1.2708e-01  1.8868e-01  5.9777e-02 -2.2215e-01 -8.3950e-01
  9.1987e-02  1.0180e-01 -3.1299e-01  5.5083e-01 -3.0717e-01  4.4201e-01
  1.2666e-01  3.7643e-01  3.2333e-01  9.5673e-02  2.5083e-01 -6.4049e-02
  4.2143e-01 -1.9375e-01  3.8026e-01  7.0883e-03 -2

In [17]:
import spacy

nlp = spacy.load('en_core_web_md')

doc1 = nlp("It's a hot summer day.")
doc2 = nlp("It's sunny outside.")

sim = doc1.similarity(doc2)
print(sim)

span1 = doc1[3:5]
token1 = doc2[2]

print(span1, token1)

sim = span1.similarity(token1)
print(sim)

0.8852706036899473
hot summer sunny
0.5955382


## Pattern Matching - Combining Models and Rules

- The difference between `Matcher` and `PhraseMatcher` is the input formats:
    - Inputs of `Matcher` is a list of token patterns
    - Inputs of `PhraseMatcher` is a `doc` object, e.g. `nlp("some text")`

In [18]:
import json
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

with open("data/countries.json", encoding='utf8') as f:
    COUNTRIES = json.load(f)
# print(COUNTRIES)

nlp = English()
doc = nlp("Czech Republic may help Slovakia protect its airspace")

# Initialize PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# shorter version of [nlp(country) for country in COUNTRIES]
pattern = list(nlp.pipe(COUNTRIES))

# Add pattern to matcher
matcher.add("COUNTRY", pattern)

# call the matcher on the doc and print the result
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start:end])

Czech Republic
Slovakia


In [19]:
import json
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("data/countries.json") as f:
    COUNTIRES = json.load(f)

with open("data/country_text.txt") as f:
    TEXT = f.read()

nlp = spacy.load('en_core_web_sm')

# create a matcher and add patterns
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTIRES))
matcher.add("COUNTRY", patterns)

# create a doc and reset existing entities
doc = nlp(TEXT)
doc.ents = []

for match_id, start, end in matcher(doc):
    # create a span for each match with label "GPE"
    span = Span(doc, start, end, label="GPE")

    doc.ents = list(doc.ents) + [span]

    print(span.root.head.text, "-->", span.text)

print()
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_=="GPE"])


in --> Namibia
in --> South Africa
Africa --> Cambodia
of --> Kuwait
as --> Somalia
Somalia --> Haiti
Haiti --> Mozambique
in --> Somalia
for --> Rwanda
Britain --> Singapore
War --> Sierra Leone
of --> Afghanistan
invaded --> Iraq
in --> Sudan
of --> Congo
earthquake --> Haiti

[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('Cambodia', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Somalia', 'GPE'), ('Rwanda', 'GPE'), ('Singapore', 'GPE'), ('Sierra Leone', 'GPE'), ('Afghanistan', 'GPE'), ('Iraq', 'GPE'), ('Sudan', 'GPE'), ('Congo', 'GPE'), ('Haiti', 'GPE')]


# Chapter 3 - Processing Pipelines

## `nlp` pipeline

- `text` --> `nlp` (`tokenizer`(always necessary and in the first place) -> `tagger` -> `parser` -> `ner` -> ...) --> `doc`

## Components
| Name | Description | Creates |
| :--- | :--- | :--- |
| tagger | Part-of-speech tagger | Token.tag, Token.pos |
| parser | Dependency parser | Token.dep, Token.head, Doc.sents, Doc.noun_chunks |
| ner | Named entity recognizer | Doc.ents, Token.ent_iob, Token.ent_type |
| textcat | Text classifier | Doc.cats |

In [20]:
import spacy

nlp = spacy.load("en_core_web_sm")

print(nlp.pipe_names)

print(nlp.pipeline)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fa30483c468>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fa30483c4c0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fa311880730>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fa3118808d0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fa30466fe08>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fa3046d4308>)]


## Custom Pipeline Components

- A custom component is a functions taking a `doc` as the input and returns a `doc`.
- A custom component can be added to a pipeline using `nlp.add_pipe` method

### Arguments for `nlp.add_pipe` method

|Argument |	Description |	Example |
| :--- | :--- | :--- |
|last |	If True, add last |	nlp.add_pipe(component, last=True) |
|first |	If True, add first |	nlp.add_pipe(component, first=True) |
|before |	Add before component |	nlp.add_pipe(component, before="ner") |
|after |	Add after component |	nlp.add_pipe(component, after="tagger") |

In [21]:
# # spacy 2 version

# import spacy
# def length_components(doc):
#     doc_length = len(doc)
#     print(f"This document is {doc_length} tokens long.")
#     return doc

# nlp = spacy.load('en_core_web_sm')
# nlp.add_pipe(length_components, first=True)
# print(nlp.pipe_names)

# doc = nlp("This is a sentence")


# spacy 3 version
import spacy
from spacy.language import Language

@Language.component('length')
def length_components(doc):
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('length', first=True)
print(nlp.pipe_names)

doc = nlp("This is a sentence")

['length', 'tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
This document is 4 tokens long.


In [22]:
# A more complex custom pipeline components

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language

nlp = spacy.load('en_core_web_sm')

animals = ['Golden Retriever', 'cat', 'turtle', 'Rattus norvegeicus']
animals_patterns = list(nlp.pipe(animals))
print("Animal patterns: ", animals_patterns)

matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", animals_patterns)

@Language.component("ANIMAL")
def animal_component(doc):
    matches = matcher(doc)
    spans = [Span(doc,start,end,label="ANIMAL") for match_id, start, end in matches]
    doc.ents = spans
    return doc

nlp.add_pipe('ANIMAL', after='ner')
print(nlp.pipe_names)

doc = nlp("I have a cat and a Golden Retriever.")
print([(ent.text, ent.label_) for ent in doc.ents])

Animal patterns:  [Golden Retriever, cat, turtle, Rattus norvegeicus]
['tok2vec', 'tagger', 'parser', 'ner', 'ANIMAL', 'attribute_ruler', 'lemmatizer']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


## Extension Attributes
- Add custom metadata to `doc`s, `token`s, and `span`s.
- Accessible via the `._` property.
- Registered on the global `Doc`, `Token`, `Span` using the `set_extension` method.

### Extension Attributes Types
- **Attribute Extension**: set a default value that can be overwritten.
- **Property Extension**: 
    - Define a getter and an optional setting function. 
    - The getter function was only called when the attribute value is retrieved.
    - `Span` extensions should almost always use a getter function. Because it is usually expensive or even impossible to manually set values to all `span`s.
- **Method Extension**: 
    - Assign a function that becomes available as an object method.
    - Let you pass arguments to the extension function.

In [23]:
# Attribute extension example
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Register an attribute extension
Token.set_extension("is_country", default=False)

doc = nlp("I live in Spain.")

# Update value of a certain token
doc[3]._.is_country = True

print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [24]:
# Property extension example
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# define a getter function
def get_reversed(token):
    return token.text[::-1]

# register the extension
Token.set_extension("reversed", getter=get_reversed)

doc = nlp("All generalizations are false, including this one.")

for token in doc:
    print(token._.reversed)

llA
snoitazilareneg
era
eslaf
,
gnidulcni
siht
eno
.


In [25]:
# Another property extension example on `Doc`
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# define a getter function
def get_has_number(doc):
    return any(token.like_num for token in doc)

# register the property extension
Doc.set_extension("has_number", getter=get_has_number)

doc = nlp("The museum closed for five years in 2012.")

print(doc._.has_number)

True


In [26]:
# Method Extension Example on Span
from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()

# define a getter function to_html
def to_html(span, tag):
    return f"<{tag}>{span.text}</{tag}>"

# register the extension
Span.set_extension("to_html", method=to_html)

doc = nlp("Hello world, this is a sentence.")

span = doc[0:2]
print(span._.to_html(tag="strong"))

<strong>Hello world</strong>


In [27]:
# Another example of property extension on spans
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

# define a getter function
def get_wikipedia_url(span):
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text

# Regisiter a extension
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
    )

for ent in doc.ents:
    print(ent.text, ent._.wikipedia_url)


over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [28]:
# Combining custom pipeline components with extension attributes
import json
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.language import Language

with open("data/countries.json", 'r') as f:
    COUNTRIES = json.load(f)

with open("data/capitals.json", 'r') as f:
    CAPITALS = json.load(f)

print(COUNTRIES[:6])
print()
print(list(CAPITALS.items())[:6])
print()

nlp = English()

# create a phrase matcher
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", list(nlp.pipe(COUNTRIES)))

# define component function
@Language.component("countries")
def countries_component(doc):
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label='GPE') for match_id, start, end in matches]
    return doc

# Add the component to the pipeline
nlp.add_pipe("countries")
print(nlp.pipe_names)
print()

# Getter to look up the span text in a dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension
Span.set_extension('capital', getter=get_capital)

doc = nlp("Czech Republic may help Slovakia protect its airspace")

for ent in doc.ents:
    print(ent.text, ent._.capital)

['Afghanistan', 'Åland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra']

[('Afghanistan', 'Kabul'), ('Åland Islands', 'Mariehamn'), ('Albania', 'Tirana'), ('Algeria', 'Algiers'), ('American Samoa', 'Pago Pago'), ('Andorra', 'Andorra la Vella')]

['countries']

Czech Republic Prague
Slovakia Bratislava


## Scaling & Performance

- use `nlp.pipe` to process a large number of texts more efficiently
    - Bad way: `[nlp(text) for text in text_list]`
    - Good way: `list(nlp.pipe(text_list))`

- passing in (text, context) tuple (use as_tuple=True argument)
    - yield (`doc`, `context`) tuples
    - Useful for associating metadata with the `doc`
    - values in `context`s can be assigned to extensions later

- Run only parts of the pipeline
     - use only the tokenizer: `doc = nlp.make_doc("some text")`
     - disable pipeline components within a with block: `with nlp.disable_pipes("tagger", "parser"):`

In [29]:
# Use nlp.pipe to process a stream of texts
import json
import spacy

with open('data/tweets.json', 'r') as f:
    TEXTS = json.load(f)

nlp = spacy.load('en_core_web_sm')

for doc in list(nlp.pipe(TEXTS)):
    print([token.text for token in doc if token.pos_ == "ADJ"])

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
['open']
['terrible', 'payin']


In [30]:
# processing data with context
import json
from spacy.lang.en import English
from spacy.tokens import Doc

with open('data/bookquotes.json','r') as f:
    DATA = json.load(f)

nlp = English()

# Register extensions
Doc.set_extension('author', default=None)
Doc.set_extension('book', default=None)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    doc._.author = context['author']
    doc._.book = context['book']

    print(f"{doc.text}\n - '{doc._.book}' by {doc._.author}\n")

One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.
 - 'Metamorphosis' by Franz Kafka

I know not all that may be coming, but be it what it will, I'll go to it laughing.
 - 'Moby-Dick or, The Whale' by Herman Melville

It was the best of times, it was the worst of times.
 - 'A Tale of Two Cities' by Charles Dickens

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars.
 - 'On the Road' by Jack Kerouac

It was a bright cold day in April, and the clocks were striking thirteen.
 - '1984' by George Orwell

Nowadays people know the price of everything and the value of nothing.
 - 'The Picture Of Dorian Gray' by Oscar Wilde



In [31]:
# Only tokenize the text using nlp.make_doc()
import spacy

nlp = spacy.load('en_core_web_sm')

text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

doc = nlp.make_doc(text)
print([token.text for token in doc])

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [32]:
# run parts of pipeline using nlp.disable_pipes()

import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)

with nlp.disable_pipes("tagger", "parser"):
    doc = nlp(text)
    print(doc.ents)



(American, College Park, Georgia)


# Chapter 4 - Training a neural network model

- re-train the pre-trained model on new data to improve accuracy of the particular problem

## Creating a dataset

- add new dataset (e.g., observations with new categories)
- `Matcher` or `PhraseMatcher` can be handy in creating patterns to match new labels
- If new categories were added, make sure to include some existing categories as well.

In [33]:
# Create a training dataset to update the model to recognize "iphone ?" as "GADGET"

import json
from spacy.lang.en import English
from spacy.matcher import Matcher

with open("data/iphone.json",'r') as f:
    TEXT = json.load(f)

nlp = English()

matcher = Matcher(nlp.vocab)
pattern = [[{'LOWER': 'iphone'}, {"LOWER": 'x'}], [{'LOWER': 'iphone'}, {"IS_DIGIT": True}]]
matcher.add("GADGET", pattern)

TRAINING_DATA = []

for doc in nlp.pipe(TEXT):
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    print('entity:', entities)
    training_example = (doc.text, {'entities': entities})
    print('training example:', training_example)
    TRAINING_DATA.append(training_example)
    print()

entity: [(20, 28, 'GADGET')]
training example: ('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]})

entity: [(0, 8, 'GADGET')]
training example: ('iPhone X is coming', {'entities': [(0, 8, 'GADGET')]})

entity: [(28, 36, 'GADGET')]
training example: ('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET')]})

entity: [(4, 12, 'GADGET')]
training example: ('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})

entity: [(0, 9, 'GADGET'), (13, 21, 'GADGET')]
training example: ("iPhone 11 vs iPhone 8: What's the difference?", {'entities': [(0, 9, 'GADGET'), (13, 21, 'GADGET')]})

entity: []
training example: ('I need a new phone! Any tips?', {'entities': []})



## Training Loop

1. **Loop** for a number of times.
2. **Shuffle** the training data.
3. **Divide** the data into mini-batches.
4. **Update** the model for each batch.
5. **Save** the updated model.

In [34]:
import json
import random
import spacy
from spacy.training import Example

with open('data/gadgets.json', 'r') as f:
    TRAINING_DATA = json.load(f)

nlp = spacy.blank('en')
ner = nlp.add_pipe('ner')
ner.add_label("GADGET")

nlp.begin_training()

for itn in range(10):
    random.shuffle(TRAINING_DATA)
    losses = {}

    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        # texts = [text for text, entities in batch]
        # annotations = [entities for text, entities in batch]

        examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in batch]
        nlp.update(examples, losses=losses)

    print(losses)

{'ner': 25.834728956222534}
{'ner': 18.855169266462326}
{'ner': 8.088818974792957}
{'ner': 4.0973786945978645}
{'ner': 11.280892569835714}
{'ner': 5.674000441357663}
{'ner': 3.2181017984692755}
{'ner': 1.3580093800089275}
{'ner': 0.8035100147845582}
{'ner': 0.6508244672133929}
