# 1. Introduction to spaCy

In [1]:
import spacy
from spacy.lang.en import English

## nlp object

In [2]:
nlp = English()
nlp

<spacy.lang.en.English at 0x7f7cecf49f28>

## doc object

In [3]:
doc = nlp('Hello world!')

for token in doc:
    print(token.text)

Hello
world
!


## token object

In [4]:
token = doc[1]
token

world

## span object

In [5]:
span = doc[1:4]
span

world!

## Lexical attributes

In [6]:
doc = nlp('It costs $5.')
print('Index: ', [token.i for token in doc])
print('Text: ', [token.text for token in doc])
print('is_alpha: ', [token.is_alpha for token in doc])
print('is_punct: ', [token.is_punct for token in doc])
print('like_num: ', [token.like_num for token in doc])

Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'costs', '$', '5', '.']
is_alpha:  [True, True, False, False, False]
is_punct:  [False, False, False, False, True]
like_num:  [False, False, False, True, False]


# 1.1. Statistical models

In [7]:
nlp = spacy.load('en_core_web_sm')
doc = nlp('She ate the pizza')

for token in doc:
    print(token.text, token.pos_)

She PRON
ate VERB
the DET
pizza NOUN


In [8]:
def syntactic_dep(doc):
    for token in doc:
        print(token.text, token.pos_, token.dep_, token.head.text)

In [9]:
syntactic_dep(doc)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [10]:
def named_entities(doc):
    for token in doc.ents:
        print(token.text, token.label_)

In [11]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
named_entities(doc)

Apple ORG
U.K. GPE
$1 billion MONEY


In [12]:
for token in doc.ents:
    print('Label: {0}\nDefinition: {1}\n'.format(token.label_, spacy.explain(token.label_)))

Label: ORG
Definition: Companies, agencies, institutions, etc.

Label: GPE
Definition: Countries, cities, states

Label: MONEY
Definition: Monetary values, including unit



# 1.2. Rule-based matching

In [13]:
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')

In [14]:
matcher = Matcher(nlp.vocab)
matcher

<spacy.matcher.matcher.Matcher at 0x7f7cecf375e8>

In [15]:
pattern = [{'ORTH':'iPhone'}, {'ORTH':'X'}]
pattern

[{'ORTH': 'iPhone'}, {'ORTH': 'X'}]

In [16]:
matcher.add('IPHONE_PATTERN', None, pattern)
matcher

<spacy.matcher.matcher.Matcher at 0x7f7cecf375e8>

In [17]:
doc = nlp('New iPhone X release date leaked')
matches = matcher(doc)
matches

[(9528407286733565721, 1, 3)]

In [18]:
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

iPhone X


In [19]:
def matching_pattern(doc, pattern):
    import spacy
    from spacy.matcher import Matcher
    
    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)
    matcher.add('TEMP_PATTERN', None, pattern)
    matches = matcher(doc)
    
    for match_id, start, end in matches:
        matched_span = doc[start:end]
        print(matched_span.text)

## Matching lexical attributes

In [20]:
pattern = [
    {'IS_DIGIT':True},
    {'LOWER':'fifa'},
    {'LOWER':'world'},
    {'LOWER':'cup'},
    {'IS_PUNCT':True},
]

In [21]:
doc = nlp('2018 FIFA World Cup: France won!')

In [22]:
matching_pattern(doc, pattern)

2018 FIFA World Cup:


In [23]:
pattern = [
    {'LEMMA':'love', 'POS':'VERB'},
    {'POS':'NOUN'},
]

doc = nlp("I loved dogs but now I love cats more.")

In [24]:
matching_pattern(doc, pattern)

loved dogs
love cats


## Operator & quantifiers

```python
{'OP': '!'} # negation: match 0 times
{'OP': '?'} # optional: 0 / 1 time
{'OP': '+'} # >= 1 time(s)
{'OP': '*'} # >= 0 time(s)
```

In [25]:
pattern = [
    {'LEMMA':'buy'},
    {'POS':'DET', 'OP':'?'},
    {'POS':'NOUN'},
]

doc = nlp("I bought a smartphone. Now I'm buying apps.")

In [26]:
matching_pattern(doc, pattern)

bought a smartphone
buying apps


In [27]:
syntactic_dep(doc)

I PRON nsubj bought
bought VERB ROOT bought
a DET det smartphone
smartphone NOUN dobj bought
. PUNCT punct bought
Now ADV advmod buying
I PRON nsubj buying
'm VERB aux buying
buying VERB ROOT buying
apps NOUN dobj buying
. PUNCT punct buying


# 2. Data structures: Vocab, Lexemes & StringStore

## Shared Vocab & string store
Vocab: stores data shared across multiple documents

Strings are only stored onced in `StringStore` via `nlp.vocab.strings`

In [28]:
coffee_hash = nlp.vocab.strings['coffee']
coffee_hash

3197928453018144401

In [29]:
'''These 2 will return KeyError'''
# coffee_string = nlp.vocab.strings[coffee_hash]
# nlp.vocab.strings[3197928453018144401]

'These 2 will return KeyError'

In [30]:
doc = nlp("I love coffee")
print('Coffee hash value:', nlp.vocab.strings['coffee'])
print('String value:', nlp.vocab.strings[3197928453018144401])

Coffee hash value: 3197928453018144401
String value: coffee


## Lexemes: entries in the vocabulary

- Context-independent info
    - Text `lexeme.text` and hash `lexeme.orth`

In [31]:
doc = nlp("I love coffee")
lexeme = nlp.vocab['coffee']

print('Text: {0}; Hash: {1}; Alphabet: {2}'.format(lexeme.text, lexeme.orth, lexeme.is_alpha))

Text: coffee; Hash: 3197928453018144401; Alphabet: True


# 2.1. Data Structures: Doc, Span & Token

In [32]:
from spacy.lang.en import English
from spacy.tokens import Doc, Span

nlp = English()

In [33]:
words = ['Hello', 'world', '!']
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
doc

Hello world!

In [34]:
span = Span(doc, 0, 2)
span

Hello world

In [35]:
span_with_label = Span(doc, 0, 2, label='GREETING')
span_with_label

Hello world

In [36]:
doc.ents = [span_with_label]
doc.ents

(Hello world,)

# 2.2. Word vectors & semantic similarity

In [37]:
nlp = spacy.load('en_core_web_md')

In [38]:
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")

print(doc1.similarity(doc2))

0.8627204117787385


## Compare 2 tokens

In [39]:
doc = nlp("I like pizza and pasta")

token1 = doc[2]
token2 = doc[4]

print(token1.similarity(token2))

0.73695457


## Compare doc with span

In [40]:
doc = nlp("I like pizza")
token = nlp("soap")[0]

print(doc.similarity(token))

0.32531983166759537


## Compare span with doc

In [41]:
span = nlp("I like pizza & pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.6199092090831612


## Word vector

spaCy predict similarity using word vectors

In [42]:
nlp = spacy.load('en_core_web_md')

In [43]:
doc = nlp('I have a banana')

print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [44]:
doc1 = nlp('I like cats')
doc2 = nlp('I hate cats')

doc1.similarity(doc2)

0.9501446702124066

# 2.3. Combining models & rules

1. Rule-based systems
2. Statistical models

In [45]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

## Statistical prediction

In [46]:
def matching_pattern_md(doc, pattern, method):
    import spacy
    from spacy.matcher import Matcher, PhraseMatcher
    
    nlp = spacy.load('en_core_web_md')
    
    if method == 'Matcher':
        matcher = Matcher(nlp.vocab)
    elif method == 'PhraseMatcher':
        matcher = PhraseMatcher(nlp.vocab)
    else:
        raise ValueError("Matcher method must be Matcher / PhraseMatcher")
    
    matcher.add('TEMP_PATTERN', None, pattern)
    matches = matcher(doc)
    
    for match_id, start, end in matches:
        span = doc[start:end]
        print('Matched span:', span.text)
        print('Root token:', span.root.text)
        print('Root head token:', span.root.head.text)
        print('Previous token:', doc[start-1].text, doc[start-1].pos_)

In [47]:
doc = nlp('I have a Golden Retriever')
doc

I have a Golden Retriever

In [48]:
pattern = [{'lower':'golden'}, {'lower':'retriever'}]
pattern

[{'lower': 'golden'}, {'lower': 'retriever'}]

In [49]:
%%time
matching_pattern_md(doc, pattern, 'Matcher')

Matched span: Golden Retriever
Root token: Retriever
Root head token: have
Previous token: a DET
CPU times: user 13.6 s, sys: 444 ms, total: 14.1 s
Wall time: 14.1 s


In [50]:
pattern = nlp('Golden retriever')
pattern

Golden retriever

In [51]:
%%time
matching_pattern_md(doc, pattern, 'PhraseMatcher')

CPU times: user 13.9 s, sys: 392 ms, total: 14.3 s
Wall time: 14.3 s


# 3. Processing pipelines

In [52]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [53]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f7cecbd6128>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f7cecb45a08>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f7cecb45a68>)]

# 3.1. Custom pipeline components

In [54]:
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x7f7cecc1cda0>

In [55]:
def custom_component(doc):
    print('Doc length:', len(doc))
    return doc

In [56]:
nlp.add_pipe(custom_component, first=True)

print('Pipeline:', nlp.pipe_names)

Pipeline: ['custom_component', 'tagger', 'parser', 'ner']


In [57]:
doc = nlp('Hello world!')
doc

Doc length: 3


Hello world!

# 3.2. Extension attributes

1. Attribute extensions
2. Property extensions
3. Method extensions

In [58]:
from spacy.tokens import Doc, Token, Span

Doc.set_extension('title', default=None)
Token.set_extension('is_color', default=False)
Span.set_extension('has_color', default=False)

## Attribute extensions

In [59]:
doc = nlp('The Sky is blue.')
doc[3]._.is_color=True

Doc length: 5


## Property extensions

In [60]:
def get_is_color(token):
    colors = ['red', 'yellow', 'blue']
    return token.text in colors

In [61]:
Token.set_extension('is_color', getter=get_is_color, force=True)

In [62]:
print(doc[3].text, '-', doc[3]._.is_color)

blue - True


In [63]:
def get_is_color(span):
    colors = ['red', 'yellow', 'blue']
    return any(token.text in colors for token in span)

In [64]:
Span.set_extension('has_color', getter=get_is_color, force=True)

In [65]:
doc = nlp('The sky is blue.')

print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[2:5]._.has_color, '-', doc[2:5].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

Doc length: 5
True - sky is blue
True - is blue.
False - The sky


## Method extensions

In [66]:
def has_token(doc, token_text):
    in_doc = token_text in [token.text for token in doc]
    return in_doc

In [67]:
Doc.set_extension('has_token', method=has_token, force=True)

In [68]:
print(doc._.has_token('blue'), '- blue')
print(doc._.has_token('cloud'), '- cloud')

True - blue
False - cloud


# 3.3. Scaling and performance

In [69]:
data = [
    ('This is a text', {'id':1, 'page_number':15}),
    ('Add another text', {'id':2, 'page_number':16}),
]

data

[('This is a text', {'id': 1, 'page_number': 15}),
 ('Add another text', {'id': 2, 'page_number': 16})]

In [70]:
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_number'])

Doc length: 4
Doc length: 3
This is a text 15
Add another text 16


In [71]:
Doc.set_extension('id', default=None)
Doc.set_extension('page_number', default=None)

In [72]:
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.page_number = context['page_number']
    print(doc)

Doc length: 4
Doc length: 3
This is a text
Add another text


## Using only tokenizer

In [73]:
nlp.make_doc('Hello world!')

Hello world!

## Disable pipeline components

In [74]:
with nlp.disable_pipes('tagger', 'parser'):
    doc = nlp('Hello world!')
    print(doc.ents)

Doc length: 3
()


In [75]:
nlp.pipe_names

['custom_component', 'tagger', 'parser', 'ner']

# 4. Training and updating models

In [76]:
import random

In [77]:
training_data = [
    ('iPhone X is coming', {'entities': [(0,8,'GADGET')]}),
    ('I need a new phone! Any tips?', {'entities': []}),
    ('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]}),
]

In [78]:
'''Blank English model'''
nlp = spacy.blank('en')

'''Create blank entity recognizer and add it to the pipeline'''
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

'''Add a new label'''
ner.add_label('GADGET')

nlp.begin_training()

for i in range(10):
    '''Shuffle the training data'''
    random.shuffle(training_data)
    
    '''Create batches and iterate over them'''
    for batch in spacy.util.minibatch(training_data):
        
        '''Split the batch in texts and annotations'''
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        
        '''Update the model'''
        nlp.update(texts, annotations)

In [79]:
nlp.to_disk('./C4-model')