# spaCy Transformers Tutorial

Example extracted from:

- https://explosion.ai/blog/spacy-transformers
- https://github.com/explosion/spacy-transformers

In [1]:
import spacy
import torch
import numpy
from numpy.testing import assert_almost_equal

In [2]:
# Ask spaCy to prefer the GPU and keep the call's result as a flag for the check below.
# NOTE(review): presumably prefer_gpu() is truthy only when a GPU was actually
# activated — confirm against the spaCy documentation.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    # Only when the flag is set: switch PyTorch's default tensor type to CUDA floats.
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

In [3]:
# Load the "en_trf_bertbaseuncased_lg" pipeline and run it on one sample sentence.
# `nlp` is reused by the later similarity cells.
nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here is some text to encode.")
# NOTE(review): 768 is presumably the transformer's hidden width, and 7 the
# number of spaCy tokens in the sentence — confirm.
assert doc.tensor.shape == (7, 768)  # Always has one row per token
# The next three lines are bare attribute accesses shown for reference only:
# they are not the last expression of the cell, so the notebook displays nothing.
doc._.trf_word_pieces_  # String values of the wordpieces
doc._.trf_word_pieces  # Wordpiece IDs (note: *not* spaCy's hash values!)
doc._.trf_alignment  # Alignment between spaCy tokens and wordpieces
# The raw transformer output has one row per wordpiece.
assert len(doc._.trf_last_hidden_state) == len(doc._.trf_word_pieces)
# To avoid losing information, we calculate the doc.tensor attribute such that
# the sum-pooled vectors match (apart from numeric error)
# The check below compares the column sums of both arrays to 5 decimal places.
assert_almost_equal(doc.tensor.sum(axis=0), doc._.trf_last_hidden_state.sum(axis=0), decimal=5)
span = doc[2:4]
# Access the tensor from Span elements (especially helpful for sentences)
# The span's tensor must equal the matching rows (2 and 3) of doc.tensor exactly.
assert numpy.array_equal(span.tensor, doc.tensor[2:4])

In [4]:
# Token.similarity (like .vector) is backed by the transformer outputs, so the
# scores below depend on the sentence each token appears in.
apple1, apple2, apple3 = (
    nlp(sentence)
    for sentence in (
        "Apple shares rose on the news.",
        "Apple sold fewer iPhones this quarter.",
        "Apple pie is delicious.",
    )
)
# Compare the first token of the first sentence with the first token of each
# other sentence; the recorded run printed 0.7342854 and then 0.43365785.
for _other in (apple2, apple3):
    print(apple1[0].similarity(_other[0]))

0.7342854
0.43365785


# An Example Closer to Our HSBC Application

In [5]:
# Same experiment as the Apple cell, with sentences closer to the HSBC use case:
# Token.similarity is backed by the transformer outputs.
# NOTE(review): "altitute" looks like a typo for "altitude"; it is kept as-is
# because changing the input text would change the resulting scores.
hsbc1, hsbc2, hsbc3 = (
    nlp(sentence)
    for sentence in (
        "OECD is directing financial flows towards all SDGs.",
        "Water in rivers flow from high to low altitute.",
        "HSBC is supporting the flow of finance towards a green transition.",
    )
)
# NOTE(review): index 0 compares the *first* tokens of the sentences (presumably
# "OECD", "Water" and "HSBC"), not the shared word "flow(s)" — confirm that this
# is the comparison the example is meant to show.
# The recorded run printed 0.4038766 and then 0.6132948.
for _other in (hsbc2, hsbc3):
    print(hsbc1[0].similarity(_other[0]))

0.4038766
0.6132948
