# Vectores contextuales con BERT
En este notebook vamos a aplicar un modelo BERT pre-entrenado sobre un texto y vamos a analizar los embeddings generados para ver cómo se adaptan al contexto de cada término.

In [None]:
from transformers import AutoTokenizer, TFAutoModel

# Hugging Face checkpoint shared by the tokenizer and the model.
CHECKPOINT = "bert-base-cased"

# The Auto* classes are used here; the original notes say BertTokenizer and
# TFBertModel could be used explicitly instead.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = TFAutoModel.from_pretrained(CHECKPOINT)


In [None]:
# Show the summary of the pre-trained model loaded above.
model.summary()

Analizamos un texto en el que aparece 3 veces el término "bank" con dos significados diferentes

In [None]:
# Sentence in which "bank" appears three times with two different meanings.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# Tokenize the sentence, asking for TensorFlow tensors.
encodings = tokenizer(text, return_tensors="tf")


In [None]:
# Display the tokenizer output for the sentence.
encodings

In [None]:
# Length of the first (and only) row of input ids, i.e. the number of tokens
# produced for the sentence.
len(encodings.input_ids[0])

Hacemos inferencia con el modelo pre-entrenado y obtenemos los embeddings de cada token en la última capa del *encoder*

In [None]:
# Run the pre-trained model on the token ids only; no other entry of
# `encodings` is passed to the model.
# NOTE(review): despite its name, `last_hidden_states` holds the whole output
# object; its fields (`last_hidden_state`, `pooler_output`) are read by name
# in the following cells.
last_hidden_states = model.predict(encodings['input_ids'])

In [None]:
# List the fields available in the model output.
last_hidden_states.keys()

In [None]:
# Shape of the last-layer token embeddings. Later cells index this array as
# [document, token, component], so three axes are expected.
last_hidden_states.last_hidden_state.shape

In [None]:
# Shape of the pooled output field.
# NOTE(review): presumably one vector per input sequence - confirm against
# the model documentation.
last_hidden_states.pooler_output.shape

Vemos los tokens generados en el texto

In [None]:
import pandas as pd

# Pair every token id of the sentence with its string form, one row per
# position, so that token positions can be read off the table index.
sentence_ids = encodings['input_ids'][0].numpy().tolist()
tokens = [
    {'token_id': token_id, 'token': tokenizer.convert_ids_to_tokens(token_id)}
    for token_id in sentence_ids
]

pd.DataFrame(tokens)

In [None]:
# Drop the leading axis: keep the token vectors of the first (and only)
# document in the batch.
token_vectors = last_hidden_states.last_hidden_state[0]
token_vectors.shape

In [None]:
# First 10 components of the vector at position 6, which the notebook takes
# to be the first "bank" - verify the position against the token table.
token_vectors[6,:10]

In [None]:
# First 10 components of the vector at position 10, which the notebook takes
# to be the second "bank" - verify the position against the token table.
token_vectors[10,:10]

In [None]:
# First 10 components of the vector at position 21, which the notebook takes
# to be the third "bank" - verify the position against the token table.
token_vectors[21,:10]

Miramos la similitud (coseno) entre los distintos tokens para "bank"

In [None]:
from scipy.spatial.distance import cosine

# Last-layer vectors at the three positions the notebook uses for "bank",
# named after the neighbouring word in the sentence.
vault_bank = token_vectors[6]
robber_bank = token_vectors[10]
river_bank = token_vectors[21]

# scipy's `cosine` is a distance, so 1 - distance is the cosine similarity.
# "bank robber" vs "bank vault": same meaning.
same_bank = 1 - cosine(robber_bank, vault_bank)
# "bank robber" vs "river bank": different meaning.
diff_bank = 1 - cosine(robber_bank, river_bank)

print(f'Vector similarity for  *similar*  meanings:  {same_bank:.2f}')
print(f'Vector similarity for *different* meanings:  {diff_bank:.2f}')

In [None]:
# Compare the last-layer vector at position 3 ("money" per the original notes)
# with the vectors of "bank" in its two contexts. The variable names are kept
# from the previous cell; the printed labels now name the pair actually being
# compared instead of the copy-pasted "similar/different meanings" wording.
diff_bank = 1 - cosine(token_vectors[3], token_vectors[21])  # money vs bank (river)
same_bank = 1 - cosine(token_vectors[3], token_vectors[6])   # money vs bank (vault)

print(f'Vector similarity between money and bank (vault):  {same_bank:.2f}')
print(f'Vector similarity between money and bank (river):  {diff_bank:.2f}')

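Para comparar, vamos a comprobar ahora la similitud entre los embeddings de estos mismos tokens al principio del modelo: a la salida de la capa de embeddings (la entrada a la primera capa de atención) y después de la primera capa de atención.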

In [None]:
from transformers import AutoConfig

# Reload the same checkpoint with a configuration that asks the model to
# return the hidden states of every layer, not only the last one.
config = AutoConfig.from_pretrained(
    "bert-base-cased",
    output_hidden_states=True,
)
model = TFAutoModel.from_pretrained("bert-base-cased", config=config)

# Same input as before: only the token ids are fed to the model.
outputs = model.predict(encodings.input_ids)

In [None]:
# List the fields of the new output, now that hidden states were requested.
outputs.keys()

In [None]:
# Per-layer hidden states, read by field name (the third entry of the output).
# The collection holds the embedding output followed by the output of each
# encoder layer.
hidden_states = outputs.hidden_states
# Number of hidden-state tensors available.
len(hidden_states)

In [None]:
# Shape of hidden_states[0]: the embedding output that is fed into the first
# encoder layer (not the output of the first layer).
hidden_states[0].shape

In [None]:
# First 10 components of token 6 at index 12, taken here as the last layer.
# NOTE(review): assumes len(hidden_states) == 13 - confirm with the cell that
# prints the length.
hidden_states[12][:,6,:10]

In [None]:
# First 10 components of token 6 at the input to the first layer
# (hidden_states[0]).
hidden_states[0][:,6,:10]

In [None]:
# First 10 components of token 10 in hidden_states[0], i.e. at the input to
# the first layer.
hidden_states[0][:,10,:10]

In [None]:
# First 10 components of token 21 in hidden_states[0], i.e. at the input to
# the first layer.
hidden_states[0][:,21,:10]

En la salida de la capa de embeddings (`hidden_states[0]`, es decir, la entrada a la primera capa del encoder):

In [None]:
# Vectors of the three "bank" positions in hidden_states[0], the input to the
# first encoder layer; each is flattened to 1-D for scipy.
embedding_output = hidden_states[0]
vault_bank, robber_bank, river_bank = (
    embedding_output[:, position, :].ravel() for position in (6, 10, 21)
)

# "bank robber" vs "bank vault": same meaning.
same_bank = 1 - cosine(robber_bank, vault_bank)
# "bank robber" vs "river bank": different meaning.
diff_bank = 1 - cosine(robber_bank, river_bank)

print(f'Vector similarity for  *similar*  meanings:  {same_bank:.4f}')
print(f'Vector similarity for *different* meanings:  {diff_bank:.4f}')

En la salida de la primera capa del encoder (`hidden_states[1]`):

In [None]:
# Vectors of the three "bank" positions in hidden_states[1], the entry that
# follows the first-layer input; each is flattened to 1-D for scipy.
first_layer_output = hidden_states[1]
vault_bank, robber_bank, river_bank = (
    first_layer_output[:, position, :].ravel() for position in (6, 10, 21)
)

# "bank robber" vs "bank vault": same meaning.
same_bank = 1 - cosine(robber_bank, vault_bank)
# "bank robber" vs "river bank": different meaning.
diff_bank = 1 - cosine(robber_bank, river_bank)

print(f'Vector similarity for  *similar*  meanings:  {same_bank:.4f}')
print(f'Vector similarity for *different* meanings:  {diff_bank:.4f}')

Los embeddings iniciales de los tokens del vocabulario se pueden obtener del modelo con:

In [None]:
# Number of entries in the tokenizer vocabulary.
len(tokenizer.vocab)

In [None]:
# Retrieve the model's input embedding layer and display it.
input_embeddings = model.get_input_embeddings()
input_embeddings

In [None]:
# Number of weight tensors held by the input embedding layer.
len(input_embeddings.weights)

The first three elements are the word embedding weights, token type embedding weights, and positional embedding weights. The last two are the gamma and beta of the normalization layer.

In [None]:
# Print the shape of each weight tensor of the embedding layer, one per line.
weight_shapes = [weight.shape for weight in input_embeddings.weights]
for shape in weight_shapes:
    print(shape)

In [None]:
# Shape of the first weight tensor, which the notebook's note identifies as
# the word embedding matrix.
# NOTE(review): presumably (vocabulary size, hidden size) - confirm.
input_embeddings.weights[0].shape

In [None]:
# Static (context-independent) input embedding of "bank". The row is looked up
# from the token string instead of a hard-coded vocabulary id (previously
# 3085), so the cell stays correct if the checkpoint or vocabulary changes.
bank_id = tokenizer.convert_tokens_to_ids("bank")
input_embeddings.weights[0][bank_id]

In [None]:
# Similarity between the static input embeddings of "money" and "bank".
# These vectors are taken straight from the embedding matrix, before any
# context is applied, so there is a single "bank" vector here (no "river" or
# "vault" sense). Token ids are resolved through the tokenizer instead of the
# hard-coded 1948 / 3085. The variable name `diff_bank` is kept from the
# earlier cells.
word_embeddings = input_embeddings.weights[0]
money_id, bank_id = tokenizer.convert_tokens_to_ids(["money", "bank"])
diff_bank = 1 - cosine(word_embeddings[money_id], word_embeddings[bank_id])

print(f'Vector similarity between money and bank (embedding):  {diff_bank:.2f}')