## Data exploration

In [60]:
import pandas as pd
import numpy as np

In [61]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # use GPU

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased")

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer, device=device)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
vocab = tokenizer.vocab
id2vocab = dict((value, key) for key, value in vocab.items())

In [63]:
cos_sum = lambda a, b: np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_embedding(sentence):
  embedding = np.array(pipe(sentence))[0]
  encoded_dict = tokenizer.encode(sentence)

  word2embedding = dict()

  repeated_words = dict()

  for i, e in enumerate(encoded_dict):
    word = id2vocab[e]
    if word in word2embedding:
      word2embedding[word].append(embedding[i])
    else:
      word2embedding[word] = [embedding[i]]

  return word2embedding

In [64]:
sentence_a = 'The commodity futures is traded at twenty basis points'
sentence_b = 'A commodity futures contract is an agreement to buy or sell a particular commodity at a future date'
sentence_c = 'Like English, German has a future perfect tense that is used to talk about what will in the future be a past event'

a = get_embedding(sentence_a)['future'][0] # The tokenizer break down futures into future and ##s
b = get_embedding(sentence_b)['future'][0] # Take the first future's embedding (there are two "future" in sentence_b)
c = get_embedding(sentence_c)['future'][0]

print('a vs b', cos_sum(a,b))
print('a vs c', cos_sum(a,c))
print('b vs c', cos_sum(b,c))


a vs b 0.9628126261283041
a vs c 0.8273704534093114
b vs c 0.8151347365201804


In [65]:
sentence_a = 'The spot price of gold is very volatile'
sentence_b = 'The margin is calculated using the spot price of gold'
sentence_c = 'I spot a theft running out of a store'

a = get_embedding(sentence_a)['spot'][0]
b = get_embedding(sentence_b)['spot'][0]
c = get_embedding(sentence_c)['spot'][0]

print('a vs b', cos_sum(a,b))
print('a vs c', cos_sum(a,c))
print('b vs c', cos_sum(b,c))

a vs b 0.9780781908739328
a vs c 0.8350452357267761
b vs c 0.8232886264930577


In [66]:
sentence = 'If the head navigator needs the head, he should head this way'

embedding = get_embedding(sentence)

a = embedding['head'][0]
b = embedding['head'][1]
c = embedding['head'][2]


print('a vs b', cos_sum(a,b))
print('a vs c', cos_sum(a,c))
print('b vs c', cos_sum(b,c))

a vs b 0.9551920629970923
a vs c 0.8995614738705667
b vs c 0.9225908007956791


In [67]:
sentence = 'The jet leaves a jet of jet black smoke behind it'

embedding = get_embedding(sentence)

a = embedding['jet'][0]
b = embedding['jet'][1]
c = embedding['jet'][2]

print('a vs b', cos_sum(a,b))
print('a vs c', cos_sum(a,c))
print('b vs c', cos_sum(b,c))

a vs b 0.8956356534507922
a vs c 0.9336614277150953
b vs c 0.8939412732807073
