In [16]:
import gezi
from gezi import tqdm

In [2]:
import torch
from transformers import *

# Transformers has a unified API
# for 10 transformer architectures and 30 pretrained weights.
#          Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          (CTRLModel,       CTRLTokenizer,       'ctrl'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]

# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`

# Let's encode some text in a sequence of hidden-states using each model:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [3]:
last_hidden_statees

NameError: name 'last_hidden_statees' is not defined

In [4]:
from transformers import ReformerTokenizer, ReformerModel
import torch

tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment', return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

ImportError: cannot import name 'ReformerTokenizer'

In [1]:
import torch
from transformers import *

In [7]:
pretrained_weights = 'roberta-base'
tokenizer_class = RobertaTokenizer
model_class = RobertaModel

In [41]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [42]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [43]:
def to_vec(sent):
  input_sentence = torch.tensor(tokenizer.encode("[CLS] " + sent)).unsqueeze(0)
  out = model(input_sentence)
  embeddings_of_last_layer = out[0]
  cls_embeddings = embeddings_of_last_layer[0]
  return cls_embeddings.detach().numpy()[0]

In [44]:
gezi.cosine(to_vec('I love beijing'), to_vec('you could do that'))

0.9989729

In [22]:
to_vec('I love beijing')

array([-7.39808008e-02,  8.41250569e-02, -6.36343518e-03, -1.29973292e-01,
        5.30971177e-02, -8.84733051e-02, -2.64015906e-02,  1.90576296e-02,
        3.44603285e-02, -8.72797295e-02, -1.67811140e-02,  4.47641760e-02,
        4.04301733e-02, -4.37791981e-02,  7.44287223e-02,  6.78659528e-02,
       -8.34153146e-02,  2.19657421e-02,  1.65514909e-02, -1.54037056e-02,
       -1.16682246e-01,  3.78034636e-02, -4.36001904e-02,  9.05232579e-02,
       -9.05306637e-03,  9.90579464e-03,  9.81228724e-02,  7.33805597e-02,
       -4.81140316e-02,  8.14239029e-03, -2.23927815e-02, -3.24736387e-02,
        3.57123949e-02, -4.91988249e-02,  3.26975286e-02,  5.12624420e-02,
        7.26121664e-02, -3.80731281e-03, -1.19732223e-01,  1.39283128e-02,
       -2.67313868e-02,  6.49320260e-02,  2.68958881e-02, -1.07663451e-02,
        8.95077884e-02,  3.32027264e-02, -4.50687530e-03,  1.91730261e-02,
       -3.05878203e-02,  2.02493630e-02,  1.09997485e-02,  5.52218929e-02,
       -3.77188101e-02, -

In [23]:
 to_vec('I hate beijing')

array([-6.10325634e-02,  8.55136141e-02, -5.57759497e-03, -1.28400862e-01,
        5.23129180e-02, -9.89680141e-02, -2.76119839e-02,  1.61924306e-02,
        4.63809185e-02, -8.10004696e-02, -2.09209174e-02,  4.53663655e-02,
        4.13174853e-02, -5.33537827e-02,  6.74365386e-02,  5.65480255e-02,
       -8.27705115e-02,  1.56748928e-02,  1.50247430e-02, -1.83106512e-02,
       -1.16284728e-01,  4.33934703e-02, -6.10640198e-02,  8.16112012e-02,
        1.93249923e-03,  1.64508261e-02,  8.66040289e-02,  8.96299034e-02,
       -5.06370105e-02,  7.54478248e-03, -2.40560658e-02, -3.12479157e-02,
        3.97363603e-02, -5.27532697e-02,  3.36972550e-02,  6.12153970e-02,
        6.01794273e-02, -8.00269627e-05, -1.25953913e-01,  3.15684709e-03,
       -2.84808557e-02,  7.10700527e-02,  1.80797987e-02, -1.46361068e-02,
        8.74670297e-02,  1.92072969e-02, -7.42857577e-03,  1.22667672e-02,
       -3.30863409e-02,  1.71683524e-02,  8.00394733e-03,  6.23466671e-02,
       -3.83008644e-02, -

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [26]:
last_hidden_states.shape

torch.Size([1, 8, 768])

In [32]:
def to(sent):
  input_ids = torch.tensor(tokenizer.encode('[CLS] ' + sent)).unsqueeze(0)  # Batch size 1
  outputs = model(input_ids)
  last_hidden_states = outputs[0]
  emb = last_hidden_states[0][0]
  return emb.detach().numpy()

In [38]:
gezi.cosine(to('macbook pro'), to('where is you'))

0.82390374