In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

In [3]:
model_name = "huawei-noah/TinyBERT_General_4L_312D"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
token = tokenizer("apple is fruit", return_tensors="pt")
res = model(**token)

In [5]:
res.last_hidden_state.shape

torch.Size([1, 5, 312])

In [9]:
res.last_hidden_state.mean(dim=1).shape

torch.Size([1, 312])

In [15]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPUを使用します")
else:
    device = torch.device("cpu")
    print("CPUを使用します")

CPUを使用します


In [18]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)


def get_embedding(words):
    batch_inputs = tokenizer(
        words, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        outputs = model(**batch_inputs)
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding


king = get_embedding("king")
man = get_embedding("man")
woman = get_embedding("woman")
result_vector = king - man + woman

all_tokens = list(tokenizer.get_vocab().keys())
batch_size = 64


for i in range(0, len(all_tokens), batch_size):
    batch_tokens = all_tokens[i : i + batch_size]
    batch_embeddings = get_embedding(batch_tokens)

    similarities = cosine_similarity(
        result_vector.detach().cpu().numpy(), batch_embeddings.detach().cpu().numpy()
    )[0]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
token = tokenizer(
    "king", return_tensors="pt", padding=True, truncation=True
).to(device)

In [17]:
similarities

array([0.55840147, 0.6652018 , 0.6143114 , 0.63101745, 0.6371211 ,
       0.6728245 , 0.66778827, 0.52089566, 0.63109976, 0.66336316,
       0.63431394, 0.7135271 , 0.6066072 , 0.6550435 , 0.6415851 ,
       0.65195054, 0.6812757 , 0.642771  , 0.61123794, 0.6321051 ,
       0.6149322 , 0.6675673 , 0.6672232 , 0.5330621 , 0.5436189 ,
       0.6749656 , 0.6916433 , 0.6354941 , 0.56208   , 0.6408452 ,
       0.71697   , 0.5943855 , 0.5774625 , 0.62896574, 0.6116928 ,
       0.5814994 , 0.6971441 , 0.6188058 , 0.6501277 , 0.6107091 ,
       0.6467501 , 0.65286314, 0.662601  , 0.64830476, 0.67283964,
       0.62762755, 0.59436667, 0.6499065 , 0.5352727 , 0.65918213,
       0.6722705 , 0.5560658 , 0.59670675, 0.6494635 , 0.58055043,
       0.6238123 , 0.6203469 , 0.5345448 ], dtype=float32)