In [1]:
import torch
from transformers import BertModel, BertTokenizer

# 加载BERT模型
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 准备文本数据
text = "今天天气很好"

# 对文本进行向量化
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([ids])

# 输出文本的向量
outputs = model(input_ids)
token_embeddings = outputs[0]

# 获取句向量
sentence_embedding = token_embeddings.mean(dim=0)


In [2]:
print(sentence_embedding)

tensor([[ 0.2198,  0.5749, -0.3634,  ..., -0.2595,  0.2330,  0.0302],
        [ 0.4496,  0.3183, -0.3683,  ..., -0.2013,  0.4386,  0.1510],
        [ 0.3543,  0.3345, -0.3863,  ..., -0.2165,  0.4080,  0.0833],
        [ 0.1203,  0.2754, -0.2645,  ..., -0.1849,  0.3217, -0.0200],
        [ 0.1726,  0.3314, -0.2153,  ..., -0.1739,  0.3235, -0.0773],
        [ 0.1882,  0.4181, -0.2274,  ..., -0.1342,  0.3366, -0.1297]],
       grad_fn=<MeanBackward1>)


In [3]:
print(sentence_embedding.shape)

torch.Size([6, 768])


In [6]:
# 准备文本数据
texts = ["今天天气很好", "明天天气不好", "五月天假唱了"] #这个地方要对齐，

# 对文本进行分词
tokens = [tokenizer.tokenize(text) for text in texts]
ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens]
input_ids = torch.tensor(ids)

# 输出文本的向量
outputs = model(input_ids)
token_embeddings = outputs[0]

# 获取句向量
sentence_embeddings = [token_embeddings.mean(dim=0) for token_embeddings in token_embeddings]

# 返回句向量列表
print(sentence_embeddings)

[tensor([ 2.5079e-01,  3.7543e-01, -3.0419e-01, -3.6443e-01,  3.0078e-01,
        -4.5230e-02,  1.4587e-01, -8.4467e-01,  5.1839e-02, -7.8189e-01,
         5.0748e-01,  2.1065e-01, -6.4598e-01, -1.9690e-01, -1.9568e-01,
         3.4044e-01, -3.2356e-01, -3.8992e-01, -2.5289e-01,  6.9511e-01,
        -3.4540e-01,  4.5250e-02,  3.3896e-01,  6.3786e-01, -4.7721e-01,
        -1.1556e-01,  7.4538e-01, -7.5272e-01, -3.6391e-01,  1.5228e-01,
         1.0886e+00, -1.1385e+00,  1.0800e-01,  5.1022e-01,  8.1114e-01,
        -5.1515e-01,  8.3949e-01, -4.3929e-02,  3.8489e-01,  3.0871e-02,
        -9.3378e-01, -1.2632e-01, -3.7384e-01,  7.1108e-01, -6.0542e-02,
        -3.9813e-01, -8.8559e-02, -1.3796e-03,  3.8387e-01,  4.2330e-01,
        -1.1456e-01, -2.5296e-01,  8.6588e-04,  9.2688e-01,  1.0949e-01,
         2.8386e-01, -3.7456e-02, -7.0649e-01, -2.3412e-01, -2.9271e-01,
         5.4393e-01,  2.2664e-01,  1.2028e-01,  2.9624e-01,  1.9472e-01,
         2.4033e-01, -3.1605e-01,  9.2761e-01, -1.

In [12]:
# 输入多个句子，注意句子的长度可以不同
sentences = ["你好吗？", "我正在学习NLP，很有趣！", "BERT对于自然语言处理非常强大。"]

# 使用分词器将句子转换为BERT输入格式
input_ids = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)['input_ids']
attention_mask = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)['attention_mask']
print(input_ids)
print(attention_mask)

# 获取BERT模型的输出
with torch.no_grad():
    outputs = model(input_ids)

# 获取句子的BERT向量表示
last_hidden_states = outputs.last_hidden_state

# 如果你希望得到整个句子的表示，可以使用平均池化
sentence_embeddings = torch.mean(last_hidden_states, dim=1)
#

# 打印结果
print(sentence_embeddings)

tensor([[  101,   100,   100,   100,  1994,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  1855,  1888,   100,  1817,   100, 17953,  2361,  1989,   100,
          1873,   100,  1986,   102,     0,     0],
        [  101, 14324,   100,   100,   100,   100,   100,   100,   100,   100,
           100,   100,   100,  1810,  1636,   102]])
tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[ 0.0499, -0.2938,  0.5120,  ..., -0.3198,  0.3154, -0.6843],
        [-0.0844, -0.2669,  0.4338,  ..., -0.2038,  0.0258, -0.5052],
        [ 0.2271,  0.4600, -0.0378,  ..., -0.2690,  0.5697, -0.2693]])
