In [1]:
# Restrict which GPUs this kernel may see. A `!export ...` line only sets the
# variable inside a throwaway subshell, so it never reaches the Python process;
# writing to os.environ does. This must happen before any CUDA initialisation.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,3"

import torch
from transformers import AutoTokenizer, AutoModel

# Local directory holding the bge-small-zh checkpoint
bge_path = "/media/wuyuhuan/bge-small-zh"

# Sentences we want sentence embeddings for
sentences = ["样例数据-1", "样例数据-2"]

# Load tokenizer and model from the local checkpoint directory (not the Hub)
tokenizer = AutoTokenizer.from_pretrained(bge_path)
model = AutoModel.from_pretrained(bge_path)
model.eval()  # evaluation mode: this cell only runs inference

# Tokenize sentences; every row is padded/truncated to exactly 512 token IDs
encoded_input = tokenizer(sentences, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
# For s2p (short query to long passage) retrieval, prepend an instruction to the
# queries only (passages get no instruction), e.g.:
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings without tracking gradients
with torch.no_grad():
    model_output = model(**encoded_input)
    # CLS pooling: take position 0 of each sequence from the first model output
    sentence_embeddings = model_output[0][:, 0]
# L2-normalize each embedding row
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


Sentence embeddings: tensor([[-0.0141, -0.0318,  0.0410,  ..., -0.0393, -0.0152,  0.0084],
        [-0.0252, -0.0326,  0.0487,  ..., -0.0431, -0.0079,  0.0234]])


In [2]:
# Tokenize a (CV, JD) sentence pair and check that the token IDs are laid out as
# [CLS] cv_token_ids... [SEP] jd_token_ids... [SEP], followed by padding up to max_length.

cv_text = "简历简历简历简历简历简历简历简历。 简历简历简历简历简历简历。"
jd_text = "工作描述工作描述工作描述工作描述工作描述工作描述工作描述工作描述。工作工作工作工作。工作很辛苦，但是很有意义。"
# Same (cv, jd) tuple as before, built from the variables instead of repeating the literals
text_pair = (cv_text, jd_text)
encoded_input = tokenizer(text=cv_text, text_pair=jd_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
print(encoded_input)

# Actually verify the layout described above instead of only eyeballing the printout
pair_ids = encoded_input["input_ids"][0]
assert pair_ids[0].item() == tokenizer.cls_token_id, "sequence should start with [CLS]"
assert (pair_ids == tokenizer.sep_token_id).sum().item() == 2, "expected one [SEP] after each segment"

# Decode the token IDs back to text
decoded_input = tokenizer.decode(pair_ids)
print(decoded_input)

{'input_ids': tensor([[ 101, 5042, 1325, 5042, 1325, 5042, 1325, 5042, 1325, 5042, 1325, 5042,
         1325, 5042, 1325, 5042, 1325,  511, 5042, 1325, 5042, 1325, 5042, 1325,
         5042, 1325, 5042, 1325, 5042, 1325,  511,  102, 2339,  868, 2989, 6835,
         2339,  868, 2989, 6835, 2339,  868, 2989, 6835, 2339,  868, 2989, 6835,
         2339,  868, 2989, 6835, 2339,  868, 2989, 6835, 2339,  868, 2989, 6835,
         2339,  868, 2989, 6835,  511, 2339,  868, 2339,  868, 2339,  868, 2339,
          868,  511, 2339,  868, 2523, 6789, 5736, 8024,  852, 3221, 2523, 3300,
         2692,  721,  511,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

In [5]:
# Show the dimensions of the token-ID tensor held in `encoded_input`
# (presumably the pair encoding from the previous cell — confirm execution order).
encoded_input["input_ids"].size()

torch.Size([1, 512])

'[SEP]'