diff --git a/bert4torch/pipelines/base.py b/bert4torch/pipelines/base.py
index f50ead30..bee5cbd7 100644
--- a/bert4torch/pipelines/base.py
+++ b/bert4torch/pipelines/base.py
@@ -1,4 +1,4 @@
-from typing import List, Union, Dict
+from typing import List, Union, Dict, Literal
 import numpy as np
 import os
 import torch
@@ -11,21 +11,26 @@
 class PipeLineBase:
     '''Base class
     '''
-    def __init__(self, checkpoint_path:str, device:str=None, **kwargs) -> None:
+    def __init__(self, checkpoint_path:str, device:str=None, tokenizer_type:Literal['b4t', 'hf']='b4t', **kwargs) -> None:
         self.checkpoint_path = checkpoint_path
         self.config_path = kwargs.get('config_path') or checkpoint_path
         if device is None:
             self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         else:
             self.device = device
+
+        if (tokenizer_type == 'b4t') and os.path.exists(os.path.join(self.checkpoint_path, 'vocab.txt')):
+            self.tokenizer_type = 'b4t'
+        else:
+            self.tokenizer_type = 'hf'
         self.tokenizer = self.build_tokenizer()
         self.model = self.build_model(kwargs)
         self.config = self.model.config
 
     def build_tokenizer(self):
-        vocab_path = os.path.join(self.checkpoint_path, 'vocab.txt')
-        if os.path.exists(vocab_path):
-            tokenizer = Tokenizer(vocab_path, do_lower_case=True)
+        # TODO: prefer the built-in Tokenizer by default; fall back to AutoTokenizer when there is no vocab file. May change later.
+        if self.tokenizer_type == 'b4t':
+            tokenizer = Tokenizer(os.path.join(self.checkpoint_path, 'vocab.txt'), do_lower_case=True)
         else:
             from transformers import AutoTokenizer
             tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_path)
diff --git a/bert4torch/pipelines/text2vec.py b/bert4torch/pipelines/text2vec.py
index 9428c670..e2ac04ea 100644
--- a/bert4torch/pipelines/text2vec.py
+++ b/bert4torch/pipelines/text2vec.py
@@ -46,18 +46,22 @@ def encode(
         all_embeddings = []
         for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
             sentences_batch = sentences_sorted[start_index: start_index + batch_size]
-            batch = self.tokenizer(sentences_batch, maxlen=max_seq_length)
-            batch_input = [torch.tensor(sequence_padding(item), dtype=torch.long, device=self.device) for item in batch]
-            output = self.model(batch_input)
+            if self.tokenizer_type == 'b4t':
+                batch_input = self.tokenizer(sentences_batch, max_length=max_seq_length, return_tensors='pt').to(self.device)
+                output = self.model(batch_input)
+            else:
+                batch_input = self.tokenizer(sentences_batch, max_length=max_seq_length, return_tensors='pt').to(self.device)
+                output = self.model(**batch_input)
 
             last_hidden_state = output.get('last_hidden_state')
             pooler = output.get('pooled_output')
             if isinstance(batch_input, list):
-                attention_mask = (batch_input[0] != self.tokenizer._token_pad_id).long()
+                attention_mask = (batch_input[0] != self.tokenizer.pad_token_id).long()
             elif isinstance(batch_input, torch.Tensor):
-                attention_mask = (batch_input != self.tokenizer._token_pad_id).long()
-            else:
-                raise TypeError('Args `batch_input` only support list(tensor)/tensor format')
+                attention_mask = (batch_input != self.tokenizer.pad_token_id).long()
+            else:  # dict-like batch
+                attention_mask = batch_input.get('attention_mask', (batch_input['input_ids'] != self.tokenizer.pad_token_id).long())
+
             pool_strategy = pool_strategy or self.pool_strategy
             embs = get_pool_emb(last_hidden_state, pooler, attention_mask, pool_strategy, custom_layer)
             if normalize_embeddings:
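The mask-derivation fallback added to `encode()` above can be illustrated standalone. A minimal sketch of the same three-way dispatch (the function name and sample token ids are illustrative, not part of the patch):

```python
import torch

def derive_attention_mask(batch_input, pad_token_id):
    # list of tensors: build the mask from the first tensor (token ids)
    if isinstance(batch_input, list):
        return (batch_input[0] != pad_token_id).long()
    # a single tensor of token ids
    elif isinstance(batch_input, torch.Tensor):
        return (batch_input != pad_token_id).long()
    # dict-like batch (e.g. a transformers BatchEncoding): prefer the
    # tokenizer's own attention_mask, else rebuild it from input_ids
    else:
        return batch_input.get('attention_mask',
                               (batch_input['input_ids'] != pad_token_id).long())

# a padded batch of token ids with pad_token_id = 0
ids = torch.tensor([[101, 2769, 102, 0],
                    [101, 872, 812, 102]])
print(derive_attention_mask(ids, pad_token_id=0))
# tensor([[1, 1, 1, 0],
#         [1, 1, 1, 1]])
```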
diff --git a/bert4torch/snippets/hub.py b/bert4torch/snippets/hub.py
index d99ad695..bc095ccb 100644
--- a/bert4torch/snippets/hub.py
+++ b/bert4torch/snippets/hub.py
@@ -6,7 +6,7 @@
 import json
 from pathlib import Path
 from typing import Union, Optional, Dict
-from torch4keras.snippets import log_error_once, log_info_once, log_error, is_safetensors_available
+from torch4keras.snippets import log_error_once, log_info_once, log_error, is_safetensors_available, check_file_modified
 
 
 if os.environ.get('SAFETENSORS_FIRST', False):
@@ -163,7 +163,9 @@ def snapshot_download(
             )
         if resolved_file.endswith('config.json'):
             storage_folder = os.path.dirname(resolved_file)
-            log_info_once(f'Download {repo_id} to {storage_folder}')
+            if check_file_modified(resolved_file, duration=2):
+                # only log when the file was modified within the last 2s, i.e. freshly downloaded; skip for cached files
+                log_info_once(f'Download {repo_id} to {storage_folder}')
         if os.path.exists(resolved_file + ".lock"):
             os.remove(resolved_file + ".lock")
         return storage_folder
@@ -194,7 +196,9 @@
                 user_agent = user_agent,
                 endpoint = HF_ENDPOINT
             )
-            log_info_once(f'Download {repo_id} to {resolved_file}')
+            if check_file_modified(resolved_file, duration=2):
+                # only log when the file was modified within the last 2s, i.e. freshly downloaded; skip for cached files
+                log_info_once(f'Download {repo_id} to {resolved_file}')
         except EntryNotFoundError:
             log_error(
                 f"{repo_id} does not appear to have a file named {filename}. Checkout "
diff --git a/docs/History.md b/docs/History.md
index c038b4ec..c8cbe9ff 100644
--- a/docs/History.md
+++ b/docs/History.md
@@ -1,5 +1,7 @@
 ## Update History
 
+- **20240403**: fix a Text2Vec bug
+- **20240331**: fix chatglm3 bugs, fix the multi-file bug in save_pretrained, add CausalLMLoss, rework the deepspeed argument-passing logic
 - **20240317**: fix the config_path bug, allow the num_key_value_heads parameter
 - **20240316**: add the get_weight_decay_optim_groups function, allow is_causal in attention, fix the repetition_penalty bug, split baichuan out of llama, [torch4keras-v0.2.1](https://github.com/Tongjilibo/torch4keras/releases/tag/v0.2.1) feature updates
 - **20240216**: allow the fastapi service to offload to cpu when idle, allow `build_transformer_model` to download from hf, add the `FillMask` pipeline, add `SequenceClassificationTrainer`
diff --git a/examples/basic/embedding/basic_bge.py b/examples/basic/embedding/basic_bge.py
index 6a3e9e93..71a590fd 100644
--- a/examples/basic/embedding/basic_bge.py
+++ b/examples/basic/embedding/basic_bge.py
@@ -1,5 +1,8 @@
+import os
+os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
 # root_model_path = 'E:\pretrain_ckpt\embedding\BAAI@bge-large-en-v1.5'
-root_model_path = 'E:\pretrain_ckpt\embedding\BAAI@bge-large-zh-v1.5'
+root_model_path = 'BAAI/bge-large-zh-v1.5'
+# root_model_path = '/data/pretrain_ckpt/embedding/BAAI--bge-large-zh-v1.5'
 sentences_1 = ["样例数据-1", "样例数据-2"]
 sentences_2 = ["样例数据-3", "样例数据-4"]
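With the `tokenizer_type` dispatch and the hub changes above, an end-to-end call might look like the following sketch. It assumes `Text2vec` is the pipeline class exported from `bert4torch.pipelines` (the class name is not shown in this patch):

```python
from bert4torch.pipelines import Text2vec  # assumed export; defined in bert4torch/pipelines/text2vec.py

# tokenizer_type='b4t' is only honored when the checkpoint ships a vocab.txt;
# otherwise PipeLineBase silently falls back to the HF AutoTokenizer
# (see the dispatch added in PipeLineBase.__init__ above)
text2vec = Text2vec('BAAI/bge-large-zh-v1.5', tokenizer_type='b4t')
embeddings = text2vec.encode(["样例数据-1", "样例数据-2"], normalize_embeddings=True)
print(embeddings.shape)
```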