Update retrieval based classification README.md #3322

Merged: 8 commits, Sep 22, 2022
Changes from 3 commits
@@ -37,7 +37,6 @@
|—— base_model.py # Base class for the semantic indexing models
|—— train.py # Main training script for the In-batch Negatives strategy
|—— model.py # Core network structure of the In-batch Negatives strategy
|—— ann_util.py # Functions for building the ANN index

|—— recall.py # Recalls texts similar to a given text from the recall corpus, using the trained semantic indexing model
|—— evaluate.py # Computes evaluation metrics from the recall results and the evaluation set
@@ -167,7 +166,7 @@ unzip baike_qa_category.zip

### Single-GPU / multi-GPU training (single machine)

Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0, 1, 2 and 3. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.
Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0 and 1. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.

To train on CPU, remove the `--gpus` argument and set `device` to cpu; see the training settings in the train.sh file for details. A minimal CPU sketch follows.
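For reference, a minimal sketch of the corresponding CPU command. It reuses the argument values from the GPU example below and drops the distributed launcher; this exact form is an assumption, not part of the diff:

```
root_path=inbatch
data_path=data
# CPU sketch: no paddle.distributed.launch / --gpus, and --device set to cpu;
# the remaining arguments mirror the GPU command below.
python -u train.py \
    --device cpu \
    --save_dir ./checkpoints/${root_path} \
    --train_set_file ${data_path}/train.txt \
    --corpus_file ${data_path}/label.txt \
    --similar_text_pair_file ${data_path}/dev.txt \
    --evaluate True
```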

@@ -176,7 +175,7 @@
```
root_path=inbatch
data_path=data
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
python -u -m paddle.distributed.launch --gpus "0,1" \
train.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \

This file was deleted (ann_util.py; its build_index helper was moved into data.py, as the import changes below show).

@@ -13,10 +13,49 @@
# limitations under the License.

import os

import hnswlib
import numpy as np
import paddle
from paddlenlp.utils.log import logger


def build_index(corpus_data_loader, model, output_emb_size, hnsw_max_elements,
hnsw_ef, hnsw_m):

index = hnswlib.Index(space='ip',
dim=output_emb_size if output_emb_size > 0 else 768)

# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=hnsw_max_elements,
ef_construction=hnsw_ef,
M=hnsw_m)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(hnsw_ef)

# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(corpus_data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index


def create_dataloader(dataset,
mode='train',
batch_size=1,
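For orientation, a standalone sketch of how an index like the one returned by build_index above is queried. The data and parameter values are random placeholders; only the hnswlib calls mirror the real code:

```
import hnswlib
import numpy as np

dim = 256
corpus_emb = np.random.rand(1000, dim).astype("float32")

# Mirror the build_index setup: inner-product space; HNSW parameters are illustrative only
index = hnswlib.Index(space="ip", dim=dim)
index.init_index(max_elements=1000, ef_construction=100, M=100)
index.set_ef(100)
index.add_items(corpus_emb)

query_emb = np.random.rand(4, dim).astype("float32")
# knn_query returns (labels, distances); with space="ip" the distance is 1 - inner product
labels, distances = index.knn_query(query_emb, k=10)
print(labels.shape, distances.shape)  # (4, 10) (4, 10)
```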
@@ -32,23 +32,25 @@
help="The path of model parameter in static graph to be saved.")
parser.add_argument("--output_emb_size", default=0,
type=int, help="output_embedding_size")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-dureader-query-encoder',
type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable

if __name__ == "__main__":
# If you want to use the ernie1.0 model, please uncomment the following code
pretrained_model = AutoModel.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
tokenizer = AutoTokenizer.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SemanticIndexBaseStatic(pretrained_model,
output_emb_size=args.output_emb_size)

if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)

else:
raise ValueError(
"Please set --params_path with correct pretrained model file")
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
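With the new --model_name_or_path argument, the static-graph export can be pointed at any compatible encoder. A hypothetical invocation (the script name and checkpoint path are assumptions, not taken from this diff):

```
python export_model.py \
    --params_path checkpoints/inbatch/model_best/model_state.pdparams \
    --model_name_or_path rocketqa-zh-dureader-query-encoder
```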
@@ -45,6 +45,8 @@
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--pad_to_max_seq_len", action="store_true",
help="Whether to pad to max seq length.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-dureader-query-encoder',
type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable

@@ -77,8 +79,7 @@ def predict(model, data_loader):
if __name__ == "__main__":
paddle.set_device(args.device)

tokenizer = AutoTokenizer.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
@@ -101,8 +102,7 @@ def predict(model, data_loader):
batch_size=args.batch_size,
batchify_fn=batchify_fn,
trans_fn=trans_func)
pretrained_model = AutoModel.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SemanticIndexBase(pretrained_model,
output_emb_size=args.output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
@@ -63,6 +63,8 @@
type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-dureader-query-encoder',
type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable

@@ -71,8 +73,7 @@
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = AutoTokenizer.from_pretrained(
'rocketqa-zh-dureader-query-encoder')
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_corpus_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length)
@@ -82,8 +83,7 @@
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"
), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SemanticIndexBase(pretrained_model,
output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
@@ -106,7 +106,12 @@
trans_fn=trans_func)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
final_index = build_index(corpus_data_loader,
inner_model,
output_emb_size=args.output_emb_size,
hnsw_max_elements=args.hnsw_max_elements,
hnsw_ef=args.hnsw_ef,
hnsw_m=args.hnsw_m)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(query_ds,
@@ -1,7 +1,21 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# GPU training
root_path=inbatch
data_path=data
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
python -u -m paddle.distributed.launch --gpus "0,1" \
train.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \
@@ -30,7 +30,7 @@
from model import SemanticIndexBatchNeg
from data import read_text_pair, convert_example, create_dataloader, gen_id2corpus, gen_text_file, convert_corpus_example
from data import convert_label_example
from ann_util import build_index
from data import build_index

# yapf: disable
parser = argparse.ArgumentParser()
@@ -62,19 +62,16 @@
parser.add_argument('--log_steps', type=int, default=10,
help="Inteval steps to print log")
parser.add_argument("--train_set_file", type=str,
default='./recall/train.csv',
default='./data/train.txt',
help="The full path of train_set_file.")
parser.add_argument("--dev_set_file", type=str,
default='./recall/dev.csv',
help="The full path of dev_set_file.")
parser.add_argument("--margin", default=0.2, type=float,
help="Margin beteween pos_sample and neg_samples")
parser.add_argument("--scale", default=30, type=int,
help="Scale for pair-wise margin_rank_loss")
parser.add_argument("--corpus_file", type=str, default='./recall/corpus.csv',
parser.add_argument("--corpus_file", type=str, default='./data/label.txt',
help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str,
default='./recall/dev.csv',
default='./data/dev.txt',
help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='./recall_result_dir',
help="The full path of recall result file to save")
@@ -113,7 +110,12 @@ def evaluate(model, corpus_data_loader, query_data_loader, recall_result_file,
text_list, id2corpus):
# Load pretrained semantic model
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
final_index = build_index(corpus_data_loader,
inner_model,
output_emb_size=args.output_emb_size,
hnsw_max_elements=args.hnsw_max_elements,
hnsw_ef=args.hnsw_ef,
hnsw_m=args.hnsw_m)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
with open(recall_result_file, 'w', encoding='utf-8') as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
@@ -29,7 +29,6 @@
|—— base_model.py # Base class for the semantic indexing models
|—— train.py # Main training script for the In-batch Negatives strategy
|—— model.py # Core network structure of the In-batch Negatives strategy
|—— ann_util.py # Functions for building the ANN index

|—— recall.py # Recalls texts similar to a given text from the recall corpus, using the trained semantic indexing model
|—— evaluate.py # Computes evaluation metrics from the recall results and the evaluation set
@@ -147,7 +146,7 @@ unzip webtext2019zh_qa.zip

### Single-GPU / multi-GPU training (single machine)

Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0, 1, 2 and 3. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.
Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0 and 1. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.

To train on CPU, remove the `--gpus` argument and set `device` to cpu; see the training settings in the train.sh file for details (the same CPU variant sketched earlier applies here).

@@ -156,7 +155,7 @@
```
root_path=inbatch
data_path=data
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
python -u -m paddle.distributed.launch --gpus "0,1" \
train.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \
@@ -172,7 +171,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
--recall_result_file "recall_result.txt" \
--train_set_file ${data_path}/train.txt \
--corpus_file ${data_path}/label.txt \
--similar_text_pair ${data_path}/dev.txt \
--similar_text_pair_file ${data_path}/dev.txt \
--evaluate True
```


This file was deleted (the second copy of ann_util.py, likewise superseded by build_index in data.py).
