Update retrieval based classification README.md #3322

Merged: 8 commits, Sep 22, 2022
Changes from 3 commits
@@ -37,7 +37,6 @@
|—— base_model.py # Base class for the semantic indexing models
|—— train.py # Main training script for the In-batch Negatives strategy
|—— model.py # Core network structure of the In-batch Negatives strategy
|—— ann_util.py # Functions for building the ANN index

|—— recall.py # Recalls texts similar to a given text from the recall corpus, using the trained semantic indexing model
|—— evaluate.py # Computes evaluation metrics from the recall results and the evaluation set
@@ -167,7 +166,7 @@ unzip baike_qa_category.zip

### Single-GPU / multi-GPU training (single machine)

Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0, 1, 2 and 3. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.
Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0 and 1. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.

To train on CPU, remove the `--gpus` argument and set `device` to cpu; see the training settings in the train.sh file for details. A minimal CPU sketch follows.
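For reference, a minimal sketch of the corresponding CPU command. It reuses the argument values from the GPU example below and drops the distributed launcher; this exact form is an assumption, not part of the diff:

```
root_path=inbatch
data_path=data
# CPU sketch: no paddle.distributed.launch / --gpus, and --device set to cpu;
# the remaining arguments mirror the GPU command below.
python -u train.py \
    --device cpu \
    --save_dir ./checkpoints/${root_path} \
    --train_set_file ${data_path}/train.txt \
    --corpus_file ${data_path}/label.txt \
    --similar_text_pair_file ${data_path}/dev.txt \
    --evaluate True
```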

@@ -176,7 +175,7 @@
```
root_path=inbatch
data_path=data
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
python -u -m paddle.distributed.launch --gpus "0,1" \
train.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \

This file was deleted (ann_util.py; its build_index helper was moved into data.py, as the import changes below show).

@@ -13,10 +13,49 @@
# limitations under the License.

import os

import hnswlib
import numpy as np
import paddle
from paddlenlp.utils.log import logger


def build_index(corpus_data_loader, model, output_emb_size, hnsw_max_elements,
hnsw_ef, hnsw_m):

index = hnswlib.Index(space='ip',
dim=output_emb_size if output_emb_size > 0 else 768)

# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=hnsw_max_elements,
ef_construction=hnsw_ef,
M=hnsw_m)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(hnsw_ef)

# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(corpus_data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index


def create_dataloader(dataset,
mode='train',
batch_size=1,
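For orientation, a standalone sketch of how an index like the one returned by build_index above is queried. The data and parameter values are random placeholders; only the hnswlib calls mirror the real code:

```
import hnswlib
import numpy as np

dim = 256
corpus_emb = np.random.rand(1000, dim).astype("float32")

# Mirror the build_index setup: inner-product space; HNSW parameters are illustrative only
index = hnswlib.Index(space="ip", dim=dim)
index.init_index(max_elements=1000, ef_construction=100, M=100)
index.set_ef(100)
index.add_items(corpus_emb)

query_emb = np.random.rand(4, dim).astype("float32")
# knn_query returns (labels, distances); with space="ip" the distance is 1 - inner product
labels, distances = index.knn_query(query_emb, k=10)
print(labels.shape, distances.shape)  # (4, 10) (4, 10)
```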
@@ -32,23 +32,25 @@
help="The path of model parameter in static graph to be saved.")
parser.add_argument("--output_emb_size", default=0,
type=int, help="output_embedding_size")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-dureader-query-encoder',
type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable

if __name__ == "__main__":
# If you want to use the ernie1.0 model, please uncomment the following code
pretrained_model = AutoModel.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
tokenizer = AutoTokenizer.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SemanticIndexBaseStatic(pretrained_model,
output_emb_size=args.output_emb_size)

if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)

else:
raise ValueError(
"Please set --params_path with correct pretrained model file")
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
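With the new --model_name_or_path argument, the static-graph export can be pointed at any compatible encoder. A hypothetical invocation (the script name and checkpoint path are assumptions, not taken from this diff):

```
python export_model.py \
    --params_path checkpoints/inbatch/model_best/model_state.pdparams \
    --model_name_or_path rocketqa-zh-dureader-query-encoder
```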
@@ -45,6 +45,8 @@
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--pad_to_max_seq_len", action="store_true",
help="Whether to pad to max seq length.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-dureader-query-encoder',
type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable

@@ -77,8 +79,7 @@ def predict(model, data_loader):
if __name__ == "__main__":
paddle.set_device(args.device)

tokenizer = AutoTokenizer.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
@@ -101,8 +102,7 @@ def predict(model, data_loader):
batch_size=args.batch_size,
batchify_fn=batchify_fn,
trans_fn=trans_func)
pretrained_model = AutoModel.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SemanticIndexBase(pretrained_model,
output_emb_size=args.output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
@@ -63,6 +63,8 @@
type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--model_name_or_path", default='rocketqa-zh-dureader-query-encoder',
type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable

@@ -71,8 +73,7 @@
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = AutoTokenizer.from_pretrained(
'rocketqa-zh-dureader-query-encoder')
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_corpus_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length)
@@ -82,8 +83,7 @@
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"
), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(
"rocketqa-zh-dureader-query-encoder")
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SemanticIndexBase(pretrained_model,
output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
@@ -106,7 +106,12 @@
trans_fn=trans_func)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
final_index = build_index(corpus_data_loader,
inner_model,
output_emb_size=args.output_emb_size,
hnsw_max_elements=args.hnsw_max_elements,
hnsw_ef=args.hnsw_ef,
hnsw_m=args.hnsw_m)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(query_ds,
@@ -1,7 +1,21 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# GPU training
root_path=inbatch
data_path=data
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
python -u -m paddle.distributed.launch --gpus "0,1" \
train.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \
@@ -30,7 +30,7 @@
from model import SemanticIndexBatchNeg
from data import read_text_pair, convert_example, create_dataloader, gen_id2corpus, gen_text_file, convert_corpus_example
from data import convert_label_example
from ann_util import build_index
from data import build_index

# yapf: disable
parser = argparse.ArgumentParser()
@@ -62,19 +62,16 @@
parser.add_argument('--log_steps', type=int, default=10,
help="Inteval steps to print log")
parser.add_argument("--train_set_file", type=str,
default='./recall/train.csv',
default='./data/train.txt',
help="The full path of train_set_file.")
parser.add_argument("--dev_set_file", type=str,
default='./recall/dev.csv',
help="The full path of dev_set_file.")
parser.add_argument("--margin", default=0.2, type=float,
help="Margin beteween pos_sample and neg_samples")
parser.add_argument("--scale", default=30, type=int,
help="Scale for pair-wise margin_rank_loss")
parser.add_argument("--corpus_file", type=str, default='./recall/corpus.csv',
parser.add_argument("--corpus_file", type=str, default='./data/label.txt',
help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str,
default='./recall/dev.csv',
default='./data/dev.txt',
help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='./recall_result_dir',
help="The full path of recall result file to save")
@@ -113,7 +110,12 @@ def evaluate(model, corpus_data_loader, query_data_loader, recall_result_file,
text_list, id2corpus):
# Load pretrained semantic model
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
final_index = build_index(corpus_data_loader,
inner_model,
output_emb_size=args.output_emb_size,
hnsw_max_elements=args.hnsw_max_elements,
hnsw_ef=args.hnsw_ef,
hnsw_m=args.hnsw_m)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
with open(recall_result_file, 'w', encoding='utf-8') as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
@@ -29,7 +29,6 @@
|—— base_model.py # Base class for the semantic indexing models
|—— train.py # Main training script for the In-batch Negatives strategy
|—— model.py # Core network structure of the In-batch Negatives strategy
|—— ann_util.py # Functions for building the ANN index

|—— recall.py # Recalls texts similar to a given text from the recall corpus, using the trained semantic indexing model
|—— evaluate.py # Computes evaluation metrics from the recall results and the evaluation set
@@ -147,7 +146,7 @@ unzip webtext2019zh_qa.zip

### Single-GPU / multi-GPU training (single machine)

Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0, 1, 2 and 3. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.
Training here uses multiple GPUs on a single machine; the following command assigns GPUs 0 and 1. For single-GPU training, simply set the `--gpus` argument to the ID of that single card.

To train on CPU, remove the `--gpus` argument and set `device` to cpu; see the training settings in the train.sh file for details (the same CPU variant sketched earlier applies here).

@@ -156,7 +155,7 @@
```
root_path=inbatch
data_path=data
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
python -u -m paddle.distributed.launch --gpus "0,1" \
train.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \
@@ -172,7 +171,7 @@ python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
--recall_result_file "recall_result.txt" \
--train_set_file ${data_path}/train.txt \
--corpus_file ${data_path}/label.txt \
--similar_text_pair ${data_path}/dev.txt \
--similar_text_pair_file ${data_path}/dev.txt \
--evaluate True
```


This file was deleted (the second copy of ann_util.py, likewise superseded by build_index in data.py).
