From d82766be1d22308a2b167e5e53f0add484e2012b Mon Sep 17 00:00:00 2001
From: Jiaqi Liu
Date: Tue, 28 Dec 2021 16:01:32 +0800
Subject: [PATCH] Update general distill in ppminilm (#1520)

---
 examples/model_compression/pp-minilm/README.md     |  2 +-
 .../pp-minilm/general_distill/README.md            |  4 ++--
 .../pp-minilm/general_distill/general_distill.py   | 16 ++++++++--------
 .../pp-minilm/general_distill/run.sh               |  2 +-
 paddlenlp/transformers/distill_utils.py            |  6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/model_compression/pp-minilm/README.md b/examples/model_compression/pp-minilm/README.md
index ed77ffc9993d..b8234c33f11e 100644
--- a/examples/model_compression/pp-minilm/README.md
+++ b/examples/model_compression/pp-minilm/README.md
@@ -81,7 +81,7 @@ The PP-MiniLM compression scheme is based on task-agnostic knowledge distillation of pretrained models (Task-a
 │ └── run_clue.sh          # Launcher script for fine-tuning on CLUE
 │ └── run_one_search.sh    # Fine-tuning script for a single dataset
 │ └── run_all_search.sh    # Fine-tuning script for the CLUE datasets
-│ └── export_model.sh      # Script for exporting the fine-tuned deployment model
+│ └── export_model.py      # Script for exporting the fine-tuned deployment model
 ├── pruning                # Pruning and distillation directory
 │ └── prune.py             # Pruning and distillation script
 │ └── prune.sh             # Launcher script for pruning and distillation
diff --git a/examples/model_compression/pp-minilm/general_distill/README.md b/examples/model_compression/pp-minilm/general_distill/README.md
index df8767f5a50a..13f8c66dd010 100644
--- a/examples/model_compression/pp-minilm/general_distill/README.md
+++ b/examples/model_compression/pp-minilm/general_distill/README.md
@@ -29,9 +29,9 @@ cd ..
 
 The arguments of `general_distill.py` are described as follows:
 
-- `model_type`: the student model type; currently only 'ernie' and 'roberta' are supported.
+- `model_type`: the student model type; currently only 'ppminilm' and 'roberta' are supported.
 - `num_relation_heads`: the number of relation heads, usually 64 for a large-size teacher model and 48 for a base-size teacher model.
-- `teacher_model_type`: the teacher model type; currently only 'ernie' and 'roberta' are supported.
+- `teacher_model_type`: the teacher model type; currently only 'roberta' is supported.
 - `teacher_layer_index`: the layer of the teacher model used during distillation
 - `student_layer_index`: the layer of the student model used during distillation
 - `teacher_model_name_or_path`: the name of the teacher model, e.g. `'roberta-wwm-ext-large'`
diff --git a/examples/model_compression/pp-minilm/general_distill/general_distill.py b/examples/model_compression/pp-minilm/general_distill/general_distill.py
index 81f04f5e889f..d324bdbb7556 100644
--- a/examples/model_compression/pp-minilm/general_distill/general_distill.py
+++ b/examples/model_compression/pp-minilm/general_distill/general_distill.py
@@ -32,12 +32,12 @@
 from paddlenlp.utils.tools import TimeCostAverage
 from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.transformers import RobertaModel, RobertaTokenizer
-from paddlenlp.transformers import ErnieModel, ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import PPMiniLMModel, PPMiniLMForSequenceClassification, PPMiniLMTokenizer
 from paddlenlp.transformers.distill_utils import to_distill, calc_multi_relation_loss
 
 MODEL_CLASSES = {
     "roberta": (RobertaModel, RobertaTokenizer),
-    "ernie": (ErnieForSequenceClassification, ErnieTokenizer)
+    "ppminilm": (PPMiniLMForSequenceClassification, PPMiniLMTokenizer)
 }
 
@@ -47,14 +47,14 @@ def parse_args():
     # Required parameters
     parser.add_argument(
         "--model_type",
-        default="ernie",
+        default="ppminilm",
         type=str,
         required=True,
         help="Model type selected in the list: " +
         ", ".join(MODEL_CLASSES.keys()), )
     parser.add_argument(
         "--teacher_model_type",
-        default="ernie",
+        default="roberta",
         type=str,
         required=True,
         help="Model type selected in the list: " +
@@ -276,14 +276,14 @@ def do_train(args):
     # For student
     model_class, _ = MODEL_CLASSES[args.model_type]
     if args.num_layers == 6:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=6,
             hidden_act='relu',
             intermediate_size=3072,
             hidden_size=768)  # layer: 6
     elif args.num_layers == 4:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=4,
             hidden_act='relu',
@@ -291,13 +291,13 @@ def do_train(args):
             hidden_size=256,
             num_attention_heads=16)  # layer: 4
     else:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=2,
             hidden_act='relu',
             hidden_size=128,
             intermediate_size=512)  # layer: 2
-    student = model_class(ernie)
+    student = model_class(ppminilm)
 
     teacher = teacher_model_class.from_pretrained(
         args.teacher_model_name_or_path)
diff --git a/examples/model_compression/pp-minilm/general_distill/run.sh b/examples/model_compression/pp-minilm/general_distill/run.sh
index 3db0d135973b..be940e7c6d8b 100644
--- a/examples/model_compression/pp-minilm/general_distill/run.sh
+++ b/examples/model_compression/pp-minilm/general_distill/run.sh
@@ -47,7 +47,7 @@ cp ../../../../paddlenlp/transformers/distill_utils.py ${output_dir}/
 
 python3 -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" general_distill.py \
-    --model_type ernie \
+    --model_type ppminilm \
     --num_relation_heads ${numH} \
     --teacher_model_type ${teacher} \
     --teacher_layer_index ${teacher_layer_index} \
diff --git a/paddlenlp/transformers/distill_utils.py b/paddlenlp/transformers/distill_utils.py
index 3f67c0d022b1..c83cd8b045fd 100644
--- a/paddlenlp/transformers/distill_utils.py
+++ b/paddlenlp/transformers/distill_utils.py
@@ -21,7 +21,7 @@
 from paddle.fluid.data_feeder import convert_dtype
 
 from paddlenlp.utils.log import logger
-from paddlenlp.transformers import ErnieForSequenceClassification
+from paddlenlp.transformers import PPMiniLMForSequenceClassification
 from paddlenlp.transformers import TinyBertForPretraining
 from paddlenlp.transformers import BertForSequenceClassification
 
@@ -208,7 +208,7 @@ def to_distill(self,
     if return_qkv:
         # forward function of student class should be replaced for distributed training.
         TinyBertForPretraining._forward = minilm_pretraining_forward
-        ErnieForSequenceClassification._forward = minilm_pretraining_forward
+        PPMiniLMForSequenceClassification._forward = minilm_pretraining_forward
     else:
         TinyBertForPretraining._forward = tinybert_forward
 
@@ -216,7 +216,7 @@ def init_func(layer):
         if isinstance(layer, (MultiHeadAttention, TransformerEncoderLayer,
                               TransformerEncoder, TinyBertForPretraining,
                               BertForSequenceClassification,
-                              ErnieForSequenceClassification)):
+                              PPMiniLMForSequenceClassification)):
             layer.forward = layer._forward
         if isinstance(layer, TransformerEncoder):
             layer.return_layer_outputs = return_layer_outputs
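
For reference, the snippet below is a minimal usage sketch (not part of the patch) of how the renamed PP-MiniLM classes fit together after this change. It mirrors the 6-layer student branch of general_distill.py; the tokenizer name 'roberta-wwm-ext-large' is taken from the README example, and the exact to_distill call pattern (return value, any layer-selection arguments) is an assumption rather than a verbatim excerpt of the script.

# Minimal sketch, assuming only the APIs shown in the hunks above.
from paddlenlp.transformers import (PPMiniLMModel,
                                    PPMiniLMForSequenceClassification,
                                    RobertaTokenizer)
from paddlenlp.transformers.distill_utils import to_distill

# The teacher tokenizer supplies the vocabulary size, as in general_distill.py.
tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext-large')

# 6-layer student, mirroring the `args.num_layers == 6` branch of the diff.
ppminilm = PPMiniLMModel(
    vocab_size=tokenizer.vocab_size,
    num_hidden_layers=6,
    hidden_act='relu',
    intermediate_size=3072,
    hidden_size=768)
student = PPMiniLMForSequenceClassification(ppminilm)

# return_qkv=True is the path in distill_utils.py where
# PPMiniLMForSequenceClassification._forward is replaced with
# minilm_pretraining_forward, so Q/K/V tensors can be collected for
# calc_multi_relation_loss during distributed training.
student = to_distill(student, return_qkv=True)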