Closed
27 commits
3541fc7
e2e integration
LeoZhao-Intel Dec 5, 2024
5f4bcc3
Merge branch 'develop' into e2e_integration
LeoZhao-Intel Dec 5, 2024
7bfa060
minimal fix
LeoZhao-Intel Dec 5, 2024
a5822bc
Merge branch 'PaddlePaddle:develop' into e2e_integration
LeoZhao-Intel Dec 9, 2024
ee0bc14
refine code and remove unnecessary changes
LeoZhao-Intel Dec 9, 2024
86975f9
reference pp kernels implementation
JianyuLi01 Dec 16, 2024
cd1964b
integrate all fused ops, first fused op still need to optimize on
LeoZhao-Intel Dec 16, 2024
3776a92
Merge branch 'pr-1' into e2e_integration
LeoZhao-Intel Dec 16, 2024
43fb86f
align final rms_norm shape with expected.
LeoZhao-Intel Dec 16, 2024
4522e3f
e2e shape fix
yanfeich Dec 18, 2024
bf09d1c
e2e bugs fix
yanfeich Dec 20, 2024
ad31fca
e2e bugs fix2
yanfeich Dec 23, 2024
9b537f3
e2e w/a cast place
yanfeich Dec 24, 2024
e7d8b0a
e2e batchsize left padding fix
yanfeich Jan 6, 2025
47e1a98
Refine _post_process code in generation_utils.py
JianyuLi01 Jan 8, 2025
41a26fd
Merge pull request #1 from JianyuLi01/e2e_brach_leo_43fb86f41
yanfeich Jan 8, 2025
87c9844
combine k/v together
yanfeich Jan 8, 2025
ad594ec
shape and benchmark
yanfeich Jan 9, 2025
2974c87
Refine Decoder Post-Processing HPU path code
JianyuLi01 Jan 14, 2025
94e0ffe
fix multi-card perf benchmark hang issue
LeoZhao-Intel Jan 15, 2025
6935984
batch tokens early stop
yanfeich Jan 15, 2025
92ab063
optimize for-loop index
yanfeich Jan 16, 2025
528f8ef
fsdpa and rmsNorm updates
yanfeich Feb 5, 2025
a583406
code refine to reduce cpu operations
LeoZhao-Intel Feb 12, 2025
9b44b32
PP refine to remove cpu/gpu copy on python
LeoZhao-Intel Feb 13, 2025
b7fc1f6
support static graph mode execution
LeoZhao-Intel Feb 17, 2025
9fea1c9
use cpu to concat all next_tokens to avoid compile and improve perf
LeoZhao-Intel Feb 19, 2025
70 changes: 43 additions & 27 deletions llm/predict/predictor.py
@@ -577,29 +577,39 @@ def _preprocess(self, source):
            )
 
        else:
-            for i in range(inputs["input_ids"].shape[0]):
-                length = inputs["seq_len_encoder"][i][0]
-                if self.attention_mask is not None:
-                    self.attention_mask[i, 0, :length, :length] = paddle.tril(
-                        paddle.ones(shape=(length, length), dtype=self.config.dtype)
-                    )
-
-                if pre_caches_length > 0:
-                    if self.config.prefix_path is None:
-                        prefix_attention_mask = paddle.zeros(
-                            [1, length, pre_caches_length], dtype=self.attention_mask.dtype
-                        )
-                    else:
-                        prefix_attention_mask = paddle.ones(
-                            [1, length, pre_caches_length], dtype=self.attention_mask.dtype
-                        )
-                    post_attention_mask = paddle.tril(
-                        paddle.ones(shape=(length, length), dtype=self.attention_mask.dtype)
-                    ).unsqueeze_(axis=0)
+            if inputs["attention_mask"] is not None:
+                bsz, src_len = inputs["attention_mask"].shape
+                causal_4d_mask = paddle.tril(
+                    paddle.ones(shape=(bsz, 1, self.config.total_max_length, self.config.total_max_length), dtype=self.config.dtype)
+                )
+                attention_mask_2d = paddle.ones(shape=(bsz, self.config.total_max_length), dtype='int64')
+                attention_mask_2d[:,0:src_len] = inputs["attention_mask"]
+                bool_mask = attention_mask_2d != 1
+                expanded_attn_mask = bool_mask[:, None, None, :].expand([bsz, 1, self.config.total_max_length, self.config.total_max_length])
+                self.attention_mask = causal_4d_mask.masked_fill(expanded_attn_mask, 0)
+            else:
+                for i in range(inputs["input_ids"].shape[0]):
+                    length = inputs["seq_len_encoder"][i][0]
                     if self.attention_mask is not None:
-                        self.attention_mask[i, 0, :length, : length + pre_caches_length] = paddle.concat(
-                            [prefix_attention_mask, post_attention_mask], axis=2
+                        self.attention_mask[i, 0, :length, :length] = paddle.tril(
+                            paddle.ones(shape=(1, 1, length, length), dtype=self.config.dtype)
                         )
+                    if pre_caches_length > 0:
+                        if self.config.prefix_path is None:
+                            prefix_attention_mask = paddle.zeros(
+                                [1, length, pre_caches_length], dtype=self.attention_mask.dtype
+                            )
+                        else:
+                            prefix_attention_mask = paddle.ones(
+                                [1, length, pre_caches_length], dtype=self.attention_mask.dtype
+                            )
+                        post_attention_mask = paddle.tril(
+                            paddle.ones(shape=(length, length), dtype=self.attention_mask.dtype)
+                        ).unsqueeze_(axis=0)
+                        if self.attention_mask is not None:
+                            self.attention_mask[i, 0, :length, : length + pre_caches_length] = paddle.concat(
+                                [prefix_attention_mask, post_attention_mask], axis=2
+                            )
 
        inputs["pre_ids"] = self.pre_ids
        inputs["attention_mask"] = self.attention_mask
@@ -1295,6 +1305,7 @@ def create_predictor(
 ):
     tokenizer = AutoTokenizer.from_pretrained(
         predictor_args.model_name_or_path,
+        padding_side="left"
     )
     # init chat_template for tokenizer
     llm_utils.init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)
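
Left padding matters here because a decoder-only model continues generation from the last position of each row; with right padding that position would be a pad token for the shorter prompts in a batch. A small illustrative check (the checkpoint name is only a placeholder, not one mandated by this PR):

```python
from paddlenlp.transformers import AutoTokenizer

# Placeholder checkpoint; any decoder-only chat model behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat", padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(["hello", "a noticeably longer prompt"], padding=True, return_tensors="pd")
# With padding_side="left", pads sit at the front of each row, so the final
# position of every sequence is a real token for the decoder to continue from.
print(batch["input_ids"])
```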
@@ -1304,7 +1315,6 @@
         tokenizer.pad_token = tokenizer.eos_token
 
     config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
-
     max_position_embeddings = llm_utils.get_model_max_position_embeddings(config)
     if max_position_embeddings is None:
         max_position_embeddings = predictor_args.src_length + predictor_args.max_length
@@ -1451,13 +1461,13 @@ def predict():
 def benchmark(predictor, predictor_args, model_args):
     # Just construct a simple benchmark input. We pad input to the src_length.
     test_texts = "hello world, how are you?"
-    benchmark_texts = [test_texts + "<pad>" * predictor_args.src_length for _ in range(predictor_args.batch_size)]
+    benchmark_texts = [test_texts for _ in range(predictor_args.batch_size)]
 
     batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
     print("***********Start Benchmark**********")
 
-    warmup_time = 5
-    test_time = 20
+    warmup_time = 1
+    test_time = 5
 
     print("***********Start Warmup**********")
     for _ in range(warmup_time):
@@ -1469,8 +1479,14 @@ def benchmark(predictor, predictor_args, model_args):
     output_tokens = 0
     for _ in range(test_time):
         for bs, batch_source_text in enumerate(batch_benchmark_texts):
-            outputs, batch_tokens = predictor.predict(batch_source_text, return_tokens=True)
-            output_tokens += sum([len(tokens) for tokens in batch_tokens])
+            if paddle.distributed.get_rank() == 0:
+                outputs, batch_tokens = predictor.predict(batch_source_text, return_tokens=True)
+                output_tokens += sum([len(tokens) for tokens in batch_tokens])
+            else:
+                outputs = predictor.predict(batch_source_text)
+
+    if outputs == None:
+        return
     end = time.perf_counter()
     print("Avg Elapse time is: ", (end - start) / test_time)
     print("Output tokens is: ", output_tokens)