Closed
27 commits
3541fc7
e2e integration
LeoZhao-Intel Dec 5, 2024
5f4bcc3
Merge branch 'develop' into e2e_integration
LeoZhao-Intel Dec 5, 2024
7bfa060
minimal fix
LeoZhao-Intel Dec 5, 2024
a5822bc
Merge branch 'PaddlePaddle:develop' into e2e_integration
LeoZhao-Intel Dec 9, 2024
ee0bc14
refine code and remove unnecessary changes
LeoZhao-Intel Dec 9, 2024
86975f9
reference pp kernels implementation
JianyuLi01 Dec 16, 2024
cd1964b
integrate all fused ops, first fused op still need to optimize on
LeoZhao-Intel Dec 16, 2024
3776a92
Merge branch 'pr-1' into e2e_integration
LeoZhao-Intel Dec 16, 2024
43fb86f
align final rms_norm shape with expected.
LeoZhao-Intel Dec 16, 2024
4522e3f
e2e shape fix
yanfeich Dec 18, 2024
bf09d1c
e2e bugs fix
yanfeich Dec 20, 2024
ad31fca
e2e bugs fix2
yanfeich Dec 23, 2024
9b537f3
e2e w/a cast place
yanfeich Dec 24, 2024
e7d8b0a
e2e batchsize left padding fix
yanfeich Jan 6, 2025
47e1a98
Refine _post_process code in generation_utils.py
JianyuLi01 Jan 8, 2025
41a26fd
Merge pull request #1 from JianyuLi01/e2e_brach_leo_43fb86f41
yanfeich Jan 8, 2025
87c9844
combine k/v together
yanfeich Jan 8, 2025
ad594ec
shape and benchmark
yanfeich Jan 9, 2025
2974c87
Refine Decoder Post-Processing HPU path code
JianyuLi01 Jan 14, 2025
94e0ffe
fix multi-card perf benchmark hang issue
LeoZhao-Intel Jan 15, 2025
6935984
batch tokens early stop
yanfeich Jan 15, 2025
92ab063
optimize for-loop index
yanfeich Jan 16, 2025
528f8ef
fsdpa and rmsNorm updates
yanfeich Feb 5, 2025
a583406
code refine to reduce cpu operations
LeoZhao-Intel Feb 12, 2025
9b44b32
PP refine to remove cpu/gpu copy on python
LeoZhao-Intel Feb 13, 2025
b7fc1f6
support static graph mode execution
LeoZhao-Intel Feb 17, 2025
9fea1c9
use cpu to concat all next_tokens to avoid compile and improve perf
LeoZhao-Intel Feb 19, 2025
70 changes: 43 additions & 27 deletions llm/predict/predictor.py
@@ -577,29 +577,39 @@ def _preprocess(self, source):
            )
 
        else:
-            for i in range(inputs["input_ids"].shape[0]):
-                length = inputs["seq_len_encoder"][i][0]
-                if self.attention_mask is not None:
-                    self.attention_mask[i, 0, :length, :length] = paddle.tril(
-                        paddle.ones(shape=(length, length), dtype=self.config.dtype)
-                    )
-
-                if pre_caches_length > 0:
-                    if self.config.prefix_path is None:
-                        prefix_attention_mask = paddle.zeros(
-                            [1, length, pre_caches_length], dtype=self.attention_mask.dtype
-                        )
-                    else:
-                        prefix_attention_mask = paddle.ones(
-                            [1, length, pre_caches_length], dtype=self.attention_mask.dtype
-                        )
-                    post_attention_mask = paddle.tril(
-                        paddle.ones(shape=(length, length), dtype=self.attention_mask.dtype)
-                    ).unsqueeze_(axis=0)
+            if inputs["attention_mask"] is not None:
+                bsz, src_len = inputs["attention_mask"].shape
+                causal_4d_mask = paddle.tril(
+                    paddle.ones(shape=(bsz, 1, self.config.total_max_length, self.config.total_max_length), dtype=self.config.dtype)
+                )
+                attention_mask_2d = paddle.ones(shape=(bsz, self.config.total_max_length), dtype='int64')
+                attention_mask_2d[:,0:src_len] = inputs["attention_mask"]
+                bool_mask = attention_mask_2d != 1
+                expanded_attn_mask = bool_mask[:, None, None, :].expand([bsz, 1, self.config.total_max_length, self.config.total_max_length])
+                self.attention_mask = causal_4d_mask.masked_fill(expanded_attn_mask, 0)
+            else:
+                for i in range(inputs["input_ids"].shape[0]):
+                    length = inputs["seq_len_encoder"][i][0]
                     if self.attention_mask is not None:
-                        self.attention_mask[i, 0, :length, : length + pre_caches_length] = paddle.concat(
-                            [prefix_attention_mask, post_attention_mask], axis=2
+                        self.attention_mask[i, 0, :length, :length] = paddle.tril(
+                            paddle.ones(shape=(1, 1, length, length), dtype=self.config.dtype)
                         )
+                    if pre_caches_length > 0:
+                        if self.config.prefix_path is None:
+                            prefix_attention_mask = paddle.zeros(
+                                [1, length, pre_caches_length], dtype=self.attention_mask.dtype
+                            )
+                        else:
+                            prefix_attention_mask = paddle.ones(
+                                [1, length, pre_caches_length], dtype=self.attention_mask.dtype
+                            )
+                        post_attention_mask = paddle.tril(
+                            paddle.ones(shape=(length, length), dtype=self.attention_mask.dtype)
+                        ).unsqueeze_(axis=0)
+                        if self.attention_mask is not None:
+                            self.attention_mask[i, 0, :length, : length + pre_caches_length] = paddle.concat(
+                                [prefix_attention_mask, post_attention_mask], axis=2
+                            )
 
        inputs["pre_ids"] = self.pre_ids
        inputs["attention_mask"] = self.attention_mask
@@ -1295,6 +1305,7 @@ def create_predictor(
 ):
     tokenizer = AutoTokenizer.from_pretrained(
         predictor_args.model_name_or_path,
+        padding_side="left"
     )
     # init chat_template for tokenizer
     llm_utils.init_chat_template(tokenizer, predictor_args.model_name_or_path, predictor_args.chat_template)
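
Left padding matters here because a decoder-only model continues generation from the last position of each row; with right padding that position would be a pad token for the shorter prompts in a batch. A small illustrative check (the checkpoint name is only a placeholder, not one mandated by this PR):

```python
from paddlenlp.transformers import AutoTokenizer

# Placeholder checkpoint; any decoder-only chat model behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat", padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(["hello", "a noticeably longer prompt"], padding=True, return_tensors="pd")
# With padding_side="left", pads sit at the front of each row, so the final
# position of every sequence is a real token for the decoder to continue from.
print(batch["input_ids"])
```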
@@ -1304,7 +1315,6 @@
         tokenizer.pad_token = tokenizer.eos_token
 
     config = AutoConfig.from_pretrained(predictor_args.model_name_or_path)
-
     max_position_embeddings = llm_utils.get_model_max_position_embeddings(config)
     if max_position_embeddings is None:
         max_position_embeddings = predictor_args.src_length + predictor_args.max_length
@@ -1451,13 +1461,13 @@ def predict():
 def benchmark(predictor, predictor_args, model_args):
     # Just construct a simple benchmark input. We pad input to the src_length.
     test_texts = "hello world, how are you?"
-    benchmark_texts = [test_texts + "<pad>" * predictor_args.src_length for _ in range(predictor_args.batch_size)]
+    benchmark_texts = [test_texts for _ in range(predictor_args.batch_size)]
 
     batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
     print("***********Start Benchmark**********")
 
-    warmup_time = 5
-    test_time = 20
+    warmup_time = 1
+    test_time = 5
 
     print("***********Start Warmup**********")
     for _ in range(warmup_time):
@@ -1469,8 +1479,14 @@ def benchmark(predictor, predictor_args, model_args):
     output_tokens = 0
     for _ in range(test_time):
         for bs, batch_source_text in enumerate(batch_benchmark_texts):
-            outputs, batch_tokens = predictor.predict(batch_source_text, return_tokens=True)
-            output_tokens += sum([len(tokens) for tokens in batch_tokens])
+            if paddle.distributed.get_rank() == 0:
+                outputs, batch_tokens = predictor.predict(batch_source_text, return_tokens=True)
+                output_tokens += sum([len(tokens) for tokens in batch_tokens])
+            else:
+                outputs = predictor.predict(batch_source_text)
+
+    if outputs == None:
+        return
     end = time.perf_counter()
     print("Avg Elapse time is: ", (end - start) / test_time)
     print("Output tokens is: ", output_tokens)