-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Closed
Labels
question — Further information is requested
Description
请提出你的问题
Run the benchmark code
from paddlenlp.transformers import GPTTokenizer
import tiktoken as tik
import time
import statistics
def benchmark_tokenizers(text, num_iterations=1000):
    """Compare encode/decode latency of PaddleNLP's GPTTokenizer and tiktoken.

    Args:
        text: input string to encode/decode repeatedly.
        num_iterations: number of timed repetitions per operation.

    Returns:
        dict mapping "<library>_<op>_avg" to the mean time in milliseconds.
    """
    # Build both tokenizers once, outside the timed loops.
    paddle_tokenizer = GPTTokenizer.from_pretrained(
        "gpt2-en",
        bos_token='<|endoftext|>'
    )
    tik_tokenizer = tik.get_encoding("gpt2")

    # Bind the clock to a local for cheap repeated lookups in the loops.
    timer = time.perf_counter

    # --- Encoding benchmark ---
    paddle_encode_times = []
    tik_encode_times = []
    for _ in range(num_iterations):
        # PaddleNLP tokenizer
        t0 = timer()
        paddle_tokens = paddle_tokenizer.encode(text)
        paddle_encode_times.append(timer() - t0)

        # Tiktoken
        t0 = timer()
        tik_tokens = tik_tokenizer.encode(text)
        tik_encode_times.append(timer() - t0)

    # --- Decoding benchmark (reuses tokens from the last encode pass) ---
    paddle_decode_times = []
    tik_decode_times = []
    for _ in range(num_iterations):
        # PaddleNLP tokenizer
        t0 = timer()
        paddle_tokenizer.decode(paddle_tokens.input_ids)
        paddle_decode_times.append(timer() - t0)

        # Tiktoken
        t0 = timer()
        tik_tokenizer.decode(tik_tokens)
        tik_decode_times.append(timer() - t0)

    def to_ms(samples):
        # Convert a list of per-call seconds into a mean in milliseconds.
        return statistics.mean(samples) * 1000

    return {
        "paddle_encode_avg": to_ms(paddle_encode_times),
        "tik_encode_avg": to_ms(tik_encode_times),
        "paddle_decode_avg": to_ms(paddle_decode_times),
        "tik_decode_avg": to_ms(tik_decode_times),
    }
# Benchmark input: long enough to yield stable timing measurements.
text = (
    "Hello, World! How are you doing today? What's the weather like in Beijing?\n"
    "This is a longer text sample to get more accurate benchmarking results.\n"
    "We need to process enough text to get meaningful performance measurements."
)

# Run the benchmark and report averages per tokenizer and operation.
results = benchmark_tokenizers(text)

print("\nPerformance Comparison (average time in milliseconds):")
print("=" * 50)
print("PaddleNLP Tokenizer:")
print(f" Encoding: {results['paddle_encode_avg']:.4f} ms")
print(f" Decoding: {results['paddle_decode_avg']:.4f} ms")
print("\nTiktoken:")
print(f" Encoding: {results['tik_encode_avg']:.4f} ms")
print(f" Decoding: {results['tik_decode_avg']:.4f} ms")

You will see that the performance of GPTTokenizer is much worse than Tiktoken's.
Metadata
Metadata
Assignees
Labels
question — Further information is requested
