In [4]:
import os
import json
from concurrent.futures import ThreadPoolExecutor
from charset_normalizer import from_path
from tqdm import tqdm

# Source directory holding the raw .jsonl files, and the destination directory
# that receives the de-duplicated copies (same file names).
input_dir, output_dir = "output", "unique"

# Create the destination up front; an already existing directory is fine.
os.makedirs(output_dir, exist_ok=True)

# Encoding detection helper; falls back to a default encoding on failure.
def detect_encoding(filepath, default='utf-8'):
    """Return the detected text encoding of *filepath*.

    Runs ``from_path`` on the file and takes the ``best()`` candidate of the
    result. When there is no candidate, a warning is emitted through
    ``tqdm.write`` and *default* is returned instead.

    Args:
        filepath: Path of the file to inspect.
        default: Encoding name returned when detection yields no candidate.

    Returns:
        The ``encoding`` attribute of the best candidate, or *default*.
    """
    candidate = from_path(filepath).best()
    if candidate is not None:
        return candidate.encoding

    # No usable guess: tell the user and fall back.
    tqdm.write(f"[警告] 无法识别 {filepath} 编码，使用默认编码 {default}")
    return default

# Process a single file.
def process_file(filename):
    """De-duplicate one JSONL file by the value of its ``content`` field.

    Reads ``input_dir/filename`` with the encoding reported by
    ``detect_encoding`` (undecodable bytes are ignored), parses every line as
    JSON and keeps the first record seen for each distinct, whitespace-stripped
    ``content`` string. The surviving records are re-serialized and written to
    ``output_dir/filename`` as UTF-8, one JSON object per line.

    Skipped lines: anything that is not valid JSON, is not a JSON object, or
    whose ``content`` is missing, empty after stripping, or not a string.

    Args:
        filename: Name of the file, relative to ``input_dir``.

    Returns:
        None. If the input cannot be read, an error is reported through
        ``tqdm.write`` and no output file is written.
    """
    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)

    try:
        encoding = detect_encoding(input_path)
        with open(input_path, 'r', encoding=encoding, errors='ignore') as infile:
            lines = infile.readlines()
    except Exception as e:
        # Best-effort batch job: report the failure and move on to other files.
        tqdm.write(f"[错误] 读取 {filename} 失败：{e}")
        return

    # A byte-order mark survives decoding with plain UTF-8 and json.loads
    # rejects a string that starts with U+FEFF, so without this the first
    # record of a BOM-prefixed file would be silently dropped.
    if lines and lines[0].startswith('\ufeff'):
        lines[0] = lines[0][1:]

    seen_contents = set()
    unique_lines = []

    for line in lines:
        # Keep the try body minimal: only parsing is expected to fail here.
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue

        # Valid JSON is not necessarily an object (could be a list, number,
        # string, ...); such lines have no 'content' field to key on.
        if not isinstance(data, dict):
            continue

        content = data.get('content', '')
        # 'content' may be null or a non-string value; calling .strip() on it
        # would raise and abort the whole worker, so skip the record instead.
        if not isinstance(content, str):
            continue

        content = content.strip()
        if content and content not in seen_contents:
            seen_contents.add(content)
            unique_lines.append(json.dumps(data, ensure_ascii=False))

    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.writelines(record + '\n' for record in unique_lines)

    tqdm.write(f"✅ 已完成：{filename}，保留 {len(unique_lines)} 条")

# Collect every .jsonl file located directly inside the input directory.
files = [entry for entry in os.listdir(input_dir) if entry.endswith('.jsonl')]

# Run the per-file work on a small thread pool. Draining the map iterator
# through tqdm is what advances the overall progress bar.
with ThreadPoolExecutor(max_workers=4) as executor:
    progress = tqdm(
        executor.map(process_file, files),
        total=len(files),
        desc="整体进度",
    )
    list(progress)


整体进度:   0%|          | 0/5 [00:00<?, ?it/s]

整体进度:   0%|          | 0/5 [00:04<?, ?it/s]    

[警告] 无法识别 output/ja_news.jsonl 编码，使用默认编码 utf-8


整体进度:   0%|          | 0/5 [00:12<?, ?it/s]    

✅ 已完成：ja_news.jsonl，保留 10757 条


整体进度:   0%|          | 0/5 [00:15<?, ?it/s]    

[警告] 无法识别 output/zh_news.jsonl 编码，使用默认编码 utf-8


整体进度:  20%|██        | 1/5 [00:16<01:05, 16.33s/it]

✅ 已完成：es_news.jsonl，保留 15399 条


整体进度: 100%|██████████| 5/5 [00:16<00:00,  1.98s/it]    

✅ 已完成：en_news.jsonl，保留 19436 条
✅ 已完成：fr_news.jsonl，保留 20942 条
✅ 已完成：zh_news.jsonl，保留 23110 条


整体进度: 100%|██████████| 5/5 [00:16<00:00,  3.35s/it]
