In [None]:
import os


# Extract the bz2 Wikipedia dump with WikiExtractor and save it as plain text.

wiki_input = "zhwiki-20250201-pages-articles-multistream.xml.bz2"
output_dir = "wiki_extracted"
# NOTE(review): output_dir is created here but is not passed to the extractor
# command below — confirm whether this directory is still needed.
os.makedirs(output_dir, exist_ok=True)


# IPython shell escape; {wiki_input} is interpolated from the Python variable above.
!python WikiExtractor.py --infn {wiki_input} --compress
# Per the original author's note, this yields a wiki.txt file in the current directory.
# NOTE(review): --compress is passed, so the output may be compressed rather than
# plain wiki.txt — TODO confirm against the WikiExtractor variant in use.

In [None]:
import json
from opencc import OpenCC
from tqdm import tqdm
import re

def traditional_2_easy(inputname, outputname):
    """
    Convert Traditional Chinese text in a file to Simplified Chinese.

    The input is processed one line at a time so that a multi-gigabyte corpus
    does not have to be held in memory.

    Args:
        inputname: Path of the UTF-8 text file to read.
        outputname: Path of the UTF-8 text file to write (overwritten).

    Raises:
        FileNotFoundError: If ``inputname`` does not exist.
    """
    import os  # function-scope import keeps this definition self-contained

    cc = OpenCC('t2s')  # OpenCC profile: Traditional -> Simplified

    with open(inputname, "r", encoding="utf-8") as f_in:
        # Opening the output truncates it. If both names refer to the same
        # file, streaming would read back nothing, so buffer the lines first
        # in that (rare) in-place case.
        in_place = os.path.exists(outputname) and os.path.samefile(inputname, outputname)
        lines = f_in.readlines() if in_place else f_in

        with open(outputname, "w", encoding="utf-8") as f_out:
            for line in tqdm(lines, desc="繁体转换简体中"):
                f_out.write(cc.convert(line))

    print(f"转换完成，结果保存至：{outputname}")

def _clean_line(line):
    """Normalise quotes and brackets in one line of text and return the result."""
    # Unify corner-bracket quotation marks into a plain double quote.
    line = re.sub(r"[「」｢｣『』]", '"', line)

    # Convert ASCII parentheses to full-width first, so that the rules below
    # apply to every bracket regardless of its original width.
    line = line.replace("(", "（").replace(")", "）")

    # Drop a stray comma/semicolon directly before a closing bracket ...
    line = re.sub(r"[，；]）", "）", line)
    # ... and a stray comma directly after an opening bracket.
    line = line.replace("（，", "（")

    # Remove empty brackets last: the steps above can create them
    # (e.g. "（，）" or "()" both end up as "（）").
    line = line.replace("（）", "")
    return line


def handle(inputname, outputname):
    """
    Clean up text formatting (brackets, quotes, stray punctuation) in a file.

    Each line is rewritten as follows: corner-bracket quotes become ``"``,
    ASCII parentheses become full-width ones, stray ``，``/``；`` next to a
    bracket are dropped, and empty brackets are removed. The input is
    processed one line at a time to keep memory use flat on large corpora.

    Args:
        inputname: Path of the UTF-8 text file to read.
        outputname: Path of the UTF-8 text file to write (overwritten).

    Raises:
        FileNotFoundError: If ``inputname`` does not exist.
    """
    import os  # function-scope import keeps this definition self-contained

    with open(inputname, "r", encoding="utf-8") as f_in:
        # Opening the output truncates it. If both names refer to the same
        # file, streaming would read back nothing, so buffer the lines first
        # in that (rare) in-place case.
        in_place = os.path.exists(outputname) and os.path.samefile(inputname, outputname)
        lines = f_in.readlines() if in_place else f_in

        with open(outputname, "w", encoding="utf-8") as f_out:
            for line in tqdm(lines, desc="文本格式处理中"):
                f_out.write(_clean_line(line))

    print(f"文本格式处理完成，结果保存至：{outputname}")

# File names for each pipeline stage: raw extractor output, the
# Simplified-Chinese version, and the punctuation-normalised result.
input_file, simplified_file, cleaned_file = (
    "wiki.txt",
    "wiki_simplified.txt",
    "wiki_cleaned.txt",
)

# Stage 1: Traditional -> Simplified Chinese.
traditional_2_easy(inputname=input_file, outputname=simplified_file)

# Stage 2: normalise brackets and quotes in the simplified text.
handle(inputname=simplified_file, outputname=cleaned_file)

In [3]:
import json

# Convert the cleaned text into the JSONL format required by the task:
# one JSON object per article, written as {"meta": {"title": ...}, "text": ...}.
input_txt = "wiki_cleaned.txt"
output_jsonl = "wiki.jsonl"


def _iter_articles(lines):
    """Yield ``(title, text)`` pairs parsed from blank-line-separated blocks.

    Within a block, the first line ending in ``:`` (seen while no title is
    set) becomes the title, with the colon removed; every other non-empty
    line is body text. Blocks lacking either a title or body text are skipped.
    """
    title = None
    body = []

    for raw_line in lines:
        line = raw_line.strip()

        if not line:
            # A blank line closes the current block.
            if title and body:
                yield title, "\n".join(body)
            title, body = None, []
        elif title is None and line.endswith(":"):
            title = line[:-1].strip()
        else:
            body.append(line)

    # The input may not end with a blank line; flush the last block too.
    if title and body:
        yield title, "\n".join(body)


with open(input_txt, "r", encoding="utf-8") as f_in, open(output_jsonl, "w", encoding="utf-8") as f_out:
    for title, text in _iter_articles(f_in):
        # Single write site, so every record (including the last one)
        # uses the same key order.
        json.dump({"meta": {"title": title}, "text": text}, f_out, ensure_ascii=False)
        f_out.write("\n")

print(f"清理后的数据保存至保存到 {output_jsonl}")

清理后的数据保存至保存到 wiki.jsonl
