# In [None]:
import bz2
import re
import json
import opencc
import codecs
from gensim.corpora.wikicorpus import extract_pages, filter_wiki
from tqdm import tqdm

# Maximum number of articles to extract (adjust as needed)
MAX_ARTICLES = 1000

# OpenCC converter configured with 't2s' (Traditional -> Simplified Chinese)
cc = opencc.OpenCC('t2s')

def wiki_replace(d):
    """Clean one Wikipedia page and convert it with the module-level ``cc``.

    Args:
        d: A 3-tuple ``(title, text, page_id)``; the third element is ignored.

    Returns:
        A dict ``{"text": ..., "meta": {"title": ...}}`` where the text has
        been run through the regex passes below and ``filter_wiki``, then
        both text and title are passed through ``cc.convert``. The text is
        additionally stripped of leading/trailing whitespace.
    """
    page_title, body, _page_id = d

    # Substitutions applied to the raw markup before filter_wiki runs.
    markup_rules = (
        # Drop `{| ... |}` spans (optionally preceded by colons).
        (r':*{\|[\s\S]*?\|}', ''),
        # Drop <gallery> ... </gallery> sections.
        (r'<gallery>[\s\S]*?</gallery>', ''),
        # Rewrite single-line `{{a|b}}` as `[[a|b]]`, keeping the character
        # in front of it (presumably so filter_wiki treats it as a link).
        (r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]'),
    )
    # Substitutions applied to the output of filter_wiki.
    layout_rules = (
        # Remove empty bullet lines and runs of two or more apostrophes.
        (r'\* *\n|\'{2,}', ''),
        # Collapse consecutive newlines into one.
        (r'\n+', '\n'),
        # Strip a leading ':' / ';' or leading spaces after a newline.
        (r'\n[:;]|\n +', '\n'),
        # Put a blank line in front of every line starting with '=='.
        (r'\n==', r'\n\n=='),
    )

    for pattern, replacement in markup_rules:
        body = re.sub(pattern, replacement, body)
    body = filter_wiki(body)
    for pattern, replacement in layout_rules:
        body = re.sub(pattern, replacement, body)

    converted_text = cc.convert(body).strip()
    converted_title = cc.convert(page_title)
    return {"text": converted_text, "meta": {"title": converted_title}}


# Destination JSONL file: one JSON object per line, one line per article.
# NOTE(review): "genism" looks like a typo for "gensim"; kept unchanged so the
# output path stays the same for anything that already reads it.
output_file = "wiki_genism.jsonl"

# Number of articles written so far; still available after the loop.
i = 0

# Open the input dump first so a missing dump does not truncate an existing
# output file, and manage it with `with` so the bz2 handle is always closed.
# newline='\n' keeps line endings as '\n' on every platform, matching what the
# previous codecs.open() call produced.
with bz2.open('zhwiki-20250201-pages-articles-multistream.xml.bz2', 'rt', encoding='utf-8') as dump, \
        open(output_file, 'w', encoding='utf-8', newline='\n') as f, \
        tqdm(total=MAX_ARTICLES, desc="已获取 0 篇文章") as progress:
    for d in extract_pages(dump):
        if i >= MAX_ARTICLES:
            break

        title, text = d[0], d[1]

        # Check the title for emptiness before running the regex on it, so a
        # missing title is skipped instead of raising inside `re`.
        if not title:
            continue
        # Skip titles with an ASCII-letter prefix followed by ':'
        # (presumably namespace pages such as "Category:...").
        if re.match(r'[a-zA-Z]+:', title):
            continue
        # Skip pages whose text starts with '#' (presumably redirects).
        if text.startswith('#'):
            continue

        json.dump(wiki_replace(d), f, ensure_ascii=False)
        f.write('\n')
        i += 1

        # Advance the bar once per *written* article so it matches `total`,
        # and keep the description in sync with the running count.
        progress.set_description(f"已获取 {i} 篇文章", refresh=False)
        progress.update(1)

print(f"共提取 {i} 篇文章，存储在 {output_file}")