In [1]:
import os
import re

In [2]:
def process_chinese_text(
    folder_path,       # folder that contains the text documents
    keyword="根",       # keyword to search for
    context_chars=20,  # number of context characters kept on each side of a match
    out_folder="context",  # folder that receives the per-document context files
    max_files=20       # number of documents to process (cn-1 to cn-<max_files>)
):
    """
    Report character and keyword counts for cn-1.txt … cn-<max_files>.txt and
    save the surrounding context of every keyword occurrence.

    For each ``cn-<i>.txt`` found in ``folder_path`` the function:

    * counts the characters of the decoded text (``len(text)``, so whitespace
      and newlines are included);
    * finds every non-overlapping literal occurrence of ``keyword``;
    * writes one line per occurrence to ``<out_folder>/context-cn-<i>.txt``,
      holding up to ``context_chars`` characters before and after the match
      with newlines replaced by spaces (or a placeholder line when the
      keyword does not occur).

    Missing input files are reported on stdout and skipped. A summary table is
    printed once all files have been handled.

    Args:
        folder_path: Directory containing the ``cn-<i>.txt`` files.
        keyword: Literal text to search for (regex metacharacters are escaped).
        context_chars: Size of the context window on each side of a match.
        out_folder: Output directory; created if it does not exist.
        max_files: Highest file index to try, starting from 1.

    Returns:
        None. Results are written to ``out_folder`` and printed.

    Raises:
        ValueError: If ``keyword`` is empty or ``context_chars`` is negative.
    """
    # Validate before touching the file system. An empty pattern matches at
    # every position, which would report len(text) + 1 "occurrences" per file
    # and write a correspondingly huge context file.
    if not keyword:
        raise ValueError("keyword must be a non-empty string")
    # A negative window would silently shrink or empty the snippets.
    if context_chars < 0:
        raise ValueError("context_chars must be non-negative")

    # Create the output folder if it does not exist yet.
    os.makedirs(out_folder, exist_ok=True)

    # The pattern does not depend on the file, so build it once.
    pattern = re.compile(re.escape(keyword))

    # Per-file results for the summary table.
    stats = []

    for i in range(1, max_files + 1):
        filename = f"cn-{i}.txt"
        file_path = os.path.join(folder_path, filename)

        if not os.path.isfile(file_path):
            print(f"文件 {filename} 不存在，跳过。")
            continue

        # Files are read as UTF-8; bytes that cannot be decoded are dropped
        # (errors="ignore"), so they do not contribute to the counts below.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        # Every decoded character counts as one.
        total_chars = len(text)

        # Single scan: collect the context window of each match. Slicing
        # clamps the upper bound to len(text); the lower bound is clamped
        # explicitly because a negative index would wrap around.
        context_snippets = [
            text[max(0, match.start() - context_chars):match.end() + context_chars]
            for match in pattern.finditer(text)
        ]
        keyword_count = len(context_snippets)

        # Write the contexts, one per line.
        output_file = os.path.join(out_folder, f"context-cn-{i}.txt")
        with open(output_file, "w", encoding="utf-8") as out_f:
            if not context_snippets:
                out_f.write("未找到关键词的上下文。\n")
            else:
                for snippet in context_snippets:
                    # Flatten embedded newlines so each snippet stays on one line.
                    out_f.write(snippet.replace("\n", " ") + "\n")

        stats.append({
            "file": filename,
            "total_chars": total_chars,
            "keyword_count": keyword_count,
            "output_file": output_file
        })

    # Print the summary table.
    print(f"{'文件名':<15}{'字数':<10}{'关键词次数':<10}{'上下文文件':<20}")
    print("-" * 60)
    for stat in stats:
        print(f"{stat['file']:<15}{stat['total_chars']:<10}{stat['keyword_count']:<10}{stat['output_file']:<20}")


In [4]:
# Folder holding the cn-*.txt dumps, given relative to the current working directory.
source_folder = "./dumps-text"

# Count "根" in cn-1 … cn-20 and write a 20-character context window on each
# side of every hit into ./context. All options are spelled out so a run can
# be re-tuned by editing this cell only.
process_chinese_text(
    source_folder,
    keyword="根",
    context_chars=20,
    out_folder="context",
    max_files=20,
)

文件名            字数        关键词次数     上下文文件               
------------------------------------------------------------
cn-1.txt       12793     202       context/context-cn-1.txt
cn-2.txt       7225      7         context/context-cn-2.txt
cn-3.txt       68533     60        context/context-cn-3.txt
cn-4.txt       1382      27        context/context-cn-4.txt
cn-5.txt       2079      49        context/context-cn-5.txt
cn-6.txt       9911      21        context/context-cn-6.txt
cn-7.txt       1429      48        context/context-cn-7.txt
cn-8.txt       1771      65        context/context-cn-8.txt
cn-9.txt       917       4         context/context-cn-9.txt
cn-10.txt      2734      12        context/context-cn-10.txt
cn-11.txt      2205      34        context/context-cn-11.txt
cn-12.txt      3484      61        context/context-cn-12.txt
cn-13.txt      4595      25        context/context-cn-13.txt
cn-14.txt      2599      22        context/context-cn-14.txt
cn-15.txt      20234     76        con