In [1]:
import os
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# 下载 nltk 需要的资源
nltk.download('punkt')
nltk.download('stopwords')

# 加载停用词表
stop_words = set(stopwords.words('english'))  # 停用词集合

def list_files(directory):
    """获取所有文本文件路径"""
    return [os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

def preprocess_text(text):
    """文本预处理：分词、去除标点、小写化、去除数字、停用词过滤、词干提取"""
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text)  # 分词
    processed_tokens = []
    for token in tokens:
        token = re.sub(r'[^\w\s]', '', token).lower() # 归一化：去除标点 & 小写化
        if token in stop_words: # 跳过停用词
            continue
        # 词干提取
        if token.isalpha():  # 过滤掉纯数字
            stemmed_token = stemmer.stem(token)
            processed_tokens.append(stemmed_token)
    return processed_tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\62493\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\62493\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import json
import time

class BSBIIndexer:
    def __init__(self, block_size=100000, output_dir="bsbi_blocks"):
        self.block_size = block_size
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def index_documents(self, directory):
        """构建 BSBI 索引"""
        doc_files = list_files(directory)
        block_id = 0
        term_doc_pairs = []

        for doc_id, file_path in enumerate(doc_files):
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                tokens = preprocess_text(text)
                term_doc_pairs.extend([(token, doc_id) for token in tokens])

            if len(term_doc_pairs) >= self.block_size:
                self.write_block(term_doc_pairs, block_id)
                term_doc_pairs = []
                block_id += 1

        if term_doc_pairs:
            self.write_block(term_doc_pairs, block_id)
            block_id += 1

        return block_id

    def write_block(self, term_doc_pairs, block_id):
        """对 block 排序并写入磁盘"""
        start_time = time.time();
        term_doc_pairs.sort()
        block_file = os.path.join(self.output_dir, f"block_{block_id}.json")
        with open(block_file, 'w', encoding='utf-8') as f:
            json.dump(term_doc_pairs, f)
        end_time = time.time();
        print(f"Block {block_id} written with {len(term_doc_pairs)} pairs. Sorting took {end_time - start_time:.2f} seconds.")

In [3]:
import heapq
import psutil  # 用于监测内存占用

class BSBI_Merger:
    def __init__(self, output_dir="bsbi_blocks", final_index="final_index.json"):
        self.output_dir = output_dir
        self.final_index = final_index

    def merge_blocks(self, block_count):
        """归并多个已排序 block"""
        min_heap = []
        file_iters = [open(os.path.join(self.output_dir, f"block_{i}.json"), "r") for i in range(block_count)]
        block_data = [json.load(f) for f in file_iters]

        # 记录归并前内存
        memory_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

        # 初始化 heap
        for i, block in enumerate(block_data):
            if block:
                heapq.heappush(min_heap, (block[0][0], block[0][1], i, 0))  # (term, doc_id, block_index, position)

        merged_index = {}

        while min_heap:
            term, doc_id, block_idx, pos = heapq.heappop(min_heap)

            if term not in merged_index:
                merged_index[term] = set()
            merged_index[term].add(doc_id)

            # 继续从相应的 block 取数据
            if pos + 1 < len(block_data[block_idx]):
                next_term, next_doc_id = block_data[block_idx][pos + 1]
                heapq.heappush(min_heap, (next_term, next_doc_id, block_idx, pos + 1))

        # 记录归并后内存
        memory_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
        
        # 关闭文件
        for f in file_iters:
            f.close()

        # 写入最终索引
        with open(self.final_index, "w", encoding="utf-8") as f:
            json.dump({term: sorted(list(doc_ids)) for term, doc_ids in merged_index.items()}, f)


        print(f"Pre-merge memory: {memory_before:.2f} MB | Post-merge memory: {memory_after:.2f} MB | Increase: {memory_after - memory_before:.2f} MB\n")
        print(f"Final index written to {self.final_index}\n")

In [4]:
import time

if __name__ == "__main__":
    input_directory = "dataset/HillaryEmails"  # 你的数据集目录
    block_size = 100000  # 约为1.2MB,每个 block 处理的最大 term-document 对数

    # 1. 运行 BSBI 索引构建
    indexer = BSBIIndexer(block_size=block_size)
    start_time = time.time()
    block_count = indexer.index_documents(input_directory)
    end_time = time.time()
    print(f"\nIndexing completed in {end_time - start_time:.2f} seconds.\n")

    # 2. 归并所有 block
    merger = BSBI_Merger()
    start_time = time.time()
    merger.merge_blocks(block_count)
    end_time = time.time()
    print(f"Merge completed in {end_time - start_time:.2f} seconds.")

Block 0 written with 101237 pairs. Sorting took 0.18 seconds.
Block 1 written with 100201 pairs. Sorting took 0.20 seconds.
Block 2 written with 100026 pairs. Sorting took 0.30 seconds.
Block 3 written with 100445 pairs. Sorting took 0.18 seconds.
Block 4 written with 100048 pairs. Sorting took 0.18 seconds.
Block 5 written with 100093 pairs. Sorting took 0.20 seconds.
Block 6 written with 101953 pairs. Sorting took 0.20 seconds.
Block 7 written with 100076 pairs. Sorting took 0.28 seconds.
Block 8 written with 100046 pairs. Sorting took 0.31 seconds.
Block 9 written with 101223 pairs. Sorting took 0.33 seconds.
Block 10 written with 100052 pairs. Sorting took 0.22 seconds.
Block 11 written with 102402 pairs. Sorting took 0.21 seconds.
Block 12 written with 100134 pairs. Sorting took 0.28 seconds.
Block 13 written with 100414 pairs. Sorting took 0.21 seconds.
Block 14 written with 100046 pairs. Sorting took 0.20 seconds.
Block 15 written with 100091 pairs. Sorting took 0.27 seconds.
Bl