In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import os
import re
import json
import time
import heapq
import psutil
import math
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# 下载 nltk 需要的资源
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# 加载停用词表
stop_words = set(stopwords.words('english'))  # 停用词集合

def list_files(directory):
    """获取所有文本文件路径"""
    return [os.path.join(directory, f) for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

def preprocess_text(text):
    """文本预处理：分词、去除标点、小写化、去除数字、停用词过滤、词干提取"""
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text)  # 分词
    processed_tokens = []
    for token in tokens:
        token = re.sub(r'[^\w\s]', '', token).lower() # 归一化：去除标点 & 小写化
        if token in stop_words: # 跳过停用词
            continue
        # 词干提取
        if token.isalpha():  # 过滤掉纯数字
            stemmed_token = stemmer.stem(token)
            processed_tokens.append(stemmed_token)
    return processed_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
class BSBIIndexer:
    def __init__(self, block_size=100000, output_dir="bsbi_blocks"):
        self.block_size = block_size
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def index_documents(self, directory):
        doc_files = list_files(directory)
        block_id = 0
        term_doc_pairs = []
        self.doc_lengths = {}
        self.total_docs = len(doc_files)

        for doc_id, file_path in enumerate(doc_files):
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                tokens = preprocess_text(text)
                self.doc_lengths[doc_id] = len(tokens)
                term_doc_pairs.extend([(token, doc_id) for token in tokens])

            if len(term_doc_pairs) >= self.block_size:
                self.write_block(term_doc_pairs, block_id)
                term_doc_pairs = []
                block_id += 1

        if term_doc_pairs:
            self.write_block(term_doc_pairs, block_id)
            block_id += 1

        return block_id

    def write_block(self, term_doc_pairs, block_id):
        term_doc_pairs.sort()
        block_file = os.path.join(self.output_dir, f"block_{block_id}.json")
        with open(block_file, 'w', encoding='utf-8') as f:
            json.dump(term_doc_pairs, f)

In [4]:
class BSBI_Merger:
    def __init__(self, output_dir="bsbi_blocks", final_index="final_index.json"):
        self.output_dir = output_dir
        self.final_index = final_index

    def merge_blocks(self, block_count):
        min_heap = []
        file_iters = [open(os.path.join(self.output_dir, f"block_{i}.json"), "r") for i in range(block_count)]
        block_data = [json.load(f) for f in file_iters]

        for i, block in enumerate(block_data):
            if block:
                heapq.heappush(min_heap, (block[0][0], block[0][1], i, 0))

        merged_index = {}

        while min_heap:
            term, doc_id, block_idx, pos = heapq.heappop(min_heap)
            if term not in merged_index:
                merged_index[term] = set()
            merged_index[term].add(doc_id)

            if pos + 1 < len(block_data[block_idx]):
                next_term, next_doc_id = block_data[block_idx][pos + 1]
                heapq.heappush(min_heap, (next_term, next_doc_id, block_idx, pos + 1))

        for f in file_iters:
            f.close()

        with open(self.final_index, "w", encoding="utf-8") as f:
            json.dump({term: sorted(list(doc_ids)) for term, doc_ids in merged_index.items()}, f)

# Ranking
Ranking Algorithm: TF-IDF

We use a standard TF-IDF scoring approach, where each term in a query is scored based on its frequency in the document (TF) and how rare it is across the collection (IDF). Final document scores are computed by summing the TF-IDF scores of each query term.

In [5]:
class RankedQueryProcessor:
    def __init__(self, index_file, doc_lengths, total_docs):
        with open(index_file, 'r', encoding='utf-8') as f:
            self.inverted_index = json.load(f)
        self.doc_lengths = doc_lengths
        self.total_docs = total_docs

    def compute_idf(self, term):
        df = len(self.inverted_index.get(term, []))
        return math.log((self.total_docs + 1) / (df + 1)) + 1

    def process_query(self, query):
        tokens = preprocess_text(query)
        scores = {}
        for term in tokens:
            postings = self.inverted_index.get(term, [])
            idf = self.compute_idf(term)
            for doc_id in postings:
                tf = 1 / self.doc_lengths.get(str(doc_id), 1)
                scores[doc_id] = scores.get(doc_id, 0) + tf * idf
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [8]:
if __name__ == "__main__":
  input_dir = "/content/drive/MyDrive/HillaryEmails/HillaryEmails"
  indexer = BSBIIndexer()
  block_count = indexer.index_documents(input_dir)
  merger = BSBI_Merger()
  merger.merge_blocks(block_count)
  ranked_processor = RankedQueryProcessor("final_index.json", indexer.doc_lengths, indexer.total_docs)
  results = ranked_processor.process_query("iran nuclear")
  print(results)

[(35, 8.776154621534795), (37, 8.776154621534795), (80, 8.776154621534795), (93, 8.776154621534795), (102, 8.776154621534795), (164, 8.776154621534795), (397, 8.776154621534795), (442, 8.776154621534795), (465, 8.776154621534795), (468, 8.776154621534795), (476, 8.776154621534795), (487, 8.776154621534795), (502, 8.776154621534795), (524, 8.776154621534795), (540, 8.776154621534795), (611, 8.776154621534795), (660, 8.776154621534795), (758, 8.776154621534795), (798, 8.776154621534795), (880, 8.776154621534795), (1138, 8.776154621534795), (1367, 8.776154621534795), (1394, 8.776154621534795), (1397, 8.776154621534795), (1445, 8.776154621534795), (1530, 8.776154621534795), (1626, 8.776154621534795), (1711, 8.776154621534795), (1714, 8.776154621534795), (1737, 8.776154621534795), (1866, 8.776154621534795), (1939, 8.776154621534795), (1971, 8.776154621534795), (2161, 8.776154621534795), (2196, 8.776154621534795), (2267, 8.776154621534795), (2387, 8.776154621534795), (2401, 8.776154621534795

In [10]:
print("Ranked Retrieval:")
print(ranked_processor.process_query("clinton iraq"))

Ranked Retrieval:
[(80, 7.597040336062574), (90, 7.597040336062574), (102, 7.597040336062574), (267, 7.597040336062574), (301, 7.597040336062574), (310, 7.597040336062574), (374, 7.597040336062574), (406, 7.597040336062574), (447, 7.597040336062574), (487, 7.597040336062574), (502, 7.597040336062574), (540, 7.597040336062574), (610, 7.597040336062574), (611, 7.597040336062574), (660, 7.597040336062574), (858, 7.597040336062574), (860, 7.597040336062574), (866, 7.597040336062574), (1009, 7.597040336062574), (1146, 7.597040336062574), (1202, 7.597040336062574), (1280, 7.597040336062574), (1345, 7.597040336062574), (1354, 7.597040336062574), (1363, 7.597040336062574), (1626, 7.597040336062574), (1642, 7.597040336062574), (1696, 7.597040336062574), (1704, 7.597040336062574), (1827, 7.597040336062574), (2048, 7.597040336062574), (2089, 7.597040336062574), (2161, 7.597040336062574), (2196, 7.597040336062574), (2223, 7.597040336062574), (2309, 7.597040336062574), (2543, 7.597040336062574), (2