In [1]:
import re
import matplotlib.pyplot as plt
import nltk
from collections import Counter
import seaborn as sns
import os
import heapq
import numpy as np
import math
from numpy.linalg import norm

# Download Gutenberg corpus
nltk.download('gutenberg')
from nltk.corpus import gutenberg

# 文書読み込み
fileids = gutenberg.fileids()
docs = [gutenberg.raw(fid) for fid in fileids]

import numpy as np

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/takatakiyugo/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
def tokenize_en(text):
    """英語テキストを小文字化して単語リストに分割する簡易トークナイザ"""
    return re.findall(r"[a-zA-Z]+", text.lower())


### 文書のチャンキング
EDAは作品単位でしたが，検索以降は500ワードごとに分割した文書を使います．
これにより文書数が増え，RAG実習にも適した形になります．

In [3]:
def chunk_documents(docs, chunk_size=500):
    chunks = []
    for doc in docs:
        words = tokenize_en(doc)
        for i in range(0, len(words), chunk_size):
            chunk = " ".join(words[i:i+chunk_size])
            chunks.append(chunk)
    return chunks

chunked_docs = chunk_documents(docs, 500)
print("Original docs:", len(docs))
print("Chunked docs:", len(chunked_docs))

# チャンキング後の基礎統計
total_tokens = sum(len(tokenize_en(doc)) for doc in chunked_docs)
unique_tokens = len(set(w for doc in chunked_docs for w in tokenize_en(doc)))

print("Total tokens after chunking:", total_tokens)
print("Unique vocab after chunking:", unique_tokens)


Original docs: 18
Chunked docs: 4279
Total tokens after chunking: 2136080
Unique vocab after chunking: 41509


## 転置インデックス (array+bisect)

In [25]:
# --- 転置インデックス（TF付き + TF-IDF + BM25 + skip_merge対応） ---
import math
from collections import Counter

class InvertedIndexArray:
    def __init__(self):
        self.vocab = []        # ソート済み語彙リスト
        self.postings = {}     # term -> [(doc_id, tf)]
        self.doc_count = 0     # 総文書数
        self.avgdl = 0.0       # 平均文書長
        self.doc_lens = []     # 各文書の長さ（BM25用）

    # ======================
    # インデックス構築
    # ======================
    def build(self, docs):
        """TF付き転置インデックスを構築"""
        self.doc_count = len(docs)
        vocab_set = set()
        postings = {}
        self.doc_lens = []

        for doc_id, doc in enumerate(docs):
            tokens = tokenize_en(doc)
            tf_counts = Counter(tokens)
            self.doc_lens.append(len(tokens))
            for term, tf in tf_counts.items():
                vocab_set.add(term)
                postings.setdefault(term, []).append((doc_id, tf))

        self.avgdl = sum(self.doc_lens) / len(self.doc_lens) #ここはアップデートがあると管理がめんどくさい
        self.vocab = sorted(vocab_set)
        for t in postings:
            postings[t] = sorted(postings[t], key=lambda x: x[0])
        self.postings = postings

    # ======================
    # 基本関数
    # ======================
    def binary_search(self, arr, target):
        left, right = 0, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            if arr[mid] == target:
                return mid
            elif arr[mid] < target:
                left = mid + 1
            else:
                right = mid - 1
        return -1

    def search(self, term):
        """termを含むdoc_idリストを返す"""
        i = self.binary_search(self.vocab, term)
        if i == -1:
            return []
        return [doc_id for doc_id, _ in self.postings[self.vocab[i]]]

    def show_postings(self, term):
        """termのposting listを(doc_id, tf)形式で表示"""
        i = self.binary_search(self.vocab, term)
        if i == -1:
            print(f"{term}: (not found)")
            return
        print(f"{term}: {self.postings[self.vocab[i]]}")

    # ======================
    # AND検索
    # ======================
    def and_search_merge(self, t1, t2):
        """2ポインタ法"""
        p1 = [d for d, _ in self.postings.get(t1, [])]
        p2 = [d for d, _ in self.postings.get(t2, [])]
        i, j = 0, 0
        result = []
        while i < len(p1) and j < len(p2):
            if p1[i] == p2[j]:
                result.append(p1[i])
                i += 1
                j += 1
            elif p1[i] < p2[j]:
                i += 1
            else:
                j += 1
        return result

    def and_search_skip(self, t1, t2):
        """スキップマージ法"""
        p1 = [d for d, _ in self.postings.get(t1, [])]
        p2 = [d for d, _ in self.postings.get(t2, [])]
        n1, n2 = len(p1), len(p2)
        skip1 = int(math.sqrt(n1)) or 1
        skip2 = int(math.sqrt(n2)) or 1

        i, j = 0, 0
        result = []
        while i < n1 and j < n2:
            if p1[i] == p2[j]:
                result.append(p1[i])
                i += 1
                j += 1
            elif p1[i] < p2[j]:
                if (i + skip1 < n1) and (p1[i + skip1] <= p2[j]):
                    i += skip1
                else:
                    i += 1
            else:
                if (j + skip2 < n2) and (p2[j + skip2] <= p1[i]):
                    j += skip2
                else:
                    j += 1
        return result

    # ======================
    # TF-IDF
    # ======================
    def tfidf(self, term, doc_id):
        """検索時に動的にTF-IDFを計算"""
        plist = self.postings.get(term, [])
        df = len(plist)
        if df == 0:
            return 0.0
        idf = math.log((self.doc_count / df), 10)
        for d_id, tf in plist:
            if d_id == doc_id:
                tf_weight = 1 + math.log(tf, 10)
                return tf_weight * idf
        return 0.0

    def query_tfidf_scores(self, query_terms):
        """クエリ内のtermごとにTF-IDFスコアを加算"""
        scores = {doc_id: 0.0 for doc_id in range(self.doc_count)}
        for term in query_terms:
            plist = self.postings.get(term, [])
            df = len(plist)
            if df == 0:
                continue
            idf = math.log((self.doc_count / df), 10)
            for doc_id, tf in plist:
                tf_weight = 1 + math.log(tf, 10)
                scores[doc_id] += tf_weight * idf
        return scores

    # ======================
    # BM25
    # ======================
    def bm25(self, query_terms, k1=1.5, b=0.75):
        """BM25スコアを計算"""
        scores = {doc_id: 0.0 for doc_id in range(self.doc_count)}
        for term in query_terms:
            plist = self.postings.get(term, [])
            df = len(plist)
            if df == 0:
                continue
            idf = math.log((self.doc_count - df + 0.5) / (df + 0.5) + 1)
            for doc_id, tf in plist:
                score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * self.doc_lens[doc_id] / self.avgdl)) # ここを授業内で追加。
                scores[doc_id] += score
        return scores

    def tfidf_topk(self, query_terms, k=10): # 実はバグあり（下の関数にはバグなし）複数単語で検索する場合、ヒープから出てしまった文書の方がスコアが高い場合が出てくる。
        """
        TF-IDF上位k件をストリーム処理で探索
        postingsを逐次読み出してスコア上位Kのみ保持
        """
        heap = []  # (score, doc_id) の最小ヒープ

        # 各termのpostingリストを順に処理
        for term in query_terms:
            plist = self.postings.get(term, [])
            df = len(plist)
            if df == 0:
                continue
            idf = math.log((self.doc_count / df), 10)

            for doc_id, tf in plist:
                tf_weight = 1 + math.log(tf, 10)
                score = tf_weight * idf

                # 既にdoc_idがヒープ内にあるかをチェック
                # → 単純ループで探す（Kが小さい前提）
                for i, (_, d_id) in enumerate(heap):
                    if d_id == doc_id:
                        # 同一ドキュメントが複数termに出現：スコアを加算更新
                        new_score = heap[i][0] + score
                        heap[i] = (new_score, d_id)
                        heapq.heapify(heap)
                        break
                else:
                    # 新規doc_idなら追加
                    heapq.heappush(heap, (score, doc_id))
                    if len(heap) > k:
                        heapq.heappop(heap)

        # スコア降順に並べ替えて返す
        topk = sorted(heap, key=lambda x: x[0], reverse=True)
        return [(doc_id, score) for score, doc_id in topk]
    
    def tfidf_daat_topk(self, query_terms, k=10):
        """
        DAAT(Document-At-A-Time) によるTF-IDFスコアリング
        posting listを同時に走査し，doc_id単位でスコアを合算してTop-Kを求める．
        """

        # --- クエリ内のpostingリストを取得 ---
        postings = []
        for term in query_terms:
            plist = self.postings.get(term, [])
            if plist:
                df = len(plist)
                idf = math.log((self.doc_count / df), 10)
                # 各要素: (doc_id, tf, idf)
                postings.append([(doc_id, tf, idf) for doc_id, tf in plist])

        if not postings:
            return []

        # 各postingのカーソルを初期化（indexごとに位置を保持）
        pointers = [0] * len(postings)
        heap = []  # 最小ヒープ（スコア, doc_id）

        # === Document-At-A-Timeループ ===
        while True:
            # 各リストの現在doc_idを収集
            current_ids = []
            for i, plist in enumerate(postings):
                if pointers[i] < len(plist):
                    current_ids.append(plist[pointers[i]][0])

            if not current_ids:
                break  # 全リスト走査完了

            min_doc = min(current_ids)  # 現在の最小doc_idを決定
            score_sum = 0.0

            # --- すべてのpostingリストを確認 ---
            for i, plist in enumerate(postings):
                if pointers[i] < len(plist) and plist[pointers[i]][0] == min_doc:
                    _, tf, idf = plist[pointers[i]]
                    tf_weight = 1 + math.log(tf, 10)
                    score_sum += tf_weight * idf
                    pointers[i] += 1  # doc_idが一致したリストは1つ進める

            # --- スコアをTop-Kヒープに追加 ---
            heapq.heappush(heap, (score_sum, min_doc))
            if len(heap) > k:
                heapq.heappop(heap)

        # スコア降順に整列して返す
        topk = sorted(heap, key=lambda x: x[0], reverse=True)
        return [(doc_id, score) for score, doc_id in topk]


## TFIDFの加算による類似度

TF-IDFは単語の重み付け手法であり，文書全体の類似度を直接表すわけではない．
ここではTF-IDFを用いて，クエリに対する文書スコアを計算する．

### 数式
$$
\mathrm{Score}(Q, d)
= \sum_{t \in Q}
\left( 1 + \log_{10}\!\bigl(\mathrm{tf}(t, d)\bigr) \right)
\times
\log_{10}\!\left( \frac{N}{\mathrm{df}(t)} \right)
$$

In [26]:
inv_arr=InvertedIndexArray(); inv_arr.build(chunked_docs)


In [27]:
# (doc_id, tf)で表示
inv_arr.show_postings("love")
inv_arr.show_postings("god")


love: [(6, 1), (7, 2), (18, 1), (23, 1), (24, 3), (25, 2), (28, 1), (30, 2), (36, 1), (37, 1), (38, 1), (40, 1), (42, 1), (46, 1), (48, 3), (49, 1), (50, 2), (54, 2), (55, 2), (56, 1), (57, 1), (58, 1), (60, 1), (61, 1), (62, 1), (67, 1), (72, 2), (73, 1), (74, 1), (81, 1), (83, 1), (85, 2), (88, 1), (89, 2), (90, 1), (93, 1), (98, 1), (109, 2), (117, 1), (120, 3), (132, 2), (133, 1), (134, 1), (135, 1), (136, 1), (138, 1), (140, 1), (142, 1), (144, 2), (148, 1), (149, 2), (170, 1), (172, 1), (173, 3), (174, 5), (178, 1), (191, 2), (207, 1), (208, 2), (215, 1), (217, 1), (219, 2), (231, 1), (242, 1), (245, 1), (256, 1), (264, 1), (265, 1), (266, 1), (269, 1), (271, 1), (279, 1), (285, 2), (288, 1), (292, 1), (296, 2), (297, 1), (299, 1), (300, 1), (307, 1), (308, 2), (311, 1), (313, 1), (315, 1), (317, 1), (321, 1), (332, 2), (339, 3), (358, 1), (359, 1), (363, 1), (370, 1), (376, 4), (402, 1), (405, 1), (408, 1), (413, 1), (414, 1), (417, 2), (422, 1), (431, 3), (432, 2), (433, 2), (4

In [28]:
print("AND merge:", inv_arr.and_search_merge("love", "god"))
print("AND skip:", inv_arr.and_search_skip("love", "god"))


AND merge: [264, 480, 481, 488, 666, 703, 770, 776, 841, 843, 910, 911, 1002, 1004, 1005, 1010, 1011, 1012, 1015, 1023, 1041, 1042, 1081, 1084, 1095, 1114, 1245, 1382, 1427, 1496, 1497, 1502, 1511, 1517, 1533, 1534, 1549, 1560, 1563, 1568, 1570, 1579, 1613, 1619, 1622, 1690, 1695, 1700, 1702, 1704, 1709, 1747, 1813, 1827, 1845, 1873, 1889, 1898, 1899, 1903, 1904, 1906, 1914, 1927, 1937, 1946, 1947, 1962, 1964, 1985, 1991, 2026, 2047, 2050, 2058, 2060, 2069, 2077, 2095, 2102, 2112, 2124, 2178, 2182, 2183, 2187, 2188, 2191, 2194, 2196, 2200, 2211, 2212, 2213, 2215, 2216, 2217, 2218, 2220, 2222, 2223, 2229, 2230, 2231, 2232, 2233, 2234, 2235, 2236, 2237, 2240, 2241, 2243, 2244, 2246, 2247, 2249, 2250, 2254, 2255, 2257, 2258, 2259, 2260, 2264, 2269, 2273, 2274, 2275, 2279, 2280, 2281, 2287, 2288, 2289, 2290, 2291, 2292, 2293, 2294, 2296, 2298, 2320, 2321, 2327, 2332, 2525, 2526, 2534, 2552, 2554, 2555, 2557, 2569, 2606, 2628, 2754, 2838, 2980, 3003, 3024, 3268, 3579, 3634, 3684, 3691, 3731

In [29]:
query = ["love", "love", "god"]
print("TF-IDF:", inv_arr.query_tfidf_scores(query))
print("BM25:", inv_arr.bm25(query))


TF-IDF: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 1.638195179367127, 7: 2.1313410671087683, 8: 0.0, 9: 0.0, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0, 15: 0.0, 16: 0.0, 17: 0.0, 18: 1.638195179367127, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0, 23: 1.638195179367127, 24: 2.419812918822473, 25: 2.1313410671087683, 26: 0.0, 27: 0.0, 28: 1.638195179367127, 29: 0.0, 30: 2.1313410671087683, 31: 0.0, 32: 0.0, 33: 0.0, 34: 0.0, 35: 0.0, 36: 1.638195179367127, 37: 1.638195179367127, 38: 1.638195179367127, 39: 0.0, 40: 1.638195179367127, 41: 0.0, 42: 1.638195179367127, 43: 0.0, 44: 0.0, 45: 0.0, 46: 1.638195179367127, 47: 0.0, 48: 2.419812918822473, 49: 1.638195179367127, 50: 2.1313410671087683, 51: 0.0, 52: 0.0, 53: 0.0, 54: 2.1313410671087683, 55: 2.1313410671087683, 56: 1.638195179367127, 57: 1.638195179367127, 58: 1.638195179367127, 59: 0.0, 60: 1.638195179367127, 61: 1.638195179367127, 62: 1.638195179367127, 63: 0.0, 64: 0.0, 65: 0.0, 66: 0.0, 67: 1.638195179367127, 68: 0.0, 69: 0.0, 7

In [32]:
query = ["love", "god", "love"]
top_docs = inv_arr.tfidf_topk(query, k=5)

print("=== TF-IDF Top-5 (streamed) ===")
for doc_id, score in top_docs:
    print(f"Doc{doc_id}: {score:.4f}")


=== TF-IDF Top-5 (streamed) ===
Doc2290: 4.5026
Doc4077: 3.5649
Doc3826: 3.5158
Doc4244: 3.3442
Doc4091: 3.2764


In [33]:
query = ["love", "god", "love"]
top_docs = inv_arr.tfidf_daat_topk(query, k=5)

print("=== TF-IDF Top-5 (By DAAT) ===")
for doc_id, score in top_docs:
    print(f"Doc{doc_id}: {score:.4f}")


=== TF-IDF Top-5 (By DAAT) ===
Doc2290: 4.5026
Doc2291: 4.2344
Doc2026: 3.7801
Doc2289: 3.7431
Doc2292: 3.6134



## コサイン類似度 (Cosine Similarity)

TF-IDFは単語の重み付け手法であり，文書全体の類似度を直接表すわけではない．
ここではTF-IDFベクトルを用いて，クエリと文書の「方向の近さ」（コサイン類似度）を計算する．

### 数式
$$
\mathrm{sim}(\vec{q}, \vec{d})
= \frac{\vec{q} \cdot \vec{d}}{\|\vec{q}\| \, \|\vec{d}\|}
= \frac{\sum_t w_{t,q} \, w_{t,d}}
       {\sqrt{\sum_t w_{t,q}^2} \, \sqrt{\sum_t w_{t,d}^2}}
$$

- 値域は **[0, 1]** （1に近いほど類似）  
- 文書長や語数の影響を正規化できる  
- 実際の検索エンジンでは **TF-IDF × Cosine** が標準構成


In [None]:
def cosine_similarity_topk(inv, query_terms, k=10):
    """
    コサイン類似度に基づくTop-K検索
    ---------------------------------
    TF-IDFベクトル w_{t,q}, w_{t,d} を用いて，
    次の式に基づく類似度を求める：
    
        sim(q, d) = (Σ_t w_{t,q} w_{t,d}) / (||q|| ||d||)
    
    ここで:
        w_{t,q} = IDF(t)
        w_{t,d} = (1 + log10(TF_{t,d})) * IDF(t)
    
    Parameters
    ----------
    inv : InvertedIndexArray
        転置インデックス（TF情報付き）
    query_terms : list[str]
        クエリ中の単語リスト
    k : int
        上位k件を返す

    Returns
    -------
    list of (doc_id, score)
        コサイン類似度スコアの降順上位k件
    """
    vocab = inv.vocab
    N = inv.doc_count

    # --- クエリベクトル q と 文書行列 D の初期化 ---
    q_vec = np.zeros(len(vocab))
    d_mat = np.zeros((N, len(vocab)))

    # --- TF-IDF重み付け ---
    # クエリはIDFのみ（TF=1とみなす）
    for term in query_terms:
        if term not in inv.postings:
            continue
        term_idx = vocab.index(term)
        plist = inv.postings[term]
        df = len(plist)
        idf = math.log(N / df, 10)

        # クエリ側重み w_{t,q}
        q_vec[term_idx] = idf

        # 文書側重み w_{t,d}
        for doc_id, tf in plist:
            d_mat[doc_id, term_idx] = (1 + math.log(tf, 10)) * idf

    # --- コサイン類似度計算 ---
    scores = []
    q_norm = norm(q_vec)
    for doc_id in range(N):
        d_norm = norm(d_mat[doc_id])
        if q_norm == 0 or d_norm == 0:
            score = 0.0
        else:
            score = np.dot(q_vec, d_mat[doc_id]) / (q_norm * d_norm) # 式に当たる部分
        scores.append((doc_id, score))

    # --- 上位k件を返す ---
    topk = sorted(scores, key=lambda x: x[1], reverse=True)[:k]
    return topk


| 数式の記号                            | コードでの変数                              | 説明                       |
| -------------------------------- | ------------------------------------ | ------------------------ |
| ( $w_{t,q}$ )                      | `q_vec[idx]`                         | クエリ語 t のTF-IDF重み（TF=1仮定） |
| ( $w_{t,d}$ )                      | `d_mat[doc_id, idx]`                 | 文書 d における語 t のTF-IDF重み   |
| ( $\vec{q} \cdot \vec{d}$ )        | `np.dot(q_vec, d_mat[doc_id])`       | 内積                       |
| ( \| $\vec{q}$ \| , \|$\vec{d}$\| )         | `norm(q_vec)`, `norm(d_mat[doc_id])` | ベクトルのL2ノルム               |
| ($\mathrm{sim}(\vec{q}, \vec{d}$)) | `score`                              | コサイン類似度スコア               |


In [None]:

# === コサイン類似度の実行例 ===
#query = ["love", "god", "love"]
#query = ["love", "battle", "heaven", "war"]
query = ["love", "wisdom", "truth", "justice"]
"""
wisdomのようなレアな単語が入ると、その単語が入っている文書のコサイン類似度が高くなる。
"""


top_docs = cosine_similarity_topk(inv_arr, query, k=10)

print("=== Cosine Similarity (TF-IDFベクトル基盤) ===")
for doc_id, score in top_docs:
    print(f"Doc{doc_id}: {score:.4f}")


=== Cosine Similarity (TF-IDFベクトル基盤) ===
Doc1587: 0.9694
Doc3755: 0.9068
Doc1244: 0.8348
Doc1371: 0.8348
Doc1580: 0.8348
Doc292: 0.8159
Doc613: 0.8159
Doc1599: 0.8159
Doc3206: 0.8159
Doc541: 0.8112



## Qに対する文書dの類似度のポイントまとめ

| 観点 | TF-IDF/BM25 | TF-IDF＋Cosine |
|------|-------------|----------------|
| 評価対象 | 各単語の重要度 | ベクトル全体の方向一致 |
| 比較対象 | スカラー値 | ベクトル同士 |
| 正規化 | なし | 文書長・語数で正規化される |
| 値の意味 | 大きいほどその語が重要 | 1に近いほど文書全体が類似 |

- TF-IDFのみ → 「重要単語が多い文書」を上げる  
- Cosine → 「重要単語が同じ方向に並んでいる文書」を上げる  
- 実際の検索エンジンは BM25 を基本としている．ただし，今後説明するembedding等を使ったベクトルには，Cosine類似度が用いられる
