In [None]:
from typing import List
from pathlib import Path

def read_text(path: str) -> str:
    """Read a text file, trying several encodings before relaxing strictness.

    The file is decoded as UTF-8 with BOM handling, plain UTF-8 and CP932, in
    that order; the first attempt that decodes without error wins.  When every
    attempt raises ``UnicodeDecodeError`` the file is read as UTF-8 with
    undecodable bytes replaced.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content.
    """
    source = Path(path)
    candidate_encodings = ("utf-8-sig", "utf-8", "cp932")

    for codec in candidate_encodings:
        try:
            text = source.read_text(encoding=codec)
        except UnicodeDecodeError:
            # This codec does not fit the file; move on to the next candidate.
            continue
        return text

    # Last resort: never fail on bad bytes, substitute a replacement character.
    return source.read_text(encoding="utf-8", errors="replace")

def split_into_chunks(doc_file: str) -> List[str]:
    """Split a document into paragraph-level chunks.

    The file is read with ``read_text`` and cut on blank lines (``"\\n\\n"``).
    Pieces that are empty or contain only whitespace are dropped, so an empty
    file or a run of three or more newlines does not produce blank chunks.
    The text of the remaining chunks is left untouched.

    Args:
        doc_file: Path of the document to split.

    Returns:
        The non-blank chunks in document order; an empty list when the file
        has no non-whitespace content.
    """
    content = read_text(doc_file)

    # str.split yields "" between consecutive separators; filter those (and
    # whitespace-only pieces) out instead of passing them downstream.
    return [chunk for chunk in content.split("\n\n") if chunk.strip()]

# Chunk the sample document; `chunks` is reused by the embedding and storage
# cells further down.
chunks = split_into_chunks("doc_test2.md")

# Print every chunk with its index so the split can be inspected by eye.
for i, chunk in enumerate(chunks):
    print(f"[{i}] {chunk}\n")

In [None]:
from sentence_transformers import SentenceTransformer

# Alternative embedding models that were tried; kept for quick switching.
# embedding_model = SentenceTransformer("sonoisa/sentence-bert-base-ja-mean-tokens-v2")
# embedding_model = SentenceTransformer("sentence-transformers/LaBSE")
# Active model.  trust_remote_code=True lets the model repository's own Python
# code run locally, so only use it with a repository you trust.
# NOTE(review): the nomic-embed-text-v1 model card appears to expect task
# prefixes such as "search_document: " / "search_query: " on inputs — confirm
# whether embed_chunk callers should add them.
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)


def embed_chunk(chunk: str) -> List[float]:
    """Embed one piece of text with the notebook-wide ``embedding_model``.

    Args:
        chunk: Text to embed (a document chunk or a query).

    Returns:
        The embedding as a plain Python list, encoded with
        ``normalize_embeddings=True``.
    """
    # Convert the encoder output to a list so it can be stored or printed
    # without depending on the array type.
    return embedding_model.encode(chunk, normalize_embeddings=True).tolist()


# Smoke test: embed a short Japanese sentence, then show the vector's
# dimensionality followed by the full vector.
embedding = embed_chunk("テストです。")
print(len(embedding))
print(embedding)

In [None]:
# Embed every chunk one at a time; `embeddings[i]` corresponds to `chunks[i]`.
embeddings = [embed_chunk(chunk) for chunk in chunks]

# Sanity check: number of vectors, then the first vector in full.
print(len(embeddings))
print(embeddings[0])

In [None]:
import chromadb

# Ephemeral Chroma client — presumably in-memory only, so stored vectors do not
# outlive the kernel (confirm against the chromadb docs).
chromadb_client = chromadb.EphemeralClient()
# Reuse the "default" collection if it already exists, otherwise create it, so
# re-running this cell does not fail on an existing name.
chromadb_collection = chromadb_client.get_or_create_collection(name="default")

def save_embeddings(
    chunks: List[str],
    embeddings: List[List[float]],
    batch_size: int = 1000,
) -> None:
    """Store chunks and their embeddings in ``chromadb_collection``.

    Records are written in batches rather than one ``add`` call per chunk.
    Each record's id is its position in ``chunks`` rendered as a string
    (``"0"``, ``"1"``, ...).

    Args:
        chunks: Document texts to store.
        embeddings: One embedding per chunk, in the same order.
        batch_size: Maximum number of records sent per ``add`` call.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length, or if
            ``batch_size`` is not positive.
    """
    total = len(chunks)
    if total != len(embeddings):
        # zip() would silently drop the surplus items and hide the mismatch.
        raise ValueError(
            f"chunks and embeddings must have the same length "
            f"(got {total} and {len(embeddings)})"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # With no chunks the loop body never runs, so the collection is not
    # called with empty lists.
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        chromadb_collection.add(
            documents=list(chunks[start:stop]),
            embeddings=list(embeddings[start:stop]),
            ids=[str(position) for position in range(start, stop)],
        )

# Persist the chunk texts and their vectors into the Chroma collection.
save_embeddings(chunks, embeddings)

In [None]:
def retrieve(query: str, top_k: int) -> List[str]:
    """Fetch stored chunks for a query via vector search.

    The query is embedded with ``embed_chunk`` and the resulting vector is
    used to query ``chromadb_collection``.

    Args:
        query: The user's question.
        top_k: Number of results requested from the collection.

    Returns:
        The documents returned for the (single) query, in the order the
        collection returned them.
    """
    query_vector = embed_chunk(query)
    search_args = {"query_embeddings": [query_vector], "n_results": top_k}
    hits = chromadb_collection.query(**search_args)

    # Results are grouped per query embedding; exactly one was sent.
    return hits['documents'][0]

# Alternative (Chinese) test queries, kept for switching:
#   1) "What are the 3 secret gadgets Doraemon used?"
#   2) "Which gadgets did Nobita use, and what are they?"
#query = "哆啦A梦使用的3个秘密道具分别是什么？"
#query = "大雄使用了哪些道具，分别是什么？"
# Active (Japanese) query; it is reused by the rerank and generation cells.
query = "ドラえもんの道具はいくつか、のび太が使った道具は何ですか？役割は何ですか？"

# Request 7 candidates here; the rerank step narrows them down afterwards.
retrieved_chunks = retrieve(query, 7)

# Print the candidates in the order the vector search returned them.
for i, chunk in enumerate(retrieved_chunks):
    print(f"[{i}] {chunk}\n")

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoders already loaded in this session, keyed by model name, so that
# repeated rerank() calls do not reload the model each time.
_CROSS_ENCODER_CACHE = {}


def _get_cross_encoder(model_name: str):
    """Return the CrossEncoder for ``model_name``, constructing it on first use."""
    if model_name not in _CROSS_ENCODER_CACHE:
        _CROSS_ENCODER_CACHE[model_name] = CrossEncoder(model_name)
    return _CROSS_ENCODER_CACHE[model_name]


def rerank(
    query: str,
    retrieved_chunks: List[str],
    top_k: int,
    model_name: str = 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1',
) -> List[str]:
    """Reorder retrieved chunks by cross-encoder score and keep the best ones.

    Every ``(query, chunk)`` pair is scored with the cross-encoder; chunks are
    returned from highest to lowest score.

    Args:
        query: The user's question.
        retrieved_chunks: Candidate chunks, e.g. from ``retrieve``.
        top_k: Maximum number of chunks to return.
        model_name: Cross-encoder model to score with.

    Returns:
        At most ``top_k`` chunks, best first.  An empty list when there are no
        candidates or ``top_k`` is not positive (the model is not loaded in
        that case).
    """
    if top_k <= 0 or not retrieved_chunks:
        return []

    cross_encoder = _get_cross_encoder(model_name)
    pairs = [(query, chunk) for chunk in retrieved_chunks]
    scores = cross_encoder.predict(pairs)

    # sorted() is stable, so equally scored chunks keep their retrieval order.
    ranked = sorted(
        zip(retrieved_chunks, scores),
        key=lambda scored: scored[1],
        reverse=True,
    )
    return [chunk for chunk, _ in ranked[:top_k]]

# Keep the 5 best of the retrieved candidates according to the cross-encoder.
reranked_chunks = rerank(query, retrieved_chunks, 5)

# Print the reranked chunks, best first.
for i, chunk in enumerate(reranked_chunks):
    print(f"[{i}] {chunk}\n")

In [None]:
from dotenv import load_dotenv, find_dotenv
from google import genai

import os

# Locate a .env file using the current working directory as the starting
# point (usecwd=True), then load it; override=True lets values from the file
# replace variables that are already set in the environment.
dotenv_path = find_dotenv(usecwd=True)
load_dotenv(dotenv_path, override=True)
api_key = os.getenv("GEMINI_API_KEY")
# Debug aid (disabled): reports only whether a key was found, never the key.
#print("GEMINI_API_KEY loaded:", bool(api_key))
# Pass the key explicitly when present; otherwise build the client without
# arguments — presumably it then relies on its own credential discovery
# (confirm against the google-genai docs).
google_client = genai.Client(api_key=api_key) if api_key else genai.Client()

def generate(query: str, chunks: List[str], model: str = "gemini-2.5-flash") -> str:
    """Ask Gemini to answer a question using only the supplied chunks.

    A Japanese prompt is built from the question and the chunks (separated by
    blank lines), printed for inspection, and sent through ``google_client``.

    Args:
        query: The user's question.
        chunks: Context passages the answer must be based on.
        model: Gemini model name.  The author's notes report correct answers
            with "gemini-2.5-flash" and incorrect answers with
            "gemini-2.5-flash-lite".

    Returns:
        The text of the model's response, or an empty string when the
        response carries no text.
    """
    # Joined outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    joined_chunks = "\n\n".join(chunks)

    prompt = f"""あなたは知識アシスタントです。ユーザーの質問と、以下の断片（チャンク）に基づいて、正確な回答を生成してください。日本語に翻訳。

問題: {query}

チャンク:
{joined_chunks}

上記の内容に基づいて回答し、記載のない情報は作りません。日本語で対応します。"""

    # Show the exact prompt so the retrieved context can be checked by eye.
    print(f"{prompt}\n\n---\n")

    response = google_client.models.generate_content(
        model=model,
        contents=prompt,
    )

    # `text` can be None when the response has no text part; keep the
    # declared `str` return type.
    return response.text or ""

# Final step of the pipeline: answer the query from the reranked chunks.
answer = generate(query, reranked_chunks)
print(answer)