In [1]:
# -*- coding: utf-8 -*-
import sqlite3
from groq import Groq, RateLimitError, APIError # Import specific Groq errors
import dotenv
import json
import os
import time
import faiss
import chromadb
from pydantic import BaseModel, Field, ValidationError
import textwrap
from typing import Optional, Union, List, Dict
import numpy as np
import logging
from sentence_transformers import SentenceTransformer

# --- Configuration ---
# Module-level settings for the whole script. Everything below is read by the
# helper functions and the numbered script sections further down the file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SQLite file that holds the `w3c_standards` table read in section 2.
DB_FILE = 'w3c_data.db'

# --- List of Llama models on Groq to try in order ---
# Both LLM helpers iterate over this list front to back and move on to the next
# entry when a model is rate-limited or returns a server-side error.
LLAMA_MODEL_FALLBACK_LIST = [
    "meta-llama/llama-4-scout-17b-16e-instruct",
    "meta-llama/llama-4-maverick-17b-128e-instruct",
    "llama-3.3-70b-specdec",
    "llama-3.3-70b-versatile",
    "llama-3.2-90b-vision-preview",
    "llama-3.2-11b-vision-preview",
    "llama-3.1-8b-instant",
    "llama3-70b-8192",
    "llama3-8b-8192",
]
logging.info(f"訊息： 將依序嘗試以下 Groq Llama 模型: {LLAMA_MODEL_FALLBACK_LIST}")

# SentenceTransformer model names: one for the English text, one for the
# Chinese translations.
EN_EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
ZH_EMBEDDING_MODEL_NAME = 'shibing624/text2vec-base-chinese'

# Character limits applied (by slicing) to the text sent to the LLM for
# extraction and translation respectively.
MAX_CONTENT_LENGTH = 6000
MAX_TRANSLATION_LENGTH = 2000
# Passed as `max_tokens` to the extraction / translation chat completions.
MAX_OUTPUT_TOKENS_EXTRACT = 6144
MAX_OUTPUT_TOKENS_TRANSLATE = 2048
# Keep sleep intervals as they reduce overall likelihood of hitting limits
# (seconds; used with time.sleep after each extraction / translation call).
SLEEP_INTERVAL_EXTRACT = 5.0
SLEEP_INTERVAL_TRANSLATE = 2.0

# Output directories for the FAISS index files and the Chroma persistent store.
FAISS_BASE_DIR = "faiss_w3c_stores"
CHROMA_BASE_DIR = "chroma_w3c_stores"

# One vector store is produced per (field, language) combination of these two lists.
# For 'zh' the script reads the "<field>_zh" attribute of the extracted object.
FIELDS_TO_VECTORIZE = ['abstract', 'status_of_document', 'content_summary', 'original_content_snippet']
LANGUAGES = ['en', 'zh']

# --- Pydantic model definition (Class Structure) ---
# Structured record for one W3C document.
# NOTE: documentation is kept in `#` comments on purpose. The JSON schema of this
# model is embedded in the extraction prompt, and a class docstring would
# presumably surface there as a `description` entry (confirm against pydantic).
class W3CStandardDetails(BaseModel):
    # --- Fields the LLM is asked to extract (title/abstract/status/summary are required) ---
    title: str = Field(description="The main official title of the document.")
    publish_date: Optional[str] = Field(default=None, description="The publication date mentioned in the document (e.g., '10 October 2023'), if clearly stated.")
    detail: Optional[str] = Field(default=None, description="Extracted metadata block (versions, editors, etc.).")
    abstract: str = Field(description="Extracted 'Abstract' section content.")
    status_of_document: str = Field(description="Extracted 'Status of This Document' section content.")
    content_summary: str = Field(description="Extracted main body text content (excluding abstract, status, detail).")
    # --- Filled in by the script after extraction; removed from the schema shown to the LLM ---
    original_content_snippet: Optional[str] = Field(default=None, description="The original content snippet fed to the LLM for extraction.")
    # --- Chinese translations, set field by field in the translation step ---
    abstract_zh: Optional[str] = Field(default=None, description="Chinese translation of the abstract.")
    status_of_document_zh: Optional[str] = Field(default=None, description="Chinese translation of the status_of_document.")
    content_summary_zh: Optional[str] = Field(default=None, description="Chinese translation of the content_summary.")
    original_content_snippet_zh: Optional[str] = Field(default=None, description="Chinese translation of the original_content_snippet.")
    # Unknown keys in the LLM's JSON are dropped instead of failing validation.
    model_config = {"extra": "ignore"}


# --- Helper Functions ---
def get_faiss_paths(base_dir: str, field_name: str, lang: str):
    """Return ``(index_path, metadata_path)`` for one field/language FAISS store.

    Both files live directly under ``base_dir`` and share the stem
    ``"<field_name>_<lang>"``; the index uses the ``.index`` suffix and the
    metadata sidecar uses ``_meta.json``.
    """
    stem = "_".join((field_name, lang))
    return (
        os.path.join(base_dir, stem + ".index"),
        os.path.join(base_dir, stem + "_meta.json"),
    )

def get_chroma_config(base_dir: str, field_name: str, lang: str):
    """Return ``(db_path, collection_name)`` for one field/language Chroma store.

    The path is ``<base_dir>/<field_name>_<lang>_db`` and the collection is
    named ``w3c_<field_name>_<lang>``.
    """
    suffix = "_".join((field_name, lang))
    return os.path.join(base_dir, suffix + "_db"), "w3c_" + suffix

# --- Modified Helper Functions with Model Fallback ---

def process_data_to_detailed_structure(client: Groq, title: str, content: str) -> Union[W3CStandardDetails, None]:
    """Extract structured document details from raw text with a Groq LLM.

    The models in ``LLAMA_MODEL_FALLBACK_LIST`` are tried in order. A model is
    skipped (and the next one tried) on a rate limit or a 5xx API error; any
    other failure ends the attempt for this document.

    Args:
        client: Groq client used for the chat completion calls.
        title: Document title, included in the prompt and in log messages.
        content: Raw document text; only the first ``MAX_CONTENT_LENGTH``
            characters are sent to the model.

    Returns:
        A validated ``W3CStandardDetails`` whose ``original_content_snippet``
        is set to the text that was sent to the model, or ``None`` when the
        response is not valid JSON, fails validation, a non-retryable error
        occurs, or every model in the list was skipped.
    """
    content_snippet = content[:MAX_CONTENT_LENGTH] + ("..." if len(content) > MAX_CONTENT_LENGTH else "")

    # Build the schema shown to the model: drop the fields the script fills in
    # itself (translations and the original snippet).
    schema = W3CStandardDetails.model_json_schema()
    fields_to_exclude_for_extraction = ['abstract_zh', 'status_of_document_zh', 'content_summary_zh', 'original_content_snippet_zh', 'original_content_snippet']
    schema_properties = schema.get('properties', {})
    for field in fields_to_exclude_for_extraction:
        schema_properties.pop(field, None)
    if 'required' in schema:
        schema['required'] = [req for req in schema['required'] if req not in fields_to_exclude_for_extraction]

    system_prompt = textwrap.dedent(f"""
        You are an AI assistant specialized in extracting specific sections... [SAME AS BEFORE] ...
        JSON Schema (Target Structure for your output):
        ```json
        {json.dumps(schema, indent=2)}
        ```
        Output ONLY the single, valid JSON object adhering to the schema... [SAME AS BEFORE] ...
        """).strip()
    user_prompt = textwrap.dedent(f"""
        Please extract the detailed information from the following text... [SAME AS BEFORE] ...
        Title: {title}
        Content Snippet: {content_snippet}
        """).strip()

    for model_id in LLAMA_MODEL_FALLBACK_LIST:
        logging.debug(f"    嘗試使用模型 (Extraction): {model_id}")
        # Bound up front so the JSON error handler can always log it.
        response_content = None
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
                model=model_id,  # Use current model from list
                temperature=0.1,
                max_tokens=MAX_OUTPUT_TOKENS_EXTRACT,
                response_format={"type": "json_object"},
            )
            response_content = chat_completion.choices[0].message.content

            logging.info(f"--- Raw JSON Response for Title: '{title[:50]}...' (using {model_id}) ---")
            logging.info(response_content)
            logging.info(f"--- End Raw JSON Response ---")

            json_output = json.loads(response_content)
            # Required text sections may be absent or returned as JSON null
            # when the document lacks them; normalise both to an empty string
            # so validation does not reject the whole document.
            for field in ['abstract', 'status_of_document', 'content_summary']:
                if json_output.get(field) is None:
                    json_output[field] = ""

            try:
                structured_object = W3CStandardDetails.model_validate(json_output)
                structured_object.original_content_snippet = content_snippet
                logging.info(f"    => 成功使用模型 (Extraction): {model_id}")
                return structured_object  # Success! Return result.

            except ValidationError as e_pydantic:
                logging.error(f"Pydantic 驗證失敗 (模型: {model_id})。標題：'{title}'. 錯誤：{e_pydantic}")
                logging.error(f"導致驗證失敗的原始 JSON 內容 (模型: {model_id})：{response_content}")
                # Deliberate choice of the original code: a validation failure
                # ends the attempt instead of falling back to the next model.
                return None

        except json.JSONDecodeError as e_json:
            logging.error(f"解碼來自 Groq (模型: {model_id}) 的 JSON 回應時出錯：{e_json}")
            logging.error(f"原始回應內容 (模型: {model_id}, 非 JSON 或格式錯誤)：{response_content}")
            # The model did not follow the JSON instruction; fail this attempt.
            return None

        except RateLimitError:
            # Must precede the APIError handler: RateLimitError is a subclass.
            logging.warning(f"遭遇速率限制 (429) 使用模型 {model_id} (Extraction)。正在嘗試下一個模型...")
            time.sleep(1)  # Add a small extra delay before trying next model
            continue

        except APIError as e_api:
            logging.error(f"Groq API 錯誤 (模型: {model_id}, Extraction)，標題 '{title}': {e_api}")
            # Only HTTP status errors carry `status_code`; connection/timeout
            # errors do not, so read it defensively instead of raising
            # AttributeError from inside this handler.
            status_code = getattr(e_api, "status_code", None)
            if status_code is not None and status_code >= 500:
                logging.warning("檢測到伺服器端錯誤，嘗試下一個模型...")
                time.sleep(1)
                continue
            # Bad request, auth or connection problems are not fixed by
            # switching models.
            return None

        except Exception as e:
            # Catch-all so one bad document cannot abort the whole run.
            error_message = str(e)
            logging.error(f"意外錯誤 (模型: {model_id}, Extraction)，標題 '{title}': {error_message}")
            # Check for rate limit text just in case RateLimitError wasn't raised.
            if "rate limit" in error_message.lower() or "429" in error_message.lower():
                logging.warning(f"檢測到速率限制錯誤文本，嘗試下一個模型...")
                time.sleep(1)
                continue
            return None

    # If the loop finishes without returning, every model was skipped.
    logging.error(f"所有備選模型均無法成功提取標題 '{title}' 的資料。")
    return None


def translate_text_with_groq(client: Groq, text: str, target_language: str = "Traditional Chinese") -> Optional[str]:
    """Translate ``text`` with a Groq LLM, falling back across models.

    The models in ``LLAMA_MODEL_FALLBACK_LIST`` are tried in order. A model is
    skipped on a rate limit, a 5xx API error, or an empty reply; any other
    failure ends the attempt.

    Args:
        client: Groq client used for the chat completion calls.
        text: Source text; only the first ``MAX_TRANSLATION_LENGTH``
            characters are sent to the model.
        target_language: Language name inserted into the system prompt.

    Returns:
        The stripped translation, or ``None`` when ``text`` is empty or
        whitespace, a non-retryable error occurs, or every model was skipped.
    """
    if not text or not text.strip():
        return None
    text_snippet = text[:MAX_TRANSLATION_LENGTH] + ("..." if len(text) > MAX_TRANSLATION_LENGTH else "")
    system_prompt = f"You are a high-quality machine translation engine. Translate the following text accurately into {target_language}. Output ONLY the translated text, without any introductory phrases, explanations, or quotation marks."
    user_prompt = text_snippet

    for model_id in LLAMA_MODEL_FALLBACK_LIST:
        logging.debug(f"      嘗試使用模型 (Translation): {model_id}")
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
                model=model_id,  # Use current model from list
                temperature=0.2,
                max_tokens=MAX_OUTPUT_TOKENS_TRANSLATE,
            )
            # `content` may be None; treat that like an empty reply so the
            # next model is tried instead of failing with AttributeError.
            translated_text = (chat_completion.choices[0].message.content or "").strip()
            if translated_text:
                logging.debug(f"      => 成功使用模型 (Translation): {model_id}")
                return translated_text  # Success!
            logging.warning(f"模型 {model_id} (Translation) 返回空或無效輸出: '{translated_text[:50]}...'")
            # Don't immediately fail, try next model
            continue

        except RateLimitError:
            # Must precede the APIError handler: RateLimitError is a subclass.
            logging.warning(f"遭遇速率限制 (429) 使用模型 {model_id} (Translation)。正在嘗試下一個模型...")
            time.sleep(1)  # Add a small extra delay
            continue

        except APIError as e_api:
            logging.error(f"Groq API 錯誤 (模型: {model_id}, Translation): {e_api}")
            # Only HTTP status errors carry `status_code`; connection/timeout
            # errors do not, so read it defensively instead of raising
            # AttributeError from inside this handler.
            status_code = getattr(e_api, "status_code", None)
            if status_code is not None and status_code >= 500:
                logging.warning("檢測到伺服器端錯誤，嘗試下一個模型...")
                time.sleep(1)
                continue
            return None  # Not fixable by switching models

        except Exception as e:
            # Catch-all so one bad field cannot abort the whole run.
            error_message = str(e)
            logging.error(f"意外錯誤 (模型: {model_id}, Translation): {error_message}")
            if "rate limit" in error_message.lower() or "429" in error_message.lower():
                logging.warning(f"檢測到速率限制錯誤文本，嘗試下一個模型...")
                time.sleep(1)
                continue
            return None

    # If the loop finishes without returning, every model was skipped.
    logging.error(f"所有備選模型均無法成功翻譯文本片段: '{text_snippet[:50]}...'")
    return None


# ==============================================
# --- Script Start ---
# ==============================================
logging.info("--- 開始執行 W3C 資料處理、翻譯與多向量儲存腳本 (逐項處理 + 模型回退) ---")

# --- 1. Load Env Vars & Init Groq ---
# Reads GROQ_API_KEY (optionally from a local .env file) and creates the Groq
# client used by the extraction and translation steps. Fatal problems abort
# with `raise SystemExit(1)`: the `exit()` helper is meant for the interactive
# shell and would report a success status after a logged error.
logging.info("--- 1. 載入環境變數與初始化 Groq Client ---")
if os.path.exists('.env'):
    dotenv.load_dotenv()
    logging.info("訊息： .env 檔案已載入。")
else:
    logging.warning("警告： 找不到 .env 檔案...")

groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    logging.error("錯誤： 找不到 GROQ_API_KEY。")
    raise SystemExit(1)
logging.info("訊息： 已找到 GROQ_API_KEY。")

try:
    groq_client = Groq(api_key=groq_api_key)
    logging.info(f"訊息： Groq 客戶端已成功初始化。")  # Model used will vary
except Exception as e:
    logging.error(f"錯誤： 初始化 Groq 客戶端時發生問題：{e}")
    raise SystemExit(1)

# --- 2. Load Database Data ---
# Reads every (title, content) pair from the `w3c_standards` table into `rows`.
# Fatal problems abort with `raise SystemExit(1)` rather than the interactive
# `exit()` helper, so a failed run reports a non-zero status. SystemExit is not
# an `Exception`, so the handler below does not swallow it, and the `finally`
# clause still closes the connection.
logging.info("--- 2. 連接 SQLite 資料庫並讀取資料 ---")
rows = []
conn = None
try:
    logging.info(f"訊息： 正在嘗試連接資料庫：{DB_FILE}...")
    conn = sqlite3.connect(DB_FILE)
    cursor = conn.cursor()
    logging.info("訊息： 資料庫連接成功。")
    # sqlite3.connect creates a missing file silently, so check for the table explicitly.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='w3c_standards';")
    if not cursor.fetchone():
        logging.error(f"錯誤： 資料庫 '{DB_FILE}' 中找不到資料表 'w3c_standards'。")
        raise SystemExit(1)
    logging.info("訊息： 已找到 'w3c_standards' 資料表。")
    cursor.execute('SELECT title, content FROM w3c_standards')
    rows = cursor.fetchall()
    logging.info(f"訊息： 已成功讀取 {len(rows)} 筆資料。")
except Exception as e:
    logging.error(f"資料庫讀取錯誤： {e}")
    raise SystemExit(1)
finally:
    if conn:
        conn.close()
        logging.info("訊息： 資料庫連接已關閉。")

if not rows:
    logging.error("錯誤： 未能從資料庫讀取到任何資料，程式終止。")
    raise SystemExit(1)

# --- 3. Initialize Embedding Models ---
# Loads one SentenceTransformer per language and records each model's output
# dimension, which sizes the FAISS indexes later. A load failure aborts with
# `raise SystemExit(1)` rather than the interactive `exit()` helper, so the
# run reports a non-zero status.
logging.info("\n--- 3. 初始化 Embedding Models ---")
try:
    logging.info(f"訊息： 正在載入英文 Embedding 模型 ('{EN_EMBEDDING_MODEL_NAME}')...")
    en_embedding_model = SentenceTransformer(EN_EMBEDDING_MODEL_NAME)
    en_embedding_dim = en_embedding_model.get_sentence_embedding_dimension()
    logging.info(f"訊息： 英文模型載入完成，維度: {en_embedding_dim}")

    logging.info(f"訊息： 正在載入中文 Embedding 模型 ('{ZH_EMBEDDING_MODEL_NAME}')...")
    zh_embedding_model = SentenceTransformer(ZH_EMBEDDING_MODEL_NAME)
    zh_embedding_dim = zh_embedding_model.get_sentence_embedding_dimension()
    logging.info(f"訊息： 中文模型載入完成，維度: {zh_embedding_dim}")
except Exception as e:
    logging.error(f"錯誤： 載入 Embedding 模型時發生錯誤: {e}")
    raise SystemExit(1)


# --- 4. Initialize Vector Store Handlers ---
# Prepares the in-memory accumulators for FAISS (filled in section 5, written
# in section 6) and opens the persistent Chroma client. A Chroma failure
# aborts with `raise SystemExit(1)` rather than the interactive `exit()`
# helper, so the run reports a non-zero status.
logging.info("\n--- 4. 初始化向量儲存處理器 ---")
os.makedirs(FAISS_BASE_DIR, exist_ok=True)
faiss_indices_in_memory: Dict[str, faiss.Index] = {}
faiss_metadata_in_memory: Dict[str, List[dict]] = {}
faiss_embeddings_in_memory: Dict[str, List[np.ndarray]] = {}
faiss_dims: Dict[str, int] = {}
# One accumulator per "<field>_<lang>" key; the dimension depends on the
# language because each language has its own embedding model.
for field in FIELDS_TO_VECTORIZE:
    for lang in LANGUAGES:
        key = f"{field}_{lang}"
        dim = en_embedding_dim if lang == 'en' else zh_embedding_dim
        faiss_metadata_in_memory[key] = []
        faiss_embeddings_in_memory[key] = []
        faiss_dims[key] = dim
logging.info("訊息： FAISS 內存結構準備完成。")

try:
    chroma_client = chromadb.PersistentClient(path=CHROMA_BASE_DIR)
    logging.info(f"訊息： ChromaDB Persistent Client 初始化完成 (路徑: {CHROMA_BASE_DIR})")
    # Embedding functions attached to the collections, looked up by language code.
    en_ef = chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EN_EMBEDDING_MODEL_NAME)
    zh_ef = chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(model_name=ZH_EMBEDDING_MODEL_NAME)
    chroma_ef_map = {'en': en_ef, 'zh': zh_ef}
except Exception as e:
    logging.error(f"錯誤： 初始化 ChromaDB Client 時發生錯誤: {e}")
    raise SystemExit(1)


# --- 5. Main Processing Loop (Item by Item with Fallback) ---
# For every database row: extract structured fields with the LLM, translate the
# vectorised fields to Chinese, embed each (field, language) text, then queue it
# for FAISS and upsert it into the matching Chroma collection.
logging.info("\n--- 5. 開始逐項處理、翻譯、嵌入和儲存 (帶模型回退) ---")
start_main_loop_time = time.time()
items_processed_successfully = 0
# Collection handles keyed by name, so each collection is looked up once
# instead of once per item.
chroma_collections = {}

for i, (title, content) in enumerate(rows):
    # NULL columns arrive as None; normalise both so slicing and prompt
    # building cannot raise and abort the whole run.
    title_str = "" if title is None else str(title)
    content_str = content if content is not None else ""
    logging.info(f"\n===== 正在處理第 {i+1}/{len(rows)} 筆資料：'{title_str[:50]}...' =====")

    # --- 5a. Extraction with Fallback ---
    logging.info("  步驟 5a: 提取結構化資料...")
    structured_object = process_data_to_detailed_structure(groq_client, title_str, content_str)
    time.sleep(SLEEP_INTERVAL_EXTRACT)  # Still sleep between ITEMS

    if not structured_object:
        logging.error(f"  => 提取失敗 (嘗試所有模型後)，跳過此筆資料的後續處理。")
        continue

    # --- 5b. Translation with Fallback ---
    logging.info("  步驟 5b: 翻譯...")
    translations_done_for_item = 0
    for field_en in FIELDS_TO_VECTORIZE:
        field_zh = f"{field_en}_zh"
        english_text = getattr(structured_object, field_en, None)
        if not (isinstance(english_text, str) and english_text.strip()):
            continue  # Nothing to translate for this field
        translated_text = translate_text_with_groq(groq_client, english_text)  # Fallback is inside
        if translated_text:
            setattr(structured_object, field_zh, translated_text)
            translations_done_for_item += 1
        else:
            logging.warning(f"    => 翻譯失敗 (嘗試所有模型後): 欄位='{field_en}'")
        time.sleep(SLEEP_INTERVAL_TRANSLATE)  # Sleep between different fields' translation calls
    logging.info(f"  => 完成 {translations_done_for_item} 個欄位的翻譯嘗試。")

    # --- 5c. Embedding and Saving ---
    logging.info("  步驟 5c: 嵌入並儲存至向量庫...")
    vector_stores_updated = 0
    for field in FIELDS_TO_VECTORIZE:
        for lang in LANGUAGES:
            store_key = f"{field}_{lang}"
            text_to_embed = None
            source_field_name = field
            if lang == 'en':
                text_to_embed = getattr(structured_object, field, None)
            elif lang == 'zh':
                source_field_name = f"{field}_zh"
                text_to_embed = getattr(structured_object, source_field_name, None)

            if not (isinstance(text_to_embed, str) and text_to_embed.strip()):
                continue  # Skip missing/empty text (e.g. a failed translation)

            logging.debug(f"    處理儲存: 欄位='{field}', 語言='{lang}'")
            try:
                embedding_model = en_embedding_model if lang == 'en' else zh_embedding_model
                embedding = embedding_model.encode([text_to_embed])[0]
                # Shape (1, dim) so section 6 can concatenate along axis 0.
                embedding_np = np.array(embedding).astype('float32').reshape(1, -1)
                metadata = {
                    "title": structured_object.title,
                    "publish_date": structured_object.publish_date or "N/A",
                    "status": structured_object.status_of_document or "N/A",
                    "source_field": source_field_name,
                    "language": lang,
                }
                item_id = f"w3c_{field}_{lang}_{i}"

                # Embedding and metadata are appended together so FAISS row
                # positions stay aligned with the metadata list.
                faiss_embeddings_in_memory[store_key].append(embedding_np)
                faiss_metadata_in_memory[store_key].append(metadata)

                _, chroma_collection_name = get_chroma_config(CHROMA_BASE_DIR, field, lang)
                chroma_ef = chroma_ef_map[lang]
                try:
                    collection = chroma_collections.get(chroma_collection_name)
                    if collection is None:
                        collection = chroma_client.get_or_create_collection(name=chroma_collection_name, embedding_function=chroma_ef, metadata={"hnsw:space": "l2"})
                        chroma_collections[chroma_collection_name] = collection
                    collection.upsert(embeddings=[embedding.tolist()], documents=[text_to_embed], metadatas=[metadata], ids=[item_id])
                    logging.debug(f"      => ChromaDB upsert 成功 ({chroma_collection_name})")
                    vector_stores_updated += 1
                except Exception as e_chroma:
                    logging.error(f"      錯誤： 儲存 ChromaDB 時發生錯誤 (Collection: {chroma_collection_name}, ID: {item_id}): {e_chroma}")
            except Exception as e_embed_save:
                logging.error(f"    錯誤： 嵌入或儲存向量時發生錯誤 (欄位='{field}', 語言='{lang}'): {e_embed_save}")

    if vector_stores_updated > 0:
        items_processed_successfully += 1
    logging.info(f"  => 為此項目更新了 {vector_stores_updated} 個向量儲存。")


end_main_loop_time = time.time()
logging.info(f"\n--- 主要處理循環完成，耗時: {end_main_loop_time - start_main_loop_time:.2f} 秒 ---")
logging.info(f"訊息： 共有 {items_processed_successfully} / {len(rows)} 筆資料成功完成向量儲存步驟。")  # Corrected total rows


# --- 6. Save Aggregated FAISS Data ---
# Writes one flat L2 index plus a JSON metadata sidecar for every
# (field, language) pair that collected at least one embedding in section 5.
logging.info("\n--- 6. 儲存聚合的 FAISS 資料 ---")
faiss_saved_count = 0
start_faiss_save_time = time.time()
for field in FIELDS_TO_VECTORIZE:
    for lang in LANGUAGES:
        key = f"{field}_{lang}"
        embeddings_list = faiss_embeddings_in_memory[key]
        metadata_list = faiss_metadata_in_memory[key]
        dim = faiss_dims[key]
        faiss_index_path, faiss_metadata_path = get_faiss_paths(FAISS_BASE_DIR, field, lang)

        # Nothing was embedded for this pair: log and move on.
        if not embeddings_list:
            logging.info(f"訊息： 跳過儲存空的 FAISS: 欄位='{field}', 語言='{lang}'")
            continue

        logging.info(f"訊息： 正在儲存 FAISS: 欄位='{field}', 語言='{lang}' ({len(embeddings_list)} 向量)...")
        try:
            # Stack the per-item (1, dim) rows into a single (n, dim) matrix.
            stacked = np.concatenate(embeddings_list, axis=0)
            all_embeddings_np = stacked.astype('float32')
            index = faiss.IndexFlatL2(dim)
            index.add(all_embeddings_np)
            faiss.write_index(index, faiss_index_path)
            # Row i of the index corresponds to entry i of the metadata list.
            with open(faiss_metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata_list, f, ensure_ascii=False, indent=2)
            logging.info(f"  => FAISS 儲存成功 (索引: {faiss_index_path})")
            faiss_saved_count += 1
        except Exception as e_faiss_save:
            logging.error(f"  錯誤： 儲存 FAISS 時發生錯誤 (欄位='{field}', 語言='{lang}'): {e_faiss_save}")
end_faiss_save_time = time.time()
logging.info(f"--- FAISS 資料儲存完成 ({faiss_saved_count} 個索引)，耗時: {end_faiss_save_time - start_faiss_save_time:.2f} 秒 ---")


# --- 7. Verification Query (Optional) ---
# Smoke test: runs one example query against the saved FAISS index and the
# Chroma collection for a single (field, language) pair and logs the top hits.
# Every failure here is logged and never aborts the script.
logging.info("\n--- 7. 執行驗證查詢 (範例) ---")
try:
    example_field = 'content_summary'
    example_lang = 'en'
    example_query = "DID Methods list"
    n_results = 2
    logging.info(f"測試查詢: 欄位='{example_field}', 語言='{example_lang}', 查詢='{example_query}'")

    faiss_index_path_ex, faiss_metadata_path_ex = get_faiss_paths(FAISS_BASE_DIR, example_field, example_lang)
    if os.path.exists(faiss_index_path_ex) and os.path.exists(faiss_metadata_path_ex):
        logging.info("--- 在 FAISS 中查詢範例 ---")
        loaded_index_ex = faiss.read_index(faiss_index_path_ex)
        with open(faiss_metadata_path_ex, 'r', encoding='utf-8') as f:
            loaded_metadata_ex = json.load(f)
        query_vector_ex = en_embedding_model.encode([example_query]).astype('float32')
        distances_ex, indices_ex = loaded_index_ex.search(query_vector_ex, n_results)
        logging.info(f"FAISS 查詢結果:")
        for rank, (idx, dist) in enumerate(zip(indices_ex[0], distances_ex[0]), start=1):
            # FAISS pads missing hits with label -1 when the index holds fewer
            # than n_results vectors; the lower bound keeps -1 from silently
            # selecting the last metadata entry.
            if 0 <= idx < len(loaded_metadata_ex):
                meta = loaded_metadata_ex[idx]
                logging.info(f"  Rank {rank}: Index={idx}, Dist={dist:.4f}, Title='{meta.get('title', 'N/A')}'")
            else:
                logging.warning(f"  Rank {rank}: Index={idx} out of bounds.")
    else:
        logging.warning(f"FAISS 範例索引/元資料未找到: {faiss_index_path_ex}")

    chroma_db_path_ex, chroma_collection_name_ex = get_chroma_config(CHROMA_BASE_DIR, example_field, example_lang)
    try:
        logging.info("--- 在 ChromaDB 中查詢範例 ---")
        client_load = chromadb.PersistentClient(path=CHROMA_BASE_DIR)
        # NOTE(review): list_collections() appears to return Collection objects
        # in some Chroma releases and plain names in others; accept both.
        existing_collections = [getattr(col, "name", col) for col in client_load.list_collections()]
        if chroma_collection_name_ex in existing_collections:
            collection_load = client_load.get_collection(name=chroma_collection_name_ex, embedding_function=en_ef)
            results = collection_load.query(query_texts=[example_query], n_results=n_results, include=['metadatas', 'distances'])
            logging.info(f"ChromaDB 查詢結果:")
            ids = results.get('ids', [[]])[0]
            dists = results.get('distances', [[]])[0]
            metas = results.get('metadatas', [[]])[0]
            for rank, (hit_id, hit_dist, hit_meta) in enumerate(zip(ids, dists, metas), start=1):
                logging.info(f"  Rank {rank}: ID={hit_id}, Dist={hit_dist:.4f}, Title='{hit_meta.get('title', 'N/A')}'")
        else:
            logging.warning(f"ChromaDB 範例 collection '{chroma_collection_name_ex}' 不存在。")
    except Exception as e:
        logging.warning(f"無法查詢 ChromaDB 範例 collection '{chroma_collection_name_ex}': {e}")
except Exception as e:
    logging.error(f"執行驗證查詢時發生錯誤: {e}")

logging.info("\n--- 腳本執行完畢 ---")

  from .autonotebook import tqdm as notebook_tqdm
2025-04-11 01:17:25,888 - INFO - 訊息： 將依序嘗試以下 Groq Llama 模型: ['meta-llama/llama-4-scout-17b-16e-instruct', 'meta-llama/llama-4-maverick-17b-128e-instruct', 'llama-3.3-70b-specdec', 'llama-3.3-70b-versatile', 'llama-3.2-90b-vision-preview', 'llama-3.2-11b-vision-preview', 'llama-3.1-8b-instant', 'llama3-70b-8192', 'llama3-8b-8192']
2025-04-11 01:17:25,891 - INFO - --- 開始執行 W3C 資料處理、翻譯與多向量儲存腳本 (逐項處理 + 模型回退) ---
2025-04-11 01:17:25,891 - INFO - --- 1. 載入環境變數與初始化 Groq Client ---
2025-04-11 01:17:25,892 - INFO - 訊息： .env 檔案已載入。
2025-04-11 01:17:25,893 - INFO - 訊息： 已找到 GROQ_API_KEY。
2025-04-11 01:17:25,926 - INFO - 訊息： Groq 客戶端已成功初始化。
2025-04-11 01:17:25,926 - INFO - --- 2. 連接 SQLite 資料庫並讀取資料 ---
2025-04-11 01:17:25,927 - INFO - 訊息： 正在嘗試連接資料庫：w3c_data.db...
2025-04-11 01:17:25,927 - INFO - 訊息： 資料庫連接成功。
2025-04-11 01:17:25,928 - INFO - 訊息： 已找到 'w3c_standards' 資料表。
2025-04-11 01:17:26,199 - INFO - 訊息： 已成功讀取 1134 筆資料。
2025-04-11 01:17:26,200 - IN