# Kaggle eval (VNPT-only: LLM API + Embedding API)

Yêu cầu:
- Có `api-keys.json` (bạn upload vào Kaggle Dataset hoặc tạo trong `/kaggle/working`).
- KB JSONL nằm trong `/kaggle/input/data-vnpt-ai`.

Notebook sẽ:
1) Cài dependency.
2) (Tuỳ chọn) build `kb_vnpt_embedding_index.pkl` bằng Embedding API.
3) Convert `val.csv/test.csv` -> JSON format cho `predict.py`.
4) Chạy `predict.py` và xuất `submission.csv`.

In [None]:
# Install the packages listed in requirements.txt, plus pandas (imported by the
# cells below). `!` is the IPython shell escape; -q keeps pip's output quiet.
!pip install -q -r requirements.txt pandas

In [None]:
import os

# ==== Paths ====
# Location of the API credentials file read by build_index.py / predict.py.
API_KEYS = "/kaggle/working/api-keys.json"  # or point to /kaggle/input/... if you uploaded it as a Dataset
KB_DIR = "/kaggle/input/data-vnpt-ai"       # directory containing the *.jsonl files

# Placeholder dataset paths: edit both to match your own dataset.
VAL_CSV = "/kaggle/input/your-dataset/val.csv"   # adjust
TEST_CSV = "/kaggle/input/your-dataset/test.csv" # adjust

# Path of the embedding index; the build cell creates it when it is missing.
VNPT_INDEX = "/kaggle/working/kb_vnpt_embedding_index.pkl"
# Forwarded to predict.py as --top_k_retrieval.
TOP_K = 6
# Forwarded to predict.py as --llm_model.
LLM_MODEL = "large"   # small|large|auto

In [None]:
# (Optional) Build the VNPT embedding index if it does not exist yet.
# Note: building the index consumes embedding quota (1 request per chunk).

import os
import shlex
import subprocess

if not os.path.exists(VNPT_INDEX):
    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters cannot break or alter the command.
    argv = [
        "python", "build_index.py",
        "--kb_dir", str(KB_DIR),
        "--out", str(VNPT_INDEX),
        "--resume",
        "--sleep_s", "0.12",
    ]
    # Shell-equivalent form of the call, printed so the run can be reproduced.
    cmd = (
        f"API_KEYS_PATH={shlex.quote(str(API_KEYS))} "
        + " ".join(shlex.quote(arg) for arg in argv)
    )
    print(cmd)
    # The credentials path is handed over through the environment, as before.
    result = subprocess.run(argv, env={**os.environ, "API_KEYS_PATH": str(API_KEYS)})
    if result.returncode != 0:
        # Stop here instead of letting later cells run without a usable index.
        raise RuntimeError(
            f"build_index.py failed with exit code {result.returncode}"
        )
else:
    print("Found existing index:", VNPT_INDEX)

In [None]:
# Convert CSV -> JSON input for predict.py
import json
import pandas as pd

def _pick_column(df, targets):
    """Return the first column of *df* whose lower-cased name equals a target.

    Targets are tried in the order given, so earlier entries take priority
    over later ones; within one target, columns are scanned in ``df.columns``
    order. Targets are compared as-is, so they are expected to be lower-case.

    Args:
        df: Object exposing a ``columns`` iterable of string column names.
        targets: Candidate lower-case column names, most preferred first.

    Returns:
        The matching column name with its original casing, or ``None`` when
        no column matches any target.
    """
    for wanted in targets:
        found = next(
            (col for col in df.columns if col.lower() == wanted),
            None,
        )
        if found is not None:
            return found
    return None


def convert_csv_to_json(csv_path: str, out_path: str):
    """Convert a multiple-choice CSV into the JSON input file for predict.py.

    Columns are matched case-insensitively:
      * question id: ``qid`` or ``id`` (falls back to the row index),
      * question text: ``question`` or ``prompt`` (required),
      * choices: columns named ``a``..``f``, otherwise columns whose name
        starts with ``option``; empty cells are skipped,
      * gold label (optional): ``answer`` or ``label``.

    Args:
        csv_path: Path of the CSV file to read.
        out_path: Path of the JSON file to write (UTF-8, a list of
            ``{"qid", "question", "choices"}`` records).

    Returns:
        A tuple ``(out_path, answers)`` where ``answers`` maps qid to the
        stripped, upper-cased label, or is ``None`` when no label was found.

    Raises:
        ValueError: If the question column or the choice columns are missing.
    """
    df = pd.read_csv(csv_path)
    qid_col = _pick_column(df, ["qid", "id"])
    q_col = _pick_column(df, ["question", "prompt"])
    if q_col is None:
        raise ValueError("Không tìm thấy cột question trong CSV")

    choice_cols = [c for c in df.columns if c.lower() in ("a", "b", "c", "d", "e", "f")]
    if not choice_cols:
        choice_cols = [c for c in df.columns if c.lower().startswith("option")]
    if not choice_cols:
        raise ValueError("Không tìm thấy cột lựa chọn (A/B/C/D hoặc option*)")

    ans_col = _pick_column(df, ["answer", "label"])

    records = []
    answers = {}
    for idx, row in df.iterrows():
        qid = str(row[qid_col]) if qid_col is not None else str(idx)
        # Rows may have fewer options than there are choice columns.
        choices = [str(row[c]) for c in choice_cols if not pd.isna(row[c])]
        records.append({"qid": qid, "question": str(row[q_col]), "choices": choices})

        # A missing label must be skipped: str(NaN) would otherwise be stored
        # as the bogus answer "NAN" and counted as a wrong prediction.
        if ans_col is not None and not pd.isna(row[ans_col]):
            label = str(row[ans_col]).strip().upper()
            if label:
                answers[qid] = label

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return out_path, (answers if answers else None)

# Convert both splits to JSON under /kaggle/working. Only the validation
# labels are kept; whatever labels test.csv may contain are discarded.
val_json, val_answers = convert_csv_to_json(VAL_CSV, "/kaggle/working/val.json")
test_json, _ = convert_csv_to_json(TEST_CSV, "/kaggle/working/test.json")
print("val_json:", val_json)
print("test_json:", test_json)

In [None]:
# Run predict.py on the validation split -> /kaggle/working/val_pred.csv
import os
import shlex
import subprocess

# Arguments are passed as a list (no shell), so paths containing spaces or
# shell metacharacters cannot break or alter the command.
argv = [
    "python", "predict.py",
    "--input", "/kaggle/working/val.json",
    "--output", "/kaggle/working/val_pred.csv",
    "--api_keys", str(API_KEYS),
    "--vnpt_index", str(VNPT_INDEX),
    "--llm_model", str(LLM_MODEL),
    "--top_k_retrieval", str(TOP_K),
]
# Shell-equivalent form of the call, printed so the run can be reproduced.
cmd = (
    f"API_KEYS_PATH={shlex.quote(str(API_KEYS))} "
    + " ".join(shlex.quote(arg) for arg in argv)
)
print(cmd)
result = subprocess.run(argv, env={**os.environ, "API_KEYS_PATH": str(API_KEYS)})
if result.returncode != 0:
    # Fail loudly so a stale or missing val_pred.csv is never evaluated.
    raise RuntimeError(f"predict.py (val) failed with exit code {result.returncode}")

In [None]:
# Quick eval: accuracy on the validation split (only when val.csv had labels).
import pandas as pd

if 'val_answers' in globals() and val_answers:
    pred = pd.read_csv('/kaggle/working/val_pred.csv')
    # The gold labels were stripped and upper-cased during conversion, so the
    # predictions get the same normalisation before comparing. A missing
    # prediction becomes "" and therefore never matches a gold label.
    pred_answers = pred["answer"].fillna("").astype(str).str.strip().str.upper()
    pred_dict = dict(zip(pred["qid"].astype(str), pred_answers))
    total = len(val_answers)
    # Questions absent from the prediction file count as wrong.
    correct = sum(
        1 for qid, gold in val_answers.items() if pred_dict.get(qid, '') == gold
    )
    print(f"Accuracy: {correct}/{total} = {correct/total:.4f}")
else:
    print('No labels in val.csv')

In [None]:
# Run predict.py on the test split -> /kaggle/working/submission.csv
import os
import shlex
import subprocess

import pandas as pd

# Arguments are passed as a list (no shell), so paths containing spaces or
# shell metacharacters cannot break or alter the command.
argv = [
    "python", "predict.py",
    "--input", "/kaggle/working/test.json",
    "--output", "/kaggle/working/submission.csv",
    "--api_keys", str(API_KEYS),
    "--vnpt_index", str(VNPT_INDEX),
    "--llm_model", str(LLM_MODEL),
    "--top_k_retrieval", str(TOP_K),
]
# Shell-equivalent form of the call, printed so the run can be reproduced.
cmd = (
    f"API_KEYS_PATH={shlex.quote(str(API_KEYS))} "
    + " ".join(shlex.quote(arg) for arg in argv)
)
print(cmd)
result = subprocess.run(argv, env={**os.environ, "API_KEYS_PATH": str(API_KEYS)})
if result.returncode != 0:
    # Fail loudly so a stale or missing submission.csv is never previewed.
    raise RuntimeError(f"predict.py (test) failed with exit code {result.returncode}")

# Preview the first rows of the generated submission.
print(pd.read_csv('/kaggle/working/submission.csv').head())