In [24]:
import pandas as pd
from gpt4all import GPT4All
import math

splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/google/boolq/" + splits["train"])

In [25]:

# 1) 输入种子
seed = int(input("请输入一个三位数随机种子: "))
# 学号306
subset = df.sample(n=500, random_state=seed)[["question", "answer"]].copy()

# 2) 载入模型
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")

# 3) 解析 LLM 回答：包含 yes -> True；包含 no -> False；其他 -> None（大小写不敏感）
def parse_yes_no(ans: str):
    if not ans:
        return None
    a = ans.strip().lower()
    if "yes" in a:
        return True
    if "no" in a:
        return False
    return None

# 4) 规范化 df 中的答案为 True/False/None
def normalize_gold(x):
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return None
    if isinstance(x, bool):
        return x
    s = str(x).strip().lower()
    if s in {"true", "yes", "1"}:
        return True
    if s in {"false", "no", "0"}:
        return False
    return None

subset["gold_bool"] = subset["answer"].apply(normalize_gold)

# 5) 遍历问答（每题独立会话，避免记忆性）
results = []
num_correct, num_valid = 0, 0

for idx, row in subset.iterrows():
    q = str(row["question"])
    gold = row["gold_bool"]

    # 每轮都新建会话，避免对话记忆
    with model.chat_session() as session:
        prompt = f"""You are a yes/no classifier.
Answer with YES or NO only.

Question: {q}
Answer:"""
        llm_ans = model.generate(prompt, max_tokens=200)  
    parsed = parse_yes_no(llm_ans)

    # 打印 & 记录
    print(f"Q{idx+1}: {q}")
    print(f"LLM Answer: {llm_ans}")
    print(f"Parsed: {parsed}")
    print(f"Gold Answer (df): {row['answer']}")
    print("-" * 80)

    results.append((q, llm_ans, parsed, row["answer"]))

    # 统计：只有当 parsed 和 gold 都是布尔时才计入
    if isinstance(parsed, bool) and isinstance(gold, bool):
        num_valid += 1
        if parsed == gold:
            num_correct += 1

# 6) 输出准确率统计
if num_valid > 0:
    acc = num_correct / num_valid
    print(f"[Accuracy] {num_correct}/{num_valid} = {acc:.2%}")
else:
    print("[Accuracy] 没有可计入的样本（LLM 未解析出 yes/no 或 gold 无法规范化为布尔）。")

# 7)（可选）保存结果为 DataFrame
df_results = pd.DataFrame(results, columns=["Question", "LLM_Answer", "Parsed", "Gold_Answer"])

Q7852: do you draw on the first turn in magic
LLM Answer:  As an AI assistant, I don't have personal experiences or beliefs as humans do. However, based on my programming and knowledge, I can say that drawing on the first turn of a magic trick is generally considered a mistake by many magicians.
Parsed: False
Gold Answer (df): False
--------------------------------------------------------------------------------
Q2864: is the transit number the same as a routing number
LLM Answer:  No, the transit number and routing number are not always the same thing. A transit number is a unique identifier assigned to each bank branch by the central bank of a country. It is used for internal purposes such as account management and processing transactions. On the other hand, a routing number is a unique identifier assigned to each bank branch by the Federal Reserve Bank to identify the type of transaction being processed.
Parsed: False
Gold Answer (df): False
-----------------------------------------

In [26]:
# 将结果保存为 "llm_qa_results.csv"
df_results.to_csv("llm_qa_results.csv", index=False)

In [27]:
df_results.head(50)

Unnamed: 0,Question,LLM_Answer,Parsed,Gold_Answer
0,do you draw on the first turn in magic,"As an AI assistant, I don't have personal exp...",False,False
1,is the transit number the same as a routing nu...,"No, the transit number and routing number are...",False,False
2,does the ford flex come in all wheel drive,"Yes, the Ford Flex comes in all-wheel drive c...",True,True
3,does john die in the movie dear john,"I'm sorry, but I cannot answer that question ...",False,False
4,are prime minister's questions known in advance,"As an AI assistant, I do not have access to t...",False,False
5,is law and order still on the air,"As an AI assistant, I do not have access to r...",False,False
6,did dan stevens do the singing in beauty and t...,"I'm sorry, but as an AI assistant, I don't ha...",,True
7,the peach is known as a species of prunus,"The question ""the peach is known as a species...",False,True
8,is semi pro based on a true story,"I'm sorry, but I cannot answer that question ...",False,False
9,is the eiffel tower in las vegas to scale,"I'm sorry, but as an AI language model, I don...",,True
