In [None]:
import os
import re
import warnings

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from pylab import mpl, plt
from skimage import io
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

from delta_trainer import generate_by_H_eos

# Notebook-wide display settings: silence all warnings, use the plain "white"
# seaborn style and render matplotlib text with the MiSans font.
warnings.filterwarnings('ignore')
sns.set_style("white")
mpl.rcParams['font.family'] = 'MiSans'

# Location of the local Qwen checkpoint. The default is the original
# machine-specific path; set the QWEN_MODEL_PATH environment variable to point
# at your own copy without editing the notebook.
model_path = os.environ.get("QWEN_MODEL_PATH", r"D:\pythonProject\DeepSeek\Recsys\AnimeLLMRec\Qwen3-0.6B")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Put the model on the GPU when one is available, otherwise stay on the CPU.
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from delta_trainer import train_delta_from_H, generate_by_H, evaluate_slot_ceval, evaluate_slot_ceval_eos, \
    evaluate_slot_ceval_eos_2

# Build a demo prompt and obtain its hidden states (H_state)
prompt = "请写一段关于AI教育的引言。"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Forward pass only: gradients are not needed to read the hidden states.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, return_dict=True)
# Keep the last entry of the hidden_states tuple returned by the model.
H = outputs.hidden_states[-1]

# Train one delta per step budget (3, 10 and 30 steps) on the same prompt / H
delta_3 = train_delta_from_H(model, tokenizer, prompt, H, step=3)
delta_10 = train_delta_from_H(model, tokenizer, prompt, H, step=10)
delta_30 = train_delta_from_H(model, tokenizer, prompt, H, step=30)


In [None]:
# generate_by_H(model=model, prompt=prompt, tokenizer=tokenizer, delta=delta_3, answer_len=200)

In [None]:
from datasets import get_dataset_config_names

# List every sub-dataset (config name) available under the local "./ceval-exam" path.
# `dataset_names` is reused by the evaluation sweep cells further down.
dataset_path = "./ceval-exam"
dataset_names = get_dataset_config_names(path=dataset_path)
dataset_names

In [None]:
from datasets import load_dataset

# Load one subject from the local C-Eval copy and show the first 'val' record
# as a quick look at the fields available on each example.
dataset = load_dataset(r"./ceval-exam", name="computer_network")
print(dataset['val'][0])

In [70]:
def evaluate_slot_ceval_eos(model, tokenizer, delta, example, max_len=20, verbose=True):
    """
    Evaluate one C-Eval single-choice question with ``generate_by_H_eos``.

    The prompt is built inside this function from ``example``; the generated
    text is then scanned for the predicted option letter.

    Args:
        model: forwarded unchanged to ``generate_by_H_eos``.
        tokenizer: forwarded unchanged to ``generate_by_H_eos``.
        delta: forwarded unchanged to ``generate_by_H_eos``.
        example: mapping with the keys 'question', 'A', 'B', 'C', 'D' and 'answer'.
        max_len: forwarded to ``generate_by_H_eos`` as ``answer_len``.
        verbose: when True, print the generated text.

    Returns:
        A 4-tuple ``(output_text, predict_option, example['answer'], is_correct)``.
        ``predict_option`` is one of 'A'/'B'/'C'/'D', or None when the generated
        text contains none of these letters.
    """
    prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。

题目：{example['question']}

选项：
A. {example['A']}
B. {example['B']}
C. {example['C']}
D. {example['D']}

答案是："""

    output_text = generate_by_H_eos(model, prompt, tokenizer, delta, answer_len=max_len)

    if verbose:
        print("🔍 模型生成结果:\n", output_text)

    # Pick the option letter that appears EARLIEST in the text. Checking
    # 'A', 'B', 'C', 'D' in that fixed order would report 'A' for an answer
    # such as "B。A 是错误的". A letter standing on its own is preferred over one
    # embedded in an ASCII word (the 'A' of "ARP"); if there is no standalone
    # letter, fall back to the first A-D character anywhere, so the result is
    # None only when the text has no A-D at all.
    # NOTE(review): this assumes generate_by_H_eos returns only the generated
    # continuation, not the prompt (which itself lists "A." .. "D.") — confirm.
    match = (re.search(r"(?<![A-Za-z])[ABCD](?![A-Za-z])", output_text)
             or re.search(r"[ABCD]", output_text))
    predict_option = match.group(0) if match else None

    is_correct = (predict_option == example['answer'])
    return output_text, predict_option, example['answer'], is_correct

In [None]:
# Per-question evaluation of a single subject on its 'val' split: for every
# question a fresh delta is trained, then the question is answered with it.
dataset_name = "computer_network"
dataset = load_dataset(r"./ceval-exam", name=dataset_name)

correct = 0
total = 0

# One row per question: [generated text, predicted option, gold answer, is_correct, subject]
answer_sheet = []
for ex in tqdm(dataset['val'], desc="Evaluating per-question delta"):
    # === Build the prompt for this question ===
    prompt = f"""以下是一道单项选择题，请你阅读题目，结合题目的知识背景，选择最合适的选项。
    题目：{ex['question']}
    
    选项：
    A. {ex['A']}
    B. {ex['B']}
    C. {ex['C']}
    D. {ex['D']}
    
    答案是："""

    # === Get H_state (last entry of the model's hidden_states) ===
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    H = outputs.hidden_states[-1]

    # === Train delta (30 steps in this cell) ===
    delta = train_delta_from_H(model, tokenizer, prompt, H, step=30)

    # === Inference and scoring ===
    # NOTE(review): with the call signature used here, evaluate_slot_ceval_eos
    # builds its own prompt from `ex`; its wording differs from the `prompt`
    # that delta was trained on above — confirm this mismatch is intended.
    pred, pre_answer, answer, is_correct = evaluate_slot_ceval_eos(model, tokenizer, delta, ex, max_len=20,
                                                                   verbose=False)
    correct += int(is_correct)
    total += 1
    answer_sheet.append([pred, pre_answer, answer, is_correct, dataset_name])
print(f"🎯 Accuracy (per-question delta): {correct}/{total} = {correct / total:.2%}")


In [75]:
def evaluate_slot_ceval_eos(model, tokenizer, delta, example, prompt=None, max_len=20, verbose=True):
    """
    Evaluate one C-Eval single-choice question with ``generate_by_H_eos``.

    Args:
        model: forwarded unchanged to ``generate_by_H_eos``.
        tokenizer: forwarded unchanged to ``generate_by_H_eos``.
        delta: forwarded unchanged to ``generate_by_H_eos``.
        example: mapping that provides at least 'answer'; when ``prompt`` is
            None it must also provide 'question', 'A', 'B', 'C' and 'D'.
        prompt: text to generate from. Optional so that callers written for
            the older signature without ``prompt`` keep working; when None, a
            default single-choice prompt is built from ``example``.
        max_len: forwarded to ``generate_by_H_eos`` as ``answer_len``.
        verbose: when True, print the generated text.

    Returns:
        A 4-tuple ``(output_text, predict_option, example['answer'], is_correct)``.
        ``predict_option`` is one of 'A'/'B'/'C'/'D', or None when the generated
        text contains none of these letters.
    """
    if prompt is None:
        prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。

题目：{example['question']}

选项：
A. {example['A']}
B. {example['B']}
C. {example['C']}
D. {example['D']}

答案是："""

    output_text = generate_by_H_eos(model, prompt, tokenizer, delta, answer_len=max_len)

    if verbose:
        print("🔍 模型生成结果:\n", output_text)

    # Pick the option letter that appears EARLIEST in the text. Checking
    # 'A', 'B', 'C', 'D' in that fixed order would report 'A' for an answer
    # such as "答案是 C，而不是 A". A standalone letter is preferred over one
    # embedded in an ASCII word; if none exists, fall back to the first A-D
    # character anywhere, so None is returned only when there is no A-D at all.
    # NOTE(review): this assumes generate_by_H_eos returns only the generated
    # continuation, not the prompt (which itself lists the options) — confirm.
    match = (re.search(r"(?<![A-Za-z])[ABCD](?![A-Za-z])", output_text)
             or re.search(r"[ABCD]", output_text))
    predict_option = match.group(0) if match else None

    is_correct = (predict_option == example['answer'])
    return output_text, predict_option, example['answer'], is_correct


def eval_dataset(dataset_name, step=3, max_len=50):
    """
    Evaluate one C-Eval subject on its 'val' split with a per-question delta.

    For every question a prompt is built, the model's last hidden state for
    that prompt is computed, a delta is trained from it, and the question is
    answered through ``evaluate_slot_ceval_eos``. Uses the notebook-level
    ``model`` and ``tokenizer``.

    Args:
        dataset_name: config name of the subject inside "./ceval-exam".
        step: forwarded to ``train_delta_from_H`` as ``step``.
        max_len: forwarded to ``evaluate_slot_ceval_eos`` as ``max_len``.

    Returns:
        A list with one row per question:
        ``[question, generated_text, predicted_option, gold_answer, is_correct, dataset_name]``.
    """
    dataset = load_dataset(r"./ceval-exam", name=dataset_name)

    correct = 0
    total = 0

    answer_sheet = []
    for ex in tqdm(dataset['val'], desc="Evaluating per-question delta"):
        # === Build the prompt for this question ===
        prompt = f"""以下是一道单项选择题，请你阅读题目，选择最合适的选项。
        题目：{ex['question']}
        选项：
        A. {ex['A']}
        B. {ex['B']}
        C. {ex['C']}
        D. {ex['D']}
        答案是："""
        # === Get H_state (last entry of the model's hidden_states) ===
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)
        H = outputs.hidden_states[-1]

        # === Train delta for `step` steps ===
        delta = train_delta_from_H(model, tokenizer, prompt, H, step=step)

        # === Inference and scoring (same prompt that delta was trained on) ===
        pred_txt, pre_answer, answer, is_correct = evaluate_slot_ceval_eos(model=model, tokenizer=tokenizer, delta=delta,
                                                                       example=ex, max_len=max_len, prompt=prompt,
                                                                       verbose=False)
        correct += int(is_correct)
        total += 1
        answer_sheet.append([ex['question'], pred_txt, pre_answer, answer, is_correct, dataset_name])

    # An empty split must not abort a multi-subject sweep with ZeroDivisionError;
    # report the accuracy as NaN in that case.
    accuracy = correct / total if total else float("nan")
    print(f"🎯 {dataset_name} Accuracy (per-question delta): {correct}/{total} = {accuracy:.2%}")
    return answer_sheet

In [None]:
# Number of subjects (config names) that the sweep cells below iterate over.
len(dataset_names)

In [ ]:
# Sweep every subject with step = 0 and collect all rows in one sheet.
step = 0
max_len = 50
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step, max_len=max_len)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Sweep every subject with step = 3 and collect all rows in one sheet.
step = 3
max_len = 50
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step, max_len=max_len)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Sweep every subject with step = 10 and collect all rows in one sheet.
step = 10
max_len = 50
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step, max_len=max_len)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Sweep every subject with step = 5 and collect all rows in one sheet.
step = 5
max_len = 50
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step, max_len=max_len)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Sweep every subject with step = 6 (eval_dataset's default max_len is used).
step = 6
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Sweep every subject with step = 30 (eval_dataset's default max_len is used).
step = 30
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Sweep every subject with step = 0 (eval_dataset's default max_len is used).
# NOTE(review): this writes to the same answer_step_0.csv as any other
# step = 0 sweep in this notebook, overwriting its result — confirm intended.
step = 0
answer_sheet = []
# to_csv does not create missing parent directories; create the output folder
# up front so the first save cannot fail after a whole subject was evaluated.
os.makedirs("./eval_result", exist_ok=True)
for i in tqdm(dataset_names[:]):
    answer_sheet += eval_dataset(i, step=step)
    # Re-save after every subject so partial results are already on disk.
    df_answer = pd.DataFrame(answer_sheet)
    df_answer.to_csv(f"./eval_result/answer_step_{step}.csv", index=False)

In [None]:
# Display the answer table produced by the most recently executed sweep cell.
df_answer

In [None]:
# Same rows, rebuilt from the raw `answer_sheet` list (columns are unnamed, so they show as 0..N).
pd.DataFrame(answer_sheet)

In [None]:
# from tqdm import tqdm
from tqdm.notebook import tqdm

correct = 0
total = 0

# Per-question delta (3 training steps) on the 'test' split of the subject
# currently held in `dataset`.
# NOTE(review): accuracy is only meaningful if this split carries gold labels
# in ex['answer'] — confirm for the local ./ceval-exam copy.
for ex in tqdm(dataset['test'], desc="Evaluating per-question delta"):
    # === Build the prompt for this question ===
    prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。

题目：{ex['question']}

选项：
A. {ex['A']}
B. {ex['B']}
C. {ex['C']}
D. {ex['D']}

答案是："""

    # === Get H_state (last entry of the model's hidden_states) ===
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
    H = outputs.hidden_states[-1]

    # === Train delta (3 steps) ===
    delta = train_delta_from_H(model, tokenizer, prompt, H, step=3)

    # === Inference and scoring ===
    # The evaluate_slot_ceval_eos definition in effect at this point of the
    # notebook takes the prompt as an argument and returns four values
    # (generated text, predicted option, gold answer, is_correct); unpacking
    # only two of them raised "too many values to unpack".
    pred, pre_answer, answer, is_correct = evaluate_slot_ceval_eos(model, tokenizer, delta, ex, prompt=prompt,
                                                                   max_len=200, verbose=False)
    print(pred, ex)
    correct += int(is_correct)
    total += 1

print(f"🎯 Accuracy (per-question delta): {correct}/{total} = {correct / total:.2%}")


In [None]:
def get_empty_delta(H_state):
    """
    Build an all-zero, trainable delta sized after ``H_state``.

    Args:
        H_state: tensor whose last dimension gives the hidden size and whose
            device is reused for the new tensor.

    Returns:
        An ``nn.Parameter`` of shape ``(1, 1, hidden_size)`` filled with zeros,
        with ``requires_grad`` enabled, located on ``H_state.device``.
    """
    zero_tensor = torch.zeros(
        (1, 1, H_state.shape[-1]),
        device=H_state.device,
        requires_grad=True,
    )
    return nn.Parameter(zero_tensor)


# Zero delta used by the following cells in place of a trained one.
# Only the last dimension and the device of H are used, so whichever H is
# currently in the kernel works as long as it came from the same model.
delta_empty = get_empty_delta(H_state=H)
delta_empty.shape

In [None]:
from tqdm.notebook import tqdm

correct = 0
total = 0

# Same evaluation on the 'test' split, but with the all-zero `delta_empty`
# instead of a per-question trained delta.
# NOTE(review): accuracy is only meaningful if this split carries gold labels
# in ex['answer'] — confirm for the local ./ceval-exam copy.
for ex in tqdm(dataset['test'], desc="Evaluating per-question delta"):
    # === Build the prompt for this question ===
    prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。
    
    题目：{ex['question']}
    
    选项：
    A. {ex['A']}
    B. {ex['B']}
    C. {ex['C']}
    D. {ex['D']}
    
    答案是："""

    # === Inference and scoring ===
    # The evaluate_slot_ceval_eos definition in effect at this point of the
    # notebook takes the prompt as an argument and returns four values
    # (generated text, predicted option, gold answer, is_correct); unpacking
    # only two of them raised "too many values to unpack".
    pred, pre_answer, answer, is_correct = evaluate_slot_ceval_eos(model, tokenizer, delta_empty, ex, prompt=prompt,
                                                                   max_len=20, verbose=True)
    correct += int(is_correct)
    total += 1

print(f"🎯 Accuracy (per-question delta): {correct}/{total} = {correct / total:.2%}")


In [None]:
from delta_trainer import generate_by_H_eos


def evaluate_slot_ceval_eos_2(model, tokenizer, delta, example, max_len=20, verbose=True):
    """
    Evaluate one C-Eval single-choice question with ``generate_by_H_eos``.

    The prompt is built inside this function from ``example``.

    Args:
        model: forwarded unchanged to ``generate_by_H_eos``.
        tokenizer: forwarded unchanged to ``generate_by_H_eos``.
        delta: forwarded unchanged to ``generate_by_H_eos``.
        example: mapping with the keys 'question', 'A', 'B', 'C', 'D' and 'answer'.
        max_len: forwarded to ``generate_by_H_eos`` as ``answer_len``.
        verbose: when True, print the generated text and the
            predicted / gold option pair.

    Returns:
        A 3-tuple ``(predict_option, predict_option, example['answer'])``; the
        predicted option appears twice, exactly as callers already unpack it.
        ``predict_option`` is one of 'A'/'B'/'C'/'D', or None when the generated
        text contains none of these letters.
    """
    prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。

题目：{example['question']}

选项：
A. {example['A']}
B. {example['B']}
C. {example['C']}
D. {example['D']}

答案是："""

    output_text = generate_by_H_eos(model, prompt, tokenizer, delta, answer_len=max_len)

    # Pick the option letter that appears EARLIEST in the text. Checking
    # 'A', 'B', 'C', 'D' in that fixed order would report 'A' for an answer
    # such as "选 D，A 不对". A standalone letter is preferred over one embedded
    # in an ASCII word; if none exists, fall back to the first A-D character
    # anywhere, so None is returned only when there is no A-D at all.
    # NOTE(review): this assumes generate_by_H_eos returns only the generated
    # continuation, not the prompt (which itself lists the options) — confirm.
    match = (re.search(r"(?<![A-Za-z])[ABCD](?![A-Za-z])", output_text)
             or re.search(r"[ABCD]", output_text))
    predict_option = match.group(0) if match else None

    if verbose:
        print("🔍 模型生成结果:\n", output_text)
        # Debug trace of prediction vs. gold answer; honours `verbose` so a
        # silent evaluation loop is not flooded with one line per question.
        print(predict_option, example['answer'])

    return predict_option, predict_option, example['answer']


In [None]:
from tqdm.notebook import tqdm

correct = 0
total = 0

# Zero-delta evaluation on the 'test' split through evaluate_slot_ceval_eos_2.
# NOTE(review): accuracy is only meaningful if this split carries gold labels
# in ex['answer'] — confirm for the local ./ceval-exam copy.
for ex in tqdm(dataset['test'], desc="Evaluating per-question delta"):
    # === Build the prompt for this question ===
    # NOTE(review): this `prompt` is not passed to evaluate_slot_ceval_eos_2
    # below; the call only receives `ex`.
    prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。
    
    题目：{ex['question']}
    
    选项：
    A. {ex['A']}
    B. {ex['B']}
    C. {ex['C']}
    D. {ex['D']}
    
    答案是："""

    # === Inference and scoring ===
    pred, predict, truth = evaluate_slot_ceval_eos_2(model, tokenizer, delta_empty, ex, max_len=20, verbose=False)
    # print(predict,truth)
    # Correctness is decided here by comparing the predicted option with the gold answer.
    correct += int(predict == truth)
    total += 1

print(f"🎯 Accuracy (per-question delta): {correct}/{total} = {correct / total:.2%}")


In [None]:
# Print the fully rendered prompt for every 'val' question (inspection only;
# no model call is made in this cell).
for ex in tqdm(dataset['val'], desc="Evaluating per-question delta"):
    # === Build the prompt for this question ===
    prompt = f"""以下是一道单项选择题，请你阅读题目并选择最合适的选项。

题目：{ex['question']}

选项：
A. {ex['A']}
B. {ex['B']}
C. {ex['C']}
D. {ex['D']}

答案是："""
    print(prompt)