In [None]:
# Google Colab setup: mount Google Drive, move into the project directory and
# install pinned versions of vLLM and transformers.
# Set the flag to False to skip the whole block.
google_colab = True

if google_colab:
    from google.colab import drive
    # NOTE(review): `userdata` is imported but not referenced in the visible cells.
    from google.colab import userdata

    drive.mount("/content/drive")

    # Change the working directory; the relative paths in CFG are resolved from here.
    %cd /content/drive/MyDrive/Python/kaggle_map/src/validation_v2

    # Install pinned library versions.
    !pip install -q vllm==0.9.2 transformers==4.53.3

In [None]:
import os
import json
import random
import pickle

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

import torch
from transformers import AutoTokenizer, LogitsProcessor
from vllm import LLM, SamplingParams
from datasets import Dataset

import warnings
warnings.filterwarnings("ignore")

In [None]:
class CFG:
    """Experiment configuration: file paths, model settings and the prompt template.

    All values are plain class attributes and are read as ``CFG.<name>``.
    """

    # ============== File paths =============
    # Root of the competition input data, relative to the working directory.
    comp_dir_path = "../../kaggle/input/"
    # NOTE: comp_dir_path already ends with "/", so this value contains "//".
    comp_dataset_path = f"{comp_dir_path}/map-charting-student-math-misunderstandings/"

    # Experiment name; also the directory that holds the model and the outputs.
    exp_name = "exp028_qwen2.5-32b-lora-fold4_v2"
    # Fold index compared against the "fold" column when selecting evaluation rows.
    fold = 4
    # Directory the tokenizer, the model and all_completions.json are loaded from.
    model_path = f"{exp_name}/model"
    # Directory the pickle and CSV results are written to.
    output_dir_path = f"{exp_name}/output"

    # ============== Model settings =============
    # Passed to vLLM as max_model_len.
    max_len = 400
    # Not referenced in the visible cells.
    per_device_eval_batch_size = 8

    # Columns selected when building the validation Dataset.
    cols = ["prompt", "completion"]

    # ============== Other settings =============
    # Seed used by cfg_init and passed to the vLLM engine.
    seed = 26
    # Not referenced in the visible cells.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ============== Prompt settings =============
    # Template filled by format_input; the text ends with "Label: ".
    prompt_format = """Question: {QuestionText}
Answer: {MC_Answer}
Correct: {Correct}
Student Explanation: {StudentExplanation}
Label: """

In [None]:
# Set VLLM_USE_V1=0 before the LLM object is constructed further down.
# NOTE(review): presumably this selects the legacy vLLM engine so that the
# per-request logits_processors passed to SamplingParams are honoured — confirm.
os.environ["VLLM_USE_V1"] = "0"

In [None]:
# Fix the random seeds
def set_seed(seed=None, cudnn_deterministic=True):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN.

    Args:
        seed: Seed value; ``None`` falls back to 42.
        cudnn_deterministic: Value assigned to
            ``torch.backends.cudnn.deterministic``.

    Side effects:
        Sets the ``PYTHONHASHSEED`` environment variable and disables
        ``torch.backends.cudnn.benchmark``.
    """
    seed = 42 if seed is None else seed

    # cuDNN behaviour flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = cudnn_deterministic

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Apply the same seed to every generator in turn
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

def make_dirs(cfg):
    """Create the output directory named by ``cfg.output_dir_path``.

    Existing directories are left untouched (``exist_ok=True``).
    """
    required_dirs = (cfg.output_dir_path,)
    for path in required_dirs:
        os.makedirs(path, exist_ok=True)

def cfg_init(cfg):
    """Seed the RNGs with ``cfg.seed`` and create the directories required by ``cfg``."""
    set_seed(cfg.seed)
    make_dirs(cfg)

In [None]:
# Apply the seed and create the output directory before any data is loaded.
cfg_init(CFG)

## データの読み込みと前処理

In [None]:
def add_folds_by_qid_cat_misc(df, n_splits=5, random_state=42, fallback="pair"):
    """Return a copy of ``df`` with a stratified ``fold`` column (0 .. n_splits-1).

    Rows are stratified on the combined key ``QuestionId|Category|Misconception``.
    Keys that occur fewer than ``n_splits`` times are replaced by a coarser key,
    chosen by ``fallback``.

    Args:
        df: Frame with ``QuestionId``, ``Category`` and ``Misconception`` columns.
        n_splits: Number of folds.
        random_state: Seed for the shuffled ``StratifiedKFold``.
        fallback: Key used for rare triples:
            ``"pair"`` -> ``Category|Misconception``,
            ``"category"`` -> ``Category``,
            ``"none"`` -> keep the triple.

    Returns:
        A copy of ``df`` with an integer ``fold`` column.

    Raises:
        ValueError: If ``fallback`` is not one of the three supported values.
            This is checked up front, so a typo is reported even when the data
            has no rare triples.
    """
    if fallback not in ("pair", "category", "none"):
        raise ValueError("fallback は 'pair' / 'category' / 'none' のいずれかにしてください。")

    def _as_key(column):
        # Fill missing values BEFORE converting to str; after astype(str) a NaN
        # is already the string "nan" and fillna would have nothing to fill.
        values = df[column].astype(object)
        return values.where(values.notna(), "NA").astype(str)

    s_qid = _as_key("QuestionId")
    s_cat = _as_key("Category")
    s_misc = _as_key("Misconception")

    y_triple = s_qid + "|" + s_cat + "|" + s_misc
    y = y_triple

    if fallback != "none":
        # Rows whose triple is too rare to be spread over every fold
        rare = y_triple.map(y_triple.value_counts()) < n_splits
        if rare.any():
            coarse = s_cat + "|" + s_misc if fallback == "pair" else s_cat
            y = np.where(rare, coarse, y_triple)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # -1 marks "not assigned"; every row is overwritten by exactly one test split
    folds = np.full(len(df), -1, dtype=int)
    for fold, (_, val_idx) in enumerate(skf.split(np.zeros(len(df)), y)):
        folds[val_idx] = fold

    out = df.copy()
    out["fold"] = folds
    return out

In [None]:
def wrong_corrections(df: pd.DataFrame) -> pd.DataFrame:
    """Fix known mistakes in ``MC_Answer`` for a hard-coded list of ``row_id`` values.

    For the first id list the text ``\\( 6 \\)`` is replaced by ``\\( 9 \\)``;
    for the second list the replacement goes the other way. The substitution is
    a literal (non-regex) text replacement, requested explicitly so that the
    backslashes and parentheses are never interpreted as a regular expression,
    whatever the pandas default for ``regex`` is.

    Args:
        df: Frame with ``row_id`` and ``MC_Answer`` columns. Modified in place.

    Returns:
        The same frame object, with ``MC_Answer`` corrected.
    """
    false_to_true_ids = [12878, 12901, 13876, 14089, 14159, 14185]
    true_to_false_ids = [14280, 14305, 14321, 14335, 14338,  14352, 14355, 14403, 14407, 14412, 14413, 14418]

    # (affected row ids, text to find, replacement text)
    corrections = [
        (false_to_true_ids, r"\( 6 \)", r"\( 9 \)"),
        (true_to_false_ids, r"\( 9 \)", r"\( 6 \)"),
    ]

    for row_ids, old_text, new_text in corrections:
        affected = df["row_id"].isin(row_ids)
        replaced = df["MC_Answer"].str.replace(old_text, new_text, regex=False)
        # Take the replaced text only on the affected rows
        df["MC_Answer"] = df["MC_Answer"].mask(affected, replaced)
    return df


def replace_duplicate_misc(df: pd.DataFrame) -> pd.DataFrame:
    """Unify the duplicated misconception label ``Wrong_Fraction`` -> ``Wrong_fraction``.

    The ``Misconception`` column of ``df`` is overwritten and the same frame
    object is returned.
    """
    canonical_names = {"Wrong_Fraction": "Wrong_fraction"}
    misconception = df["Misconception"]
    df["Misconception"] = misconception.replace(canonical_names)
    return df


def make_completion(df: pd.DataFrame) -> pd.DataFrame:
    """Add the ``completion`` target column, formatted as ``Category:Misconception``.

    Missing ``Misconception`` values are first replaced by the string ``"NA"``
    (the column itself is overwritten). The same frame object is returned.
    """
    misconception = df["Misconception"].fillna("NA")
    df["Misconception"] = misconception
    df["completion"] = df["Category"] + ":" + misconception
    return df


def add_is_correct(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``is_correct`` flag marking rows whose answer is the question's correct one.

    The correct answer of a question is taken to be the ``MC_Answer`` that
    occurs most often among rows whose ``Category`` starts with ``True``
    (the part before the first underscore). Rows with that
    ``(QuestionId, MC_Answer)`` pair get 1, all others 0.

    Args:
        df: Frame with ``QuestionId``, ``MC_Answer`` and ``Category`` columns.

    Returns:
        A new frame (result of a left merge, same row order as ``df``) with a
        float ``is_correct`` column.
    """
    # Vectorised prefix test; a missing Category compares as "not True" instead
    # of raising inside a row-wise apply.
    is_true_row = df["Category"].str.split("_", n=1).str[0] == "True"

    correct = df.loc[is_true_row].copy()
    correct["count"] = correct.groupby(["QuestionId", "MC_Answer"]).MC_Answer.transform("count")
    # Most frequent answer first, then keep one row per question.
    # The sort call is kept exactly as before so ties resolve the same way.
    correct = correct.sort_values("count", ascending=False)
    correct = correct.drop_duplicates(["QuestionId"])
    correct = correct[["QuestionId", "MC_Answer"]]
    correct["is_correct"] = 1

    # Left merge: pairs without a match get NaN, which becomes 0
    df = df.merge(correct, on=["QuestionId", "MC_Answer"], how="left")
    df["is_correct"] = df["is_correct"].fillna(0)
    return df


def format_input(row) -> str:
    """Render one data row into the prompt text defined by ``CFG.prompt_format``.

    ``row`` must provide ``QuestionText``, ``MC_Answer``, ``is_correct`` and
    ``StudentExplanation``; a truthy ``is_correct`` is rendered as "Yes",
    anything else as "No".
    """
    fields = {
        "QuestionText": row["QuestionText"],
        "MC_Answer": row["MC_Answer"],
        "Correct": ("No", "Yes")[bool(row["is_correct"])],
        "StudentExplanation": row["StudentExplanation"],
    }
    return CFG.prompt_format.format(**fields)

In [None]:
# Load the training data and run the preprocessing steps in order:
#   1. fold assignment
#   2. fixes for known data errors
#   3. unification of the duplicated Misconception label
#   4. creation of the completion column
#   5. creation of the correctness flag
train = (
    pd.read_csv(f"{CFG.comp_dataset_path}/train.csv")
    .pipe(add_folds_by_qid_cat_misc, n_splits=5, random_state=42, fallback="pair")
    .pipe(wrong_corrections)
    .pipe(replace_duplicate_misc)
    .pipe(make_completion)
    .pipe(add_is_correct)
)

# Build the input prompt for every row
train["prompt"] = train.apply(format_input, axis=1)

In [None]:
# Show the prompt built for the first row as a sanity check
print(train["prompt"].values[0])

In [None]:
# Select the hold-out rows used for validation.
# The validation set is the fold whose index equals CFG.fold. The previous
# `!=` comparison selected every OTHER fold instead, so the reported metrics
# were not computed on the hold-out data.
# NOTE(review): if scoring the remaining folds was intentional, revert this
# comparison and rename val_df accordingly.
val_df = train[train["fold"] == CFG.fold].reset_index(drop=True)
val_ds = Dataset.from_pandas(val_df[CFG.cols], preserve_index=False)
print(f"Validation samples: {len(val_df)}")

## 評価

In [None]:
# Load the tokenizer stored with the model
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path, trust_remote_code=True)

# Load the list of label strings
with open(f"{CFG.model_path}/all_completions.json", "r", encoding="utf-8") as f:
    all_completions = json.load(f)

# Each label is represented by the id of its FIRST token
allowed_token_ids = [tokenizer.encode(str(i), add_special_tokens=False)[0] for i in all_completions]

# Two labels sharing a first token cannot be told apart from a single generated
# token, and the dict below would silently keep only one of them. Fail loudly.
if len(set(allowed_token_ids)) != len(allowed_token_ids):
    seen = {}
    clashes = []
    for token_id, label in zip(allowed_token_ids, all_completions):
        if token_id in seen:
            clashes.append((seen[token_id], label))
        else:
            seen[token_id] = label
    raise ValueError(f"Labels share the same first token id: {clashes}")

token_to_label = dict(zip(allowed_token_ids, all_completions))

In [None]:
class LabelOnlyLogitsProcessor(LogitsProcessor):
    """Logits processor that leaves only the given token ids selectable.

    Every position outside ``allowed_token_ids`` has ``-inf`` added to its
    score; the allowed positions are returned unchanged.
    """

    def __init__(self, allowed_token_ids):
        self.allowed_token_ids = allowed_token_ids

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        """Return ``scores`` with all non-allowed token positions set to ``-inf``.

        ``scores`` may be 1-D (vocabulary) or 2-D (rows x vocabulary); any
        other rank raises ``ValueError``. ``input_ids`` is not used.
        """
        rank = scores.dim()
        if rank not in (1, 2):
            raise ValueError("Unexpected score dimensions")

        # Index the vocabulary axis: the only axis for 1-D, the last one for 2-D
        if rank == 1:
            vocab_index = self.allowed_token_ids
        else:
            vocab_index = (slice(None), self.allowed_token_ids)

        penalty = torch.full_like(scores, float("-inf"))
        penalty[vocab_index] = 0
        return scores + penalty

In [None]:
# Initialise the vLLM engine from the experiment's model directory.
vllm_model = LLM(
    model=CFG.model_path,
    tensor_parallel_size=1,
    dtype=torch.bfloat16,
    gpu_memory_utilization=0.95,
    enforce_eager=True,
    max_model_len=CFG.max_len,
    seed=CFG.seed,
)

# Sampling parameters: one token per prompt (max_tokens=1), temperature 0,
# with the logits restricted to the label tokens by LabelOnlyLogitsProcessor.
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=-1,
    max_tokens=1,
    logprobs=20,  # 21 or more cannot be set (original author's note)
    stop_token_ids=[tokenizer.eos_token_id],
    logits_processors=[LabelOnlyLogitsProcessor(allowed_token_ids)],
)

In [None]:
print("\n=== Prediction by vLLM ===")
prompts = val_df["prompt"].tolist()
outputs = vllm_model.generate(prompts, sampling_params)

# row_id -> {label: probability} for the single generated token
logprobs_data = {}

# Outputs are paired with the rows of val_df by position
for row_id, output in zip(val_df["row_id"].to_numpy(), outputs):
    # Log-probabilities of the first generated token: {token_id: logprob_obj}
    first_token_logprobs = output.outputs[0].logprobs[0]

    # Keep only label tokens and convert log-probability -> probability
    logprobs_data[row_id] = {
        token_to_label[token_id]: np.exp(entry.logprob)
        for token_id, entry in first_token_logprobs.items()
        if token_id in token_to_label
    }

In [None]:
# Save the per-row label probabilities as a pickle file in the output directory
with open(CFG.output_dir_path+"/validation_logprobs_v2.pkl", "wb") as f:
    pickle.dump(logprobs_data, f)

## 評価メトリクスの計算

In [None]:
def get_top3_predictions(probs_dict):
    """Return up to three labels from ``probs_dict``, highest probability first.

    Labels with equal probability keep their dictionary order.
    """
    ranked_labels = sorted(probs_dict, key=probs_dict.get, reverse=True)
    return ranked_labels[:3]


def calculate_metrics(val_df, logprobs_data):
    """Compute top-1 accuracy and MAP@3 over the rows of ``val_df``.

    Args:
        val_df: Frame with ``row_id`` and ``completion`` (true label) columns.
        logprobs_data: Mapping ``row_id -> {label: probability}``.

    Returns:
        Dict with ``accuracy``, ``map@3`` and ``evaluated_samples``. Rows with
        no (or empty) predictions are skipped and not counted; both averages
        are 0.0 when nothing was evaluated.
    """
    top1_hits = []
    reciprocal_ranks = []

    for row_id, true_label in zip(val_df["row_id"], val_df["completion"]):
        probs = logprobs_data.get(row_id, {})
        if not probs:
            # No prediction for this row: leave it out of both metrics
            continue

        ranked = get_top3_predictions(probs)

        # Position of the true label within the top 3, or None when absent
        hit_rank = ranked.index(true_label) if true_label in ranked else None

        top1_hits.append(1.0 if hit_rank == 0 else 0.0)
        reciprocal_ranks.append(0.0 if hit_rank is None else 1.0 / (hit_rank + 1))

    return {
        "accuracy": np.mean(top1_hits) if top1_hits else 0.0,
        "map@3": np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0,
        "evaluated_samples": len(top1_hits)
    }


# Compute the metrics on the validation rows and print a short report
metrics = calculate_metrics(val_df, logprobs_data)

print("\n========== Validation Results ==========")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"MAP@3: {metrics['map@3']:.4f}")
print(f"Evaluated samples: {metrics['evaluated_samples']}/{len(val_df)}")
print("=========================================")

In [None]:
# Optional analysis: the ten most frequent categories in the evaluated rows
print("\n========== Category Distribution ==========")
category_counts = val_df['Category'].value_counts().head(10)
for name, n_rows in category_counts.items():
    print(f"{name}: {n_rows} samples")

# Top-1 accuracy per category
print("\n========== Category-wise Accuracy ==========")
category_accuracies = {}

row_fields = zip(val_df["row_id"], val_df["Category"], val_df["completion"])
for row_id, category, true_label in row_fields:
    probs = logprobs_data.get(row_id, {})
    if probs:
        # Counters are created the first time a category is seen
        stats = category_accuracies.setdefault(category, {"correct": 0, "total": 0})
        stats["total"] += 1
        # Top-1 prediction = label with the highest probability
        if max(probs, key=probs.get) == true_label:
            stats["correct"] += 1

# Print the ten categories with the most evaluated rows
by_total = sorted(category_accuracies.items(), key=lambda item: item[1]["total"], reverse=True)
for category, stats in by_total[:10]:
    accuracy = stats["correct"] / stats["total"] if stats["total"] > 0 else 0
    print(f"{category}: {accuracy:.3f} ({stats['correct']}/{stats['total']})")

In [None]:
# Turn the per-category counters into one record per category
results = [
    {
        "Category": category,
        "Accuracy": stats["correct"] / stats["total"] if stats["total"] > 0 else 0,
        "Correct": stats["correct"],
        "Total": stats["total"]
    }
    for category, stats in category_accuracies.items()
]

# Order by the number of evaluated rows ("Total"), largest first
results_df = pd.DataFrame(results).sort_values(by="Total", ascending=False)

# Save as CSV
results_df.to_csv(CFG.output_dir_path + "/category_wise_accuracy_v2.csv", index=False, encoding="utf-8-sig")

In [None]:
import time


def disconnect_runtime_after_timeout(timeout=3600):
    """Wait ``timeout`` seconds, then release the Google Colab runtime.

    Args:
        timeout: Delay in seconds before the runtime is unassigned. The
            printed message shows it in whole minutes (``timeout // 60``).

    Raises:
        ImportError: If ``google.colab`` is not available (not running on Colab).
    """
    # Imported here so that merely running this cell outside Colab does not fail
    from google.colab import runtime

    print(f"ランタイムが{timeout // 60}分後に自動で切断されます。")
    time.sleep(timeout)
    print("ランタイムを切断します...")
    runtime.unassign()


# Only release the runtime when the notebook is configured to run on Colab
# (the `google_colab` switch is set in the first cell).
if google_colab:
    disconnect_runtime_after_timeout(60)