In [1]:
# Google Colab setup
google_colab = True

if google_colab:
    from google.colab import drive
    from google.colab import userdata

    drive.mount("/content/drive")

    # Change into the experiment directory; the relative paths in CFG
    # ("../../kaggle/input/", "output/", "logs/") are resolved from here.
    %cd /content/drive/MyDrive/Python/kaggle_map/src/ettin-encoder-1b_exp007

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Python/kaggle_map/src/ettin-encoder-1b_exp007


In [2]:
import os
import gc
import time
import random
import pickle
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

import torch
import wandb
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
class CFG:
    """Experiment configuration: class-level constants read by the cells below."""

    # ============== Experiment info =============
    comp_name = "kaggle_map"
    exp_name = "ettin-encoder-1b_exp007_multi_model"
    model_name = "jhu-clsp/ettin-encoder-1b"

    # ============== File paths =============
    comp_dir_path = "../../kaggle/input/"
    # comp_dir_path already ends with "/", so this path contains a doubled slash.
    comp_dataset_path = f"{comp_dir_path}/map-charting-student-math-misunderstandings/"
    output_dir_path = "output/"
    log_dir_path = "logs/"

    # ============== Model settings =============
    # Passed as the tokenizer's max_length in train_model_for_question.
    max_len = 256

    # The values below are forwarded to TrainingArguments in train_model_for_question.
    num_train_epochs = 10
    per_device_train_batch_size = 32
    gradient_accumulation_steps = 1
    per_device_eval_batch_size = 64
    optim_type = "adamw_torch"
    learning_rate = 5e-5
    lr_scheduler_type = "cosine"
    warmup_steps = 50
    weight_decay = 0.01

    # ============== Miscellaneous =============
    seed = 42
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ============== Per-question model settings =============
    # NOTE(review): neither flag is read by any cell visible in this notebook.
    train_individual_models = True  # whether to train one model per question
    parallel_training = False  # parallel training (when memory allows)

In [4]:
# 乱数固定
def set_seed(seed=None, cudnn_deterministic=True):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    Args:
        seed: Integer seed; ``None`` falls back to 42.
        cudnn_deterministic: Value written to
            ``torch.backends.cudnn.deterministic``. cuDNN benchmark mode is
            always switched off.
    """
    seed = 42 if seed is None else seed

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every generator used in the notebook.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    torch.backends.cudnn.deterministic = cudnn_deterministic
    torch.backends.cudnn.benchmark = False

def make_dirs(cfg):
    """Create ``cfg.output_dir_path`` and ``cfg.log_dir_path`` if they do not exist."""
    required_dirs = (cfg.output_dir_path, cfg.log_dir_path)
    for path in required_dirs:
        os.makedirs(path, exist_ok=True)

def cfg_init(cfg):
    """Initialise a run: seed the RNGs from ``cfg.seed`` and create the output/log directories."""
    set_seed(cfg.seed)
    make_dirs(cfg)

# Executed when the cell runs: applies the initialisation to CFG.
cfg_init(CFG)

# 1. LLMの学習

## 1.1 データの読み込み

In [5]:
# Load the competition training data
train = pd.read_csv(f"{CFG.comp_dataset_path}/train.csv")
print(train.shape)
train.head(2)

(36696, 7)


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,


## 1.2 前処理

In [6]:
def add_folds_by_qid_cat_misc(df, n_splits=5, random_state=42, fallback="pair"):
    """Return a copy of ``df`` with a stratified ``fold`` column.

    Rows are stratified on the key ``QuestionId|Category|Misconception``.
    When at least one such key occurs fewer than ``n_splits`` times, the rare
    rows are stratified on a coarser key chosen by ``fallback``.

    Args:
        df: DataFrame with ``QuestionId``, ``Category`` and ``Misconception`` columns.
        n_splits: Number of folds.
        random_state: Seed passed to ``StratifiedKFold``.
        fallback: Key used for rare rows: ``"pair"`` (``Category|Misconception``),
            ``"category"`` (``Category`` only) or ``"none"`` (keep the full key).

    Returns:
        A copy of ``df`` with an integer ``fold`` column in ``0 .. n_splits - 1``.

    Raises:
        ValueError: If rare keys exist and ``fallback`` is not one of the
            three accepted values.
    """
    qid_str = df["QuestionId"].astype(str).fillna("NA")
    cat_str = df["Category"].astype(str).fillna("NA")
    misc_str = df["Misconception"].astype(str).fillna("NA")

    triple_key = qid_str + "|" + cat_str + "|" + misc_str
    pair_key = cat_str + "|" + misc_str

    key_counts = triple_key.value_counts()
    strat_key = triple_key
    # The fallback is only consulted (and validated) when a rare key exists.
    if (key_counts < n_splits).any():
        if fallback not in ("pair", "category", "none"):
            raise ValueError("fallback は 'pair' / 'category' / 'none' のいずれかにしてください。")
        if fallback != "none":
            is_rare = triple_key.map(key_counts) < n_splits
            coarse_key = pair_key if fallback == "pair" else cat_str
            strat_key = np.where(is_rare, coarse_key, triple_key)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # -1 marks "not yet assigned"; every row is in exactly one validation split.
    fold_ids = np.full(len(df), -1, dtype=int)
    dummy_features = np.zeros(len(df))
    for fold_no, (_, val_idx) in enumerate(splitter.split(dummy_features, strat_key)):
        fold_ids[val_idx] = fold_no

    out = df.copy()
    out["fold"] = fold_ids
    return out

In [7]:
# Assign folds (adds the "fold" column to train)
train = add_folds_by_qid_cat_misc(train, n_splits=5, random_state=42, fallback="pair")
train["fold"].value_counts()



Unnamed: 0_level_0,count
fold,Unnamed: 1_level_1
0,7340
1,7339
4,7339
2,7339
3,7339


In [8]:
# Fix ground-truth labels that are known to be wrong.
# (MEMO: this should be applied to the training split only, otherwise CV/LB
#  correlation is lost; ignored for now.)
false_to_true_ids = [12878, 12901, 13876, 14089, 14159, 14185]
true_to_false_ids = [14280, 14305, 14321, 14335, 14338,  14352, 14355, 14403, 14407, 14412, 14413, 14418]

# (row_ids to fix, text to replace, replacement), applied in this order.
label_fixes = (
    (false_to_true_ids, "False", "True"),
    (true_to_false_ids, "True", "False"),
)
for fix_ids, old_text, new_text in label_fixes:
    needs_fix = train["row_id"].isin(fix_ids)
    train["Category"] = np.where(needs_fix, train["Category"].str.replace(old_text, new_text), train["Category"])

In [9]:
# Show which of the corrected row_ids ended up in each fold
is_corrected = train["row_id"].isin(false_to_true_ids + true_to_false_ids)
for f in range(5):
    in_fold = train["fold"] == f
    ids = train.loc[in_fold & is_corrected, "row_id"].values
    print(f"Fold {f}: {ids}")

Fold 0: [12878 14159]
Fold 1: [14280 14305 14338 14407]
Fold 2: [14089 14335 14355]
Fold 3: [13876 14185 14418]
Fold 4: [12901 14321 14352 14403 14412 14413]


In [10]:
# Unify the duplicated Misconception spelling
# (MEMO: predictions need to be redistributed in post-processing)
train["Misconception"] = train["Misconception"].replace("Wrong_Fraction", "Wrong_fraction")

In [11]:
# Build the classification target as "<Category>:<Misconception>"
train["Misconception"] = train["Misconception"].fillna("NA")
train["target"] = train["Category"] + ":" + train["Misconception"]

## 1.3 特徴量エンジニアリング

In [12]:
# Build the is_correct flag.
# Rows whose Category prefix (text before the first "_") is "True" are used to
# find, per QuestionId, the MC_Answer that occurs most often among them; that
# (QuestionId, MC_Answer) pair is flagged is_correct = 1.
idx = train["Category"].str.split("_").str[0] == "True"
correct = train.loc[idx].copy()
correct["c"] = correct.groupby(["QuestionId", "MC_Answer"]).MC_Answer.transform("count")
correct = (
    correct.sort_values("c", ascending=False)
    .drop_duplicates(["QuestionId"])[["QuestionId", "MC_Answer"]]
    .assign(is_correct=1)
)

# Merge the flag into the training data.
# Any is_correct column left over from a previous run of this cell is dropped
# first; otherwise the merge would create is_correct_x / is_correct_y and the
# fillna below would fail.
train = train.drop(columns=["is_correct"], errors="ignore").merge(
    correct, on=["QuestionId", "MC_Answer"], how="left"
)
train["is_correct"] = train["is_correct"].fillna(0)

## 1.4 QuestionIdごとの個別モデル学習

In [13]:
def format_input(row):
    """Render one row as the model's input text.

    Reads ``QuestionText``, ``MC_Answer``, ``is_correct`` and
    ``StudentExplanation`` from ``row``; a truthy ``is_correct`` is rendered
    as "Yes", anything falsy as "No".
    """
    verdict = "Yes" if row["is_correct"] else "No"
    question_line = f"Question: {row['QuestionText']}\n"
    answer_line = f"Answer: {row['MC_Answer']}\n"
    verdict_line = f"Correct: {verdict}\n"
    explanation_line = f"Student Explanation: {row['StudentExplanation']}"
    return question_line + answer_line + verdict_line + explanation_line

In [14]:
def train_model_for_question(train_df, qid, cfg, tokenizer, fold_to_use=0):
    """Fine-tune one sequence-classification model on the rows of a single QuestionId.

    Args:
        train_df: DataFrame with the rows for ``qid``. Must contain ``target``
            and ``fold`` plus the columns read by ``format_input``. The frame
            is copied, so the caller's object is not modified.
        qid: QuestionId; used in log messages and in the output sub-directory name.
        cfg: Config object supplying ``model_name``, ``max_len``, the output/log
            paths and the training hyper-parameters.
        tokenizer: Tokenizer used for both splits; also saved next to the model.
        fold_to_use: Fold held out for validation; all other folds are used for training.

    Returns:
        Tuple ``(trainer, le, val_data)``: the ``Trainer`` after training, the
        ``LabelEncoder`` fitted on this question's ``target`` values, and the
        validation rows including the added ``label`` and ``text`` columns.
    """

    print(f"\n{'='*60}")
    print(f"QuestionId {qid} のモデル学習開始")
    print(f"{'='*60}")

    # Work on a private copy so the added columns do not leak into the caller's frame.
    train_df = train_df.copy()

    # Label encoding (label space is specific to this question)
    le = LabelEncoder()
    train_df["label"] = le.fit_transform(train_df["target"])
    n_classes = len(le.classes_)

    print(f"クラス数: {n_classes}")
    print(f"サンプル数: {len(train_df)}")

    # Build the input text
    train_df["text"] = train_df.apply(format_input, axis=1)

    # Train / validation split by fold
    train_data = train_df[train_df.fold != fold_to_use]
    val_data = train_df[train_df.fold == fold_to_use]

    print(f"訓練データ: {len(train_data)}サンプル")
    print(f"検証データ: {len(val_data)}サンプル")

    # Convert to Hugging Face datasets
    COLS = ["text", "label"]
    train_ds = Dataset.from_pandas(train_data[COLS])
    val_ds = Dataset.from_pandas(val_data[COLS])

    # Tokenize
    def tokenize(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=cfg.max_len)

    train_ds = train_ds.map(tokenize, batched=True)
    val_ds = val_ds.map(tokenize, batched=True)

    # Expose the tensors the Trainer needs
    columns = ["input_ids", "attention_mask", "label"]
    train_ds.set_format(type="torch", columns=columns)
    val_ds.set_format(type="torch", columns=columns)

    # Load the pretrained encoder with a classification head sized for this question
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_name,
        num_labels=n_classes,
        ignore_mismatched_sizes=True,
    )

    # Training setup (output directory is specific to this QuestionId)
    output_dir = f"{cfg.output_dir_path}/qid_{qid}"
    os.makedirs(output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        save_strategy="steps",
        num_train_epochs=cfg.num_train_epochs,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        per_device_eval_batch_size=cfg.per_device_eval_batch_size,
        learning_rate=cfg.learning_rate,
        optim=cfg.optim_type,
        lr_scheduler_type=cfg.lr_scheduler_type,
        warmup_steps=cfg.warmup_steps,
        weight_decay=cfg.weight_decay,
        logging_dir=f"{cfg.log_dir_path}/qid_{qid}",
        logging_steps=50,
        save_steps=200,
        eval_steps=100,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="map@3",
        # MAP@3 is a score (higher is better). With False the trainer would
        # restore, and this function would then save, the worst checkpoint.
        greater_is_better=True,
        report_to="none",  # wandb disabled (too many per-question models)
        bf16=True,
        fp16=False,
    )

    # MAP@3 metric
    def compute_map3(eval_pred):
        logits, labels = eval_pred
        logits = np.asarray(logits)
        labels = np.asarray(labels)

        # Softmax is monotonic, so ranking the raw logits gives the same top-k.
        # Fewer than 3 classes: rank over all of them.
        k = min(3, logits.shape[1])
        top_k = np.argsort(-logits, axis=1)[:, :k]
        match = top_k == labels[:, None]

        # Each label occurs at most once in top_k, so the weighted row sum is
        # the reciprocal rank of the hit (0 when the label is not in the top k).
        reciprocal_ranks = 1.0 / np.arange(1, k + 1)
        return {"map@3": float((match * reciprocal_ranks).sum(axis=1).mean())}

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        processing_class=tokenizer,
        compute_metrics=compute_map3,
    )

    # Train
    print(f"\nQuestionId {qid} の学習開始...")
    trainer.train()

    # Save model and tokenizer
    model_path = f"{output_dir}/model"
    trainer.save_model(model_path)
    tokenizer.save_pretrained(f"{output_dir}/tokenizer")

    # Save the label encoder so predictions can be mapped back to target strings
    with open(f"{output_dir}/label_encoder.pkl", "wb") as f:
        pickle.dump(le, f)

    print(f"\nQuestionId {qid} の学習完了！")
    print(f"モデル保存先: {model_path}")

    # Best-effort memory release.
    # NOTE(review): the returned trainer was constructed with this model, so
    # the weights are presumably only freed once the caller drops the trainer.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return trainer, le, val_data

In [15]:
# Per-QuestionId training driver.
# NOTE: the loop header below is commented out, so this cell currently trains
# a single question only (question_ids[2]).
question_ids = sorted(train['QuestionId'].unique())
print(f"学習するQuestionIdのリスト: {question_ids}")

# One tokenizer is shared by all questions
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# Per-QuestionId results, keyed by qid
training_results = {}
label_encoders = {}
validation_data = {}

# Training loop (disabled)
# for qid in tqdm(question_ids, desc="QuestionIdごとのモデル学習"):

qid = question_ids[2]

# Select the rows for this QuestionId (copied so the function gets its own frame)
train_qid = train[train['QuestionId'] == qid].copy()

# Train the model, validating on fold 0
trainer, le, val_data = train_model_for_question(
    train_qid,
    qid,
    CFG,
    tokenizer,
    fold_to_use=0
)

# Store the results
training_results[qid] = trainer
label_encoders[qid] = le
validation_data[qid] = val_data

# Memory management.
# NOTE: this only removes the local name; the Trainer is still referenced from
# training_results[qid], so nothing is freed here.
del trainer
gc.collect()

# print("\n全QuestionIdのモデル学習が完了しました！")

学習するQuestionIdのリスト: [np.int64(31772), np.int64(31774), np.int64(31777), np.int64(31778), np.int64(32829), np.int64(32833), np.int64(32835), np.int64(33471), np.int64(33472), np.int64(33474), np.int64(76870), np.int64(89443), np.int64(91695), np.int64(104665), np.int64(109465)]

QuestionId 31777 のモデル学習開始
クラス数: 7
サンプル数: 2809
訓練データ: 2247サンプル
検証データ: 562サンプル


Map:   0%|          | 0/2247 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/ettin-encoder-1b and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.



QuestionId 31777 の学習開始...


Step,Training Loss,Validation Loss,Map@3
100,0.3005,0.283559,0.948695
200,0.1753,0.332784,0.963227
300,0.1172,0.302477,0.965896
400,0.036,0.462695,0.963227
500,0.0153,0.438429,0.966785
600,0.0026,0.467832,0.968268
700,0.0016,0.467911,0.968268



QuestionId 31777 の学習完了！
モデル保存先: output//qid_31777/model


0

# 2. 推論と評価

In [16]:
# def load_model_for_question(qid, cfg):
#     """特定のQuestionId用のモデルをロード"""
#     model_dir = f"{cfg.output_dir_path}/qid_{qid}"
#     model_path = f"{model_dir}/model"
#     tokenizer_path = f"{model_dir}/tokenizer"

#     # メタデータの読み込み
#     metadata = pd.read_csv(f"{model_dir}/metadata.csv").iloc[0]
#     n_classes = metadata['n_classes']

#     # モデルとトークナイザーのロード
#     model = AutoModelForSequenceClassification.from_pretrained(
#         model_path,
#         num_labels=n_classes,
#     )
#     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

#     # ラベルエンコーダーのロード
#     with open(f"{model_dir}/label_encoder.pkl", "rb") as f:
#         le = pickle.load(f)

#     return model, tokenizer, le, metadata

In [17]:
# def predict_for_question(model, tokenizer, val_data, cfg):
#     """特定のQuestionId用モデルで予測"""
#     # Dataset作成
#     val_ds = Dataset.from_pandas(val_data[["text", "label"]])

#     # トークナイズ
#     def tokenize(batch):
#         return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=cfg.max_len)

#     val_ds = val_ds.map(tokenize, batched=True)
#     columns = ["input_ids", "attention_mask", "label"]
#     val_ds.set_format(type="torch", columns=columns)

#     # 推論用トレーナー
#     inference_trainer = Trainer(
#         model=model,
#         tokenizer=tokenizer
#     )

#     # 予測
#     predictions = inference_trainer.predict(val_ds)
#     logits = predictions.predictions
#     labels = predictions.label_ids

#     return logits, labels

In [18]:
# def apply_category_mask(logits, is_correct_flags, label_to_category, mode="exclude"):
#     """正解/不正解に基づいてTrue/Falseカテゴリをマスキング"""
#     masked_logits = logits.copy()

#     if mode == "exclude":
#         for i, is_correct in enumerate(is_correct_flags):
#             for label_idx in range(logits.shape[1]):
#                 category = label_to_category[label_idx]

#                 if is_correct == 1:
#                     # 正解の場合、True_*以外を除外
#                     if not category.startswith("True_"):
#                         masked_logits[i, label_idx] = -np.inf
#                 else:
#                     # 不正解の場合、False_*以外を除外
#                     if not category.startswith("False_"):
#                         masked_logits[i, label_idx] = -np.inf

#     return masked_logits

In [19]:
# # 全体の評価
# all_predictions = []
# all_labels = []
# all_question_ids = []
# all_row_ids = []

# print("\n各QuestionIdの検証データに対して推論実行...\n")

# for qid in tqdm(question_ids, desc="推論"):
#     # モデルのロード
#     model, tokenizer_qid, le, metadata = load_model_for_question(qid, CFG)

#     # 検証データの取得
#     val_data = validation_data[qid]

#     # 予測
#     logits, labels = predict_for_question(model, tokenizer_qid, val_data, CFG)

#     # Categoryマスクの適用
#     label_to_category = {i: target.split(':')[0] for i, target in enumerate(le.classes_)}
#     is_correct_flags = val_data['is_correct'].values
#     masked_logits = apply_category_mask(logits, is_correct_flags, label_to_category)

#     # 結果を保存
#     all_predictions.append(masked_logits)
#     all_labels.append(labels)
#     all_question_ids.extend([qid] * len(labels))
#     all_row_ids.extend(val_data['row_id'].values)

#     # メモリ解放
#     del model
#     gc.collect()
#     torch.cuda.empty_cache()

# print("\n推論完了！")

In [20]:
# def calculate_overall_map3(predictions_by_qid, labels_by_qid):
#     """
#     全QuestionIdを統合したMAP@3を計算

#     注意: 各QuestionIdで異なるラベル空間を持つため、
#     QuestionIdごとに計算してから平均を取る
#     """
#     all_map3_scores = []

#     for logits, labels in zip(predictions_by_qid, labels_by_qid):
#         probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

#         k = min(3, probs.shape[1])
#         top_k = np.argsort(-probs, axis=1)[:, :k]
#         match = top_k == labels[:, None]

#         for i in range(len(labels)):
#             score = 0
#             for j in range(k):
#                 if match[i, j]:
#                     score = 1.0 / (j + 1)
#                     break
#             all_map3_scores.append(score)

#     return np.mean(all_map3_scores)

# # 全体のMAP@3を計算
# overall_map3 = calculate_overall_map3(all_predictions, all_labels)
# print(f"\n全体のMAP@3スコア: {overall_map3:.4f}")

In [21]:
# # QuestionIdごとの性能を評価
# print("\nQuestionIdごとのMAP@3スコア:")
# print("="*50)

# qid_performance = {}
# for i, qid in enumerate(question_ids):
#     logits = all_predictions[i]
#     labels = all_labels[i]

#     probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
#     k = min(3, probs.shape[1])
#     top_k = np.argsort(-probs, axis=1)[:, :k]
#     match = top_k == labels[:, None]

#     map3 = 0
#     for j in range(len(labels)):
#         for m in range(k):
#             if match[j, m]:
#                 map3 += 1.0 / (m + 1)
#                 break
#     map3 /= len(labels)

#     accuracy = (np.argmax(logits, axis=1) == labels).mean()

#     qid_performance[qid] = {
#         'map3': map3,
#         'accuracy': accuracy,
#         'n_samples': len(labels)
#     }

#     print(f"QuestionId {qid}: MAP@3={map3:.4f}, Acc={accuracy:.4f}, N={len(labels)}")

# print("="*50)
# print(f"\n平均MAP@3: {np.mean([v['map3'] for v in qid_performance.values()]):.4f}")
# print(f"平均精度: {np.mean([v['accuracy'] for v in qid_performance.values()]):.4f}")

In [22]:
# # 結果をCSVに保存
# performance_df = pd.DataFrame.from_dict(qid_performance, orient='index')
# performance_df.index.name = 'QuestionId'
# performance_df.to_csv(f"{CFG.output_dir_path}/question_performance.csv")
# print(f"\n性能評価結果を保存: {CFG.output_dir_path}/question_performance.csv")