In [1]:
# Google Colabでの設定
google_colab = True

if google_colab:
    from google.colab import drive
    from google.colab import userdata

    drive.mount("/content/drive")

    # ディレクトリ移動
    %cd /content/drive/MyDrive/Python/kaggle_map/src/qwen2.5-14b-instruct-stage1_exp002

    !pip install -q bitsandbytes trl==0.14.0

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Python/kaggle_map/src/qwen2.5-14b-instruct-stage1_exp002


In [2]:
import os
import gc
import time
import random
from jinja2 import Template

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import torch
from torch.utils.data import Dataset, DataLoader
import wandb
# from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoProcessor,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

In [3]:
class CFG:
    """実験設定管理クラス"""

    # ============== 実験情報 =============
    comp_name = "kaggle_map"
    exp_name = "qwen2.5-14b-instruct-stage1_exp002"
    model_name = "Qwen/Qwen2.5-7B-Instruct"

    # ============== ファイルパス設定 =============
    comp_dir_path = "../../kaggle/input/"
    comp_dataset_path = f"{comp_dir_path}/map-charting-student-math-misunderstandings/"
    output_dir_path = "output/"
    log_dir_path = "logs/"

    # ============== モデル設定 =============
    max_len = 256

    num_train_epochs = 3
    per_device_train_batch_size = 4
    gradient_accumulation_steps = 8
    per_device_eval_batch_size = 1
    optim_type = "adamw_torch"
    learning_rate = 5e-4
    lr_scheduler_type = "cosine"
    warmup_steps = 50
    weight_decay = 0.01

    lora_r = 32
    lora_alpha = 64
    lora_dropout = 0.05
    lora_bias = "none"
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    # ============== その他設定 =============
    seed = 42
    device = torch.device("cuda")

In [4]:
# 乱数固定
def set_seed(seed=None, cudnn_deterministic=True):
    if seed is None:
        seed = 42

    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = cudnn_deterministic
    torch.backends.cudnn.benchmark = False

def make_dirs(cfg):
    for dir in [cfg.output_dir_path, cfg.log_dir_path]:
        os.makedirs(dir, exist_ok=True)

def cfg_init(cfg):
    set_seed(cfg.seed)
    make_dirs(cfg)

# 1. LLMの学習

## 1.1 データの読み込み

In [5]:
# データ読み込み
train = pd.read_csv(f"{CFG.comp_dataset_path}/train.csv")
print(train.shape)
train.head(2)

(36696, 7)


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,


## 1.2 前処理

In [6]:
def add_folds_by_qid_cat_misc(df, n_splits=5, random_state=42, fallback="pair"):
    s_qid = df["QuestionId"].astype(str).fillna("NA")
    s_cat = df["Category"].astype(str).fillna("NA")
    s_misc = df["Misconception"].astype(str).fillna("NA")

    y_triple = s_qid + "|" + s_cat + "|" + s_misc
    y_pair = s_cat + "|" + s_misc

    cnt = y_triple.value_counts()
    if (cnt < n_splits).any():
        if fallback == "pair":
            rare = y_triple.map(cnt) < n_splits
            y = np.where(rare, y_pair, y_triple)
        elif fallback == "category":
            rare = y_triple.map(cnt) < n_splits
            y = np.where(rare, s_cat, y_triple)
        elif fallback == "none":
            y = y_triple
        else:
            raise ValueError("fallback は 'pair' / 'category' / 'none' のいずれかにしてください。")
    else:
        y = y_triple

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    folds = np.full(len(df), -1, dtype=int)
    for fold, (_, val_idx) in enumerate(skf.split(np.zeros(len(df)), y)):
        folds[val_idx] = fold

    out = df.copy()
    out["fold"] = folds
    return out

In [7]:
# fold分割
train = add_folds_by_qid_cat_misc(train, n_splits=5, random_state=42, fallback="pair")
train["fold"].value_counts()



Unnamed: 0_level_0,count
fold,Unnamed: 1_level_1
0,7340
1,7339
4,7339
2,7339
3,7339


In [8]:
# 間違っている正解ラベルを修正(MEMO: trainのみに適用しないとCV/LBの相関がとれないが、一旦無視)
false_to_true_ids = [12878, 12901, 13876, 14089, 14159, 14185]
train["Category"] = np.where(train["row_id"].isin(false_to_true_ids), train["Category"].str.replace("False", "True"), train["Category"])

true_to_false_ids = [14280, 14305, 14321, 14335, 14338,  14352, 14355, 14403, 14407, 14412, 14413, 14418]
train["Category"] = np.where(train["row_id"].isin(true_to_false_ids), train["Category"].str.replace("True", "False"), train["Category"])

In [9]:
# 修正されたラベルが含まれるfold
for f in range(5):
    ids = train[(train["fold"] == f) & (train["row_id"].isin(false_to_true_ids) | train["row_id"].isin(true_to_false_ids))]["row_id"].values
    print(f"Fold {f}: {ids}")

Fold 0: [12878 14159]
Fold 1: [14280 14305 14338 14407]
Fold 2: [14089 14335 14355]
Fold 3: [13876 14185 14418]
Fold 4: [12901 14321 14352 14403 14412 14413]


In [10]:
# ラベルの作成
train["target"] = train.Category

# ラベルエンコード
le = LabelEncoder()
train["label"] = le.fit_transform(train["target"])

n_classes = len(le.classes_)
print(f"訓練データの形状: {train.shape} - {n_classes}個のターゲットクラス")
train.head(2)

訓練データの形状: (36696, 10) - 6個のターゲットクラス


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,fold,target,label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,1,True_Correct,3
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,4,True_Correct,3


## 1.3 特徴量エンジニアリング

In [11]:
# 正解フラグの作成
idx = train.apply(lambda row: row.Category.split("_")[0], axis=1) == "True"
correct = train.loc[idx].copy()
correct["c"] = correct.groupby(["QuestionId", "MC_Answer"]).MC_Answer.transform("count")
correct = correct.sort_values("c", ascending=False)
correct = correct.drop_duplicates(["QuestionId"])
correct = correct[["QuestionId", "MC_Answer"]]
correct["is_correct"] = 1

# 正解ラベルを訓練データにマージ
train = train.merge(correct, on=["QuestionId", "MC_Answer"], how="left")
train.is_correct = train.is_correct.fillna(0)

## 1.4 tokenize処理

In [12]:
# tokenizerの読み込み
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
tokenizer.padding_side = "right"

In [13]:
template = Template("""<|im_start|>system
"You are Qwen, created by Alibaba Cloud. You are a helpful assistant."<|im_end|>
<|im_start|>user
You are a specialist in evaluating student's answers and explanations to math problems.
Based on the information provided below, determine whether the student's answer is correct and whether their explanation is valid.

Question: {{QuestionText}}
Answer: {{MC_Answer}}
Correct: {{Correct}}
Student Explanation: {{StudentExplanation}}
Choice: True_Correct, True_Misconception, True_Neither, False_Correct, False_Misconception, False_Neither

Target:<|im_end|>
<|im_start|>assistant
{{target}}""")

In [14]:
def preprocess_row(row, tokenizer):
    input_text = template.render(
        QuestionText=row["QuestionText"],
        MC_Answer=row["MC_Answer"],
        Correct=row["is_correct"],
        StudentExplanation=row["StudentExplanation"],
        target=row["target"]
    )
    item = tokenizer(input_text, add_special_tokens=False, truncation=False)
    return item

def preprocess_df(df, tokenizer):
    items = []
    for _, row in df.iterrows():
        items.append(preprocess_row(row, tokenizer))

    df = pd.concat([df, pd.DataFrame(items)], axis=1)
    return df

train = preprocess_df(train, tokenizer)

In [15]:
class LLMDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = {"input_ids": row["input_ids"]}

        return inputs

In [16]:
# token数がmax_length以下のものに絞り込む
print(train.shape)
train["token_count"] = train["input_ids"].apply(len)

# 100トークンごとにビン分割
train["token_bin"] = pd.cut(train["token_count"], bins=range(0, train["token_count"].max() + 100, 100))

# 結果を表示
for bin_range, group in train.groupby("token_bin"):
    print(f"Token count bin: {bin_range}")
    print(group["target"].value_counts())
    print("-" * 40)

(36696, 13)
Token count bin: (0, 100]
Series([], Name: count, dtype: int64)
----------------------------------------
Token count bin: (100, 200]
target
True_Correct           12428
False_Misconception     8133
False_Neither           5883
True_Neither            4789
True_Misconception       374
False_Correct            200
Name: count, dtype: int64
----------------------------------------
Token count bin: (200, 300]
target
True_Correct           2367
False_Misconception    1325
False_Neither           658
True_Neither            476
False_Correct            33
True_Misconception       28
Name: count, dtype: int64
----------------------------------------
Token count bin: (300, 400]
target
True_Correct    1
Name: count, dtype: int64
----------------------------------------
Token count bin: (400, 500]
target
True_Neither    1
Name: count, dtype: int64
----------------------------------------


  for bin_range, group in train.groupby("token_bin"):


In [17]:
train = train[train["token_count"] <= CFG.max_len].reset_index(drop=True)
print(train.shape)

(36670, 15)


In [18]:
train_ds = LLMDataset(train)
data_collator = DataCollatorForCompletionOnlyLM("<|im_start|>assistant\n", tokenizer=tokenizer)
batch = next(iter(DataLoader(train_ds, batch_size=4, collate_fn=data_collator)))

In [19]:
batch["input_ids"][0]

tensor([151644,   8948,    198,  21608,    525,   1207,  16948,     11,   3465,
           553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
          1189, 151645,    198, 151644,    872,    198,   2610,    525,    264,
         23753,    304,  37563,   5458,    594,  11253,    323,  40841,    311,
          6888,   5322,    624,  28715,    389,    279,   1995,   3897,   3685,
            11,   8253,   3425,    279,   5458,    594,   4226,    374,   4396,
           323,   3425,    862,  16148,    374,   2697,    382,  14582,     25,
          3555,  19419,    315,    279,   6083,    374,    537,  91766,     30,
         20678,    697,   4226,    304,   1181,  44548,   1352,     13,    508,
          1906,     25,    362,  21495,   6718,   1119,    220,     24,   6144,
          9155,  42446,     13,    220,     21,    315,   1105,    525,  91766,
            13,    921,  16141,     25,  17767,   1124,  37018,     90,     16,
         15170,     18,     92,   1124, 

In [20]:
batch["labels"][0]

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 

In [21]:
print(tokenizer.decode(batch["input_ids"][0]))

<|im_start|>system
"You are Qwen, created by Alibaba Cloud. You are a helpful assistant."<|im_end|>
<|im_start|>user
You are a specialist in evaluating student's answers and explanations to math problems.
Based on the information provided below, determine whether the student's answer is correct and whether their explanation is valid.

Question: What fraction of the shape is not shaded? Give your answer in its simplest form. [Image: A triangle split into 9 equal smaller triangles. 6 of them are shaded.]
Answer: \( \frac{1}{3} \)
Correct: 1.0
Student Explanation: 0ne third is equal to tree nineth
Choice: True_Correct, True_Misconception, True_Neither, False_Correct, False_Misconception, False_Neither

Target:<|im_end|>
<|im_start|>assistant
True_Correct<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [22]:
# 訓練データと検証データに分割
train_df = train[train.fold != 0]
val_df = train[train.fold == 0].head(2)

print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}")

Train samples: 29335, Val samples: 2


In [23]:
target_list = train["target"].unique().tolist()
for target in target_list:
    print(target, tokenizer.encode(target))

True_Correct [2514, 920, 27034]
True_Neither [2514, 1604, 49898]
True_Misconception [2514, 1245, 285, 443, 995]
False_Neither [4049, 1604, 49898]
False_Misconception [4049, 1245, 285, 443, 995]
False_Correct [4049, 920, 27034]


## 1.5 学習設定

In [24]:
# wandbのログイン
wandb.login(key=userdata.get("WANDB_API_KEY"))
wandb.init(project=CFG.comp_name, name=CFG.exp_name)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtomokazu_rikioka[0m ([33mtomokazu_rikioka_[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [25]:
# 量子化の設定
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_4bit_quant_type="nf4"
)

# モデルの読み込み
model = AutoModelForCausalLM.from_pretrained(
    CFG.model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
# 学習の設定
training_args = SFTConfig(
    output_dir=CFG.output_dir_path,
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    save_strategy="steps",
    num_train_epochs=CFG.num_train_epochs,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    learning_rate=CFG.learning_rate,
    optim=CFG.optim_type,
    lr_scheduler_type=CFG.lr_scheduler_type,
    warmup_steps=CFG.warmup_steps,
    weight_decay=CFG.weight_decay,
    logging_dir=CFG.log_dir_path,
    logging_steps=50,
    save_steps=600,
    eval_steps=50,
    save_total_limit=1,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    load_best_model_at_end=True,
    max_seq_length=CFG.max_len,
    # gradient_checkpointing=True,
    report_to="wandb",
    bf16=True,
    fp16=False,  # KaggleはT4なのでFP16で推論
)

In [27]:
# LoRAの設定
lora_config = LoraConfig(
    r=CFG.lora_r,
    lora_alpha=CFG.lora_alpha,
    lora_dropout=CFG.lora_dropout,
    bias=CFG.lora_bias,
    target_modules=CFG.target_modules,
    task_type="CAUSAL_LM",
)

In [28]:
# カスタムメトリック（正解率）
def compute_metrics(eval_pred):
    """
    SFTTrainer用のメトリック計算関数
    生成されたトークンIDから直接カテゴリを判定
    """
    predictions, labels = eval_pred

    # カテゴリとその最初のトークンIDのマッピング
    category_first_tokens = {
        2514: "True",   # True_XXX
        4049: "False"   # False_XXX
    }

    category_second_tokens = {
        920: "Correct",
        1604: "Neither",
        1245: "Misconception"
    }

    # predictionsがtupleの場合の処理
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # トークンIDから最も確率の高いものを選択
    if len(predictions.shape) == 3:  # (batch_size, sequence_length, vocab_size)
        pred_tokens = np.argmax(predictions, axis=-1)
    else:
        pred_tokens = predictions

    # 予測とラベルを解析
    pred_categories = []
    label_categories = []

    for pred_seq, label_seq in zip(pred_tokens, labels):
        # ラベルの解析（-100でない部分を抽出）
        valid_label_indices = np.where(label_seq != -100)[0]

        if len(valid_label_indices) >= 2:
            # ラベルのカテゴリを判定
            label_tokens = label_seq[valid_label_indices[:2]]
            label_first = category_first_tokens.get(label_tokens[0], "Unknown")
            label_second = category_second_tokens.get(label_tokens[1], "Unknown")
            label_category = f"{label_first}_{label_second}"

            # 予測のカテゴリを判定（ラベルと同じ位置から）
            pred_start_idx = valid_label_indices[0]
            if pred_start_idx < len(pred_seq) - 1:
                pred_first_token = pred_seq[pred_start_idx]
                pred_second_token = pred_seq[pred_start_idx + 1] if pred_start_idx + 1 < len(pred_seq) else 0

                pred_first = category_first_tokens.get(pred_first_token, "Unknown")
                pred_second = category_second_tokens.get(pred_second_token, "Unknown")
                pred_category = f"{pred_first}_{pred_second}"
            else:
                pred_category = "Unknown_Unknown"

            pred_categories.append(pred_category)
            label_categories.append(label_category)

        # 推論結果をデコードして表示
        print(f"Prediction: {tokenizer.decode(pred_seq)}")

    # 精度を計算
    if len(pred_categories) > 0:
        correct = sum([p == l for p, l in zip(pred_categories, label_categories)])
        accuracy = correct / len(pred_categories)
    else:
        accuracy = 0.0

    return {"accuracy": accuracy}

## 1.6 モデルの学習

In [29]:
# SFTTrainerの設定
trainer = SFTTrainer(
    model=model,
    train_dataset=LLMDataset(train_df),
    eval_dataset=LLMDataset(val_df),
    peft_config=lora_config,
    data_collator=data_collator,
    processing_class=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
)

In [None]:
# モデルの学習
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Accuracy
50,0.4778,0.141733,0.0


Prediction: 

____ are awen, a by the Cloud. You are a character and who Q
Sure
:Hello are a character in ancient the performance writing in providing in math problems.Can on the following provided,, evaluate if the student's answer is correct or provide the explanation is clear.Problem: What is of the circle is shaded shaded?

 ( your answer as simplest simplest form.

 

Shape of A circle with into 6 equal parts triangles, 5 of the are shaded.]

]
Correct:  \frac{3}{3} \)
Explanation: Yes1
 Explanation
Explanation's: The1/ of  3 smaller't shaded,Correct: /orrect, True_Cisleadingception, False_Cegative, False_Norrect, False_Misconception, False_Neither
True: True
<|im_start|><|im_start|>
True_Correct<|im_end|>!!!!!!!!!!
Prediction: 

____ are awen, a by the Cloud, You are a character and who Q
Sure
:Hello are a character in ancient the performance writing in providing in math problems.Can on the following provided,, evaluate the the student's answer is correct or provide the explanati

## 1.7 モデルの保存

In [None]:
# モデルの保存
trainer.model.save_pretrained("{CFG.output_dir_path}/peft")

# モデルの設定を更新
model.config.use_cache = True

In [None]:
# trainer.save_model(f"{CFG.output_dir_path}/model")
# tokenizer.save_pretrained(f"{CFG.output_dir_path}/tokenizer")

In [None]:
# WandBのセッションを終了
with wandb.init():
    wandb.finish()

In [None]:
# キャッシュ削除
del model, tokenizer, trainer
gc.collect()
torch.cuda.empty_cache()

# 2. 結果分析

## 2.1 検証データの予測

In [None]:
# 学習済みモデルの読み込み
model_path = f"{CFG.output_dir_path}/model"
tokenizer_path = f"{CFG.output_dir_path}/tokenizer"

inference_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=n_classes,
    reference_compile=False,
)
inference_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# 推論用のインスタンスを作成
inference_trainer = Trainer(
    model=inference_model,
    processing_class=inference_tokenizer
)

# 検証データの推論実行
inference_model.eval()
start_time = time.time()

val_predictions = inference_trainer.predict(val_ds)
logits = val_predictions.predictions
val_labels = val_predictions.label_ids

# 推論時間を計算
inference_time = time.time() - start_time
print(f"推論時間: {inference_time:.2f}秒")
print(f"サンプル数: {len(val_labels)}個")
print(f"1サンプルあたりの推論時間: {inference_time/len(val_labels)*1000:.2f}ms")

# 確率値に変換
val_probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

# Top-1精度の計算
val_pred_labels = np.argmax(logits, axis=1)
accuracy = (val_pred_labels == val_labels).mean()
print(f"\nTop-1精度: {accuracy:.4f}")