<a href="https://colab.research.google.com/github/Shun0212/CodeBERTPretrained/blob/main/EvalCodeMor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets



In [2]:
import torch
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity

# デバイス設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用デバイス: {device}")

def get_cls_embedding(model, tokenizer, text, device, max_length=256):
    """
    入力テキストから [CLS] トークンの埋め込みを取得する関数。
    ※モデルが BERT 系の場合は model.bert を、RoBERTa 系の場合は model.roberta を利用して出力を取得します。
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # モデルの種類に応じて内部のエンコーダにアクセスする
    if hasattr(model, "bert"):
        outputs = model.bert(**inputs)
    elif hasattr(model, "roberta"):
        outputs = model.roberta(**inputs)
    else:
        raise ValueError("Model does not have attribute 'bert' or 'roberta'.")

    # outputs.last_hidden_state: (batch_size, sequence_length, hidden_size)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] または最初のトークンの埋め込み
    return cls_embedding.detach().cpu().numpy()

def calculate_metrics(sim_matrix):
    """
    類似度行列から MRR, Recall@k, Precision@k などの評価指標を計算する関数。
    """
    num_examples = sim_matrix.shape[0]
    mrr_total = 0.0
    recall_at_1_total = 0.0
    recall_at_5_total = 0.0
    precision_at_1_total = 0.0
    precision_at_5_total = 0.0

    for i in range(num_examples):
        sims = sim_matrix[i]
        ranked_indices = np.argsort(-sims)
        rank = np.where(ranked_indices == i)[0][0] + 1
        mrr_total += 1.0 / rank

        # Recall@k, Precision@k の計算
        if i in ranked_indices[:1]:
            recall_at_1_total += 1.0
            precision_at_1_total += 1.0
        if i in ranked_indices[:5]:
            recall_at_5_total += 1.0
            precision_at_5_total += 1.0

    mrr = mrr_total / num_examples
    recall_at_1 = recall_at_1_total / num_examples
    recall_at_5 = recall_at_5_total / num_examples
    precision_at_1 = precision_at_1_total / num_examples
    precision_at_5 = precision_at_5_total / num_examples

    metrics = {
        "mrr": mrr,
        "recall@1": recall_at_1,
        "recall@5": recall_at_5,
        "precision@1": precision_at_1,
        "precision@5": precision_at_5,
    }
    return metrics


def evaluate_code_search(model, tokenizer, dataset, device, max_examples=100):
    """
    CodeSearchNet の一部サンプルを使って、クエリからコード検索の評価を行い、MRR, Recall@k, Precision@k を計算する関数。

    ... ( Docstring は変更なし ) ...
    """
    codes = []
    queries = []
    num_examples = min(len(dataset), max_examples)

    for i in range(num_examples):
        ex = dataset[i]
        # キーがどの名前になっているか確認して抽出する
        if "func_code_string" in ex:
            code_text = ex["func_code_string"]
        elif "code" in ex:
            code_text = ex["code"]
        else:
            raise KeyError(f"サンプル {i} にコードテキストが見つかりません。利用可能なキー: {list(ex.keys())}")

        if "func_documentation_string" in ex:
            query_text = ex["func_documentation_string"]
        elif "docstring" in ex:
            query_text = ex["docstring"]
        else:
            raise KeyError(f"サンプル {i} にドキュメンテーション文字列が見つかりません。利用可能なキー: {list(ex.keys())}")

        codes.append(code_text)
        queries.append(query_text)

    # 各コードとクエリの埋め込みを取得
    code_embeddings = []
    query_embeddings = []

    for code in codes:
        emb = get_cls_embedding(model, tokenizer, code, device)
        code_embeddings.append(emb)
    for query in queries:
        emb = get_cls_embedding(model, tokenizer, query, device)
        query_embeddings.append(emb)

    # 各埋め込みは shape (1, hidden_size) になっているので連結
    code_embeddings = np.concatenate(code_embeddings, axis=0)  # shape: (num_examples, hidden_size)
    query_embeddings = np.concatenate(query_embeddings, axis=0)

    # クエリとコードの間の cosine similarity 行列を計算
    sim_matrix = cosine_similarity(query_embeddings, code_embeddings)

    # 評価指標の計算
    metrics = calculate_metrics(sim_matrix)
    return metrics


if __name__ == "__main__":
    # CodeSearchNet の Python 部分のテストセットをロード
    print("CodeSearchNet データセットをロードします...")
    dataset = load_dataset("code_search_net", "python", split="test", trust_remote_code=True)
    # デモ用に上位 100 件を評価対象にする
    subset = dataset.select(range(1000))

    ### CodeMorph-BERT の評価 ###
    model_name1 = "Shuu12121/CodeMorph-BERT"
    print(f"\n{model_name1} を評価します...")
    tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
    model1 = AutoModelForMaskedLM.from_pretrained(model_name1)
    model1.to(device)
    metrics1 = evaluate_code_search(model1, tokenizer1, subset, device, max_examples=23107) # max_examples は適宜調整
    print(f"CodeMorph-BERT の評価指標:")
    for name, value in metrics1.items():
        print(f"  {name}: {value:.4f}")

    ### Microsoft CodeBERT の評価 ###
    model_name2 = "microsoft/codebert-base-mlm"
    print(f"\n{model_name2} を評価します...")
    tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
    model2 = AutoModelForMaskedLM.from_pretrained(model_name2)
    model2.to(device)
    metrics2 = evaluate_code_search(model2, tokenizer2, subset, device, max_examples=23107) # max_examples は適宜調整
    print(f"CodeBERT の評価指標:")
    for name, value in metrics2.items():
        print(f"  {name}: {value:.4f}")

使用デバイス: cuda
CodeSearchNet データセットをロードします...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Shuu12121/CodeMorph-BERT を評価します...


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


CodeMorph-BERT の評価指標:
  mrr: 0.7668
  recall@1: 0.6930
  recall@5: 0.8500
  precision@1: 0.6930
  precision@5: 0.8500

microsoft/codebert-base-mlm を評価します...


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CodeBERT の評価指標:
  mrr: 0.6766
  recall@1: 0.6180
  recall@5: 0.7370
  precision@1: 0.6180
  precision@5: 0.7370


In [4]:
import torch
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, RobertaForMaskedLM, BertForMaskedLM
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import random

# デバイス設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用デバイス: {device}")

def get_cls_embedding(model, tokenizer, text, device, max_length=256):
    """
    入力テキストから [CLS] トークンの埋め込みを取得する関数。
    ※モデルが BERT 系の場合は model.bert を、RoBERTa 系の場合は model.roberta を利用して出力を取得します。
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # モデルの種類に応じて内部のエンコーダにアクセスする
    if isinstance(model, BertForMaskedLM): # BERT 系モデルの判定を修正
        outputs = model.bert(**inputs)
    elif isinstance(model, RobertaForMaskedLM): # RoBERTa 系モデルの判定を修正
        outputs = model.roberta(**inputs)
    else:
        raise ValueError("Model must be BertForMaskedLM or RobertaForMaskedLM.") # エラーメッセージを修正

    # outputs.last_hidden_state: (batch_size, sequence_length, hidden_size)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] または最初のトークンの埋め込み
    return cls_embedding.detach().cpu().numpy()


def calculate_metrics(sim_matrix, k_values=[1, 5, 10, 50, 100]):
    num_queries = sim_matrix.shape[0]
    metrics = {
        "mrr": 0.0,
        "recall@k": {k: 0.0 for k in k_values},
        "precision@k": {k: 0.0 for k in k_values},
        "ndcg@k": {k: 0.0 for k in k_values},
        "map": 0.0,
        "f1@k": {k: 0.0 for k in k_values},
        "r_precision": 0.0,
        "success_rate@k": {k: 0.0 for k in k_values},
        "query_coverage@k": {k: 0.0 for k in k_values},
    }

    for i in range(num_queries):
        sims = sim_matrix[i]
        ranked_indices = np.argsort(-sims)
        correct_rank = np.where(ranked_indices == 0)[0][0] + 1
        is_correct_in_top_k = {k: correct_rank <= k for k in k_values}

        # MRR
        metrics["mrr"] += 1.0 / correct_rank

        # 各 k に対してクエリごとの precision, recall, f1 を計算
        for k in k_values:
            if correct_rank <= k:
                # 正解がトップkに含まれる場合
                prec = 1.0 / correct_rank
                rec = 1.0  # 候補が1件の場合、正解が含まれていれば recall は 1.0
            else:
                prec = 0.0
                rec = 0.0
            f1 = calculate_f1(prec, rec)

            metrics["precision@k"][k] += prec
            metrics["recall@k"][k] += rec
            metrics["f1@k"][k] += f1

            # NDCG
            ideal_ranking = [1.0] + [0.0] * (k - 1)
            actual_ranking = [1.0 if j == 0 else 0.0 for j in ranked_indices[:k]]
            idcg = calculate_dcg(ideal_ranking)
            dcg = calculate_dcg(actual_ranking)
            metrics["ndcg@k"][k] += dcg / idcg if idcg > 0 else 0.0

            # Success Rate, Query Coverage
            metrics["success_rate@k"][k] += 1.0 if is_correct_in_top_k[k] else 0.0
            metrics["query_coverage@k"][k] += 1.0 if is_correct_in_top_k[k] else 0.0

        # MAP, R-Precision
        metrics["map"] += calculate_average_precision(ranked_indices)
        metrics["r_precision"] += calculate_r_precision(ranked_indices)

    # 各指標をクエリ数で平均化
    metrics["mrr"] /= num_queries
    metrics["map"] /= num_queries
    metrics["r_precision"] /= num_queries
    for k in k_values:
        metrics["recall@k"][k] /= num_queries
        metrics["precision@k"][k] /= num_queries
        metrics["ndcg@k"][k] /= num_queries
        metrics["f1@k"][k] /= num_queries
        metrics["success_rate@k"][k] /= num_queries
        metrics["query_coverage@k"][k] /= num_queries

    return metrics


def calculate_average_precision(ranked_indices): # ★ MAP 計算用関数を追加
    """
    Average Precision (AP) を計算する関数。
    """
    correct_rank = np.where(ranked_indices == 0)[0][0] + 1
    ap = 0.0
    for k in range(1, len(ranked_indices) + 1):
        if k == correct_rank: # k番目に正解コードが現れた場合のみ Precision を加算
            ap += 1.0 / k
    return ap


def calculate_f1(precision, recall): # ★ F1値計算用関数を追加
    """
    F1値を計算する関数 (Precision と Recall から計算)。
    """
    if precision + recall == 0: # Precision, Recall が共に 0 の場合
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def calculate_r_precision(ranked_indices): # ★ R-Precision 計算用関数を追加
    """
    R-Precision を計算する関数 (R=1 で固定: 候補プールに正解コードは1つのみ)。
    """
    r = 1 # 正解コード数 (候補プールに1つのみ)
    correct_in_top_r = 0
    for i in range(r): # 上位 r 件まで確認
        if ranked_indices[i] == 0: # 正解コードがランクイン
            correct_in_top_r += 1
    return correct_in_top_r / r # R-Precision は適合率

def calculate_dcg(ranking):
    """
    Discounted Cumulative Gain (DCG) を計算する関数。
    """
    dcg = 0.0
    for i, rel in enumerate(ranking):
        dcg += rel / np.log2(i + 2) # 順位 i+1 の割引率: 1/log2(i+2)
    return dcg


def evaluate_code_search(model, tokenizer, dataset, device, max_examples=100, pool_size=100):
    """
    CodeSearchNet データセットを使って、クエリからコード検索の厳密な評価を行い、
    MRR, Recall@k, Precision@k, NDCG@k を計算する関数 (候補コードプール使用)。

    Args:
        model, tokenizer: 評価対象のモデルとトークナイザー
        dataset: Hugging Face Datasets の CodeSearchNet データセット
        device: "cpu" または "cuda"
        max_examples: 評価に使うクエリの最大数
        pool_size: 各クエリに対する候補コードプールのサイズ (正解コード1つ + 不正解コード pool_size-1)

    Returns:
        metrics: 評価指標 (MRR, Recall@k, Precision@k, NDCG@k) を含む辞書
    """
    num_examples = min(len(dataset), max_examples)
    all_codes = [dataset[i]["func_code_string"] for i in range(len(dataset))] # データセット全体のコードを取得 (不正解候補サンプリング用)
    queries = []
    correct_code_indices = [] # 各クエリに対応する正解コードのデータセット内インデックスを保持

    for i in range(num_examples):
        ex = dataset[i]
        query_text = ex["func_documentation_string"]
        queries.append(query_text)
        correct_code_indices.append(i) # 正解コードのインデックスを記録

    # データセット全体のコード埋め込みを事前に計算 (時間短縮のため)
    all_code_embeddings = []
    print("データセット全体のコード埋め込みを計算...")
    for code in all_codes[:num_examples]: # max_examples までに制限
        emb = get_cls_embedding(model, tokenizer, code, device)
        all_code_embeddings.append(emb)
    all_code_embeddings = np.concatenate(all_code_embeddings, axis=0) # shape: (num_examples, hidden_size)
    print("コード埋め込み計算完了.")


    sim_matrices = [] # 各クエリの類似度行列を格納するリスト
    print("候補コードプールを作成し、類似度行列を計算...")
    for i in range(num_examples): # 各クエリごとに候補コードプールを作成
        query_embedding = get_cls_embedding(model, tokenizer, queries[i], device).reshape(1, -1) # クエリ埋め込みを計算 # ★reshape を追加

        candidate_pool_embeddings = []
        candidate_pool_embeddings.append(all_code_embeddings[correct_code_indices[i]]) # プール先頭に正解コードの埋め込みを追加

        # 不正解コードを候補プールに追加 (データセット全体からランダムサンプリング)
        incorrect_code_indices = []
        while len(incorrect_code_indices) < pool_size - 1:
            rand_index = random.randint(0, num_examples - 1) # 不正解コードのインデックスをランダムに選択 (max_examplesまで)
            if rand_index != correct_code_indices[i] and rand_index not in incorrect_code_indices: # 正解コードと重複しないようにチェック
                incorrect_code_indices.append(rand_index)
        for incorrect_index in incorrect_code_indices:
            candidate_pool_embeddings.append(all_code_embeddings[incorrect_index])
        candidate_pool_embeddings = np.stack(candidate_pool_embeddings, axis=0) # shape: (pool_size, hidden_size) # ★np.stack に変更



        # クエリと候補コードプール間のコサイン類似度を計算
        sims = cosine_similarity(query_embedding, candidate_pool_embeddings)[0] # shape: (pool_size,)
        sim_matrices.append(sims) # shape: (num_queries, pool_size)
    sim_matrix = np.array(sim_matrices) # shape: (num_queries, pool_size)
    print("類似度行列計算完了.")


    # 評価指標の計算
    metrics = calculate_metrics(sim_matrix)
    return metrics


if __name__ == "__main__":
    # CodeSearchNet の Python 部分のテストセットをロード
    print("CodeSearchNet データセットをロードします...")
    dataset = load_dataset("code_search_net", "python", split="test", trust_remote_code=True)
    # デモ用に上位 1000 件を評価対象にする
    subset = dataset.select(range(1000)) # 評価サンプル数を調整

    ### CodeMorph-BERT の評価 ###
    model_name1 = "Shuu12121/CodeMorph-BERT"
    print(f"\n{model_name1} を評価します (候補プールサイズ: 100)...")
    tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
    model1 = AutoModelForMaskedLM.from_pretrained(model_name1) # AutoModelForMaskedLM を使用
    model1.to(device)
    metrics1 = evaluate_code_search(model1, tokenizer1, subset, device, max_examples=1000, pool_size=100) # max_examples, pool_size を調整
    print(f"CodeMorph-BERT の評価指標 (候補プールサイズ: 100):")
    for name, value_dict in metrics1.items(): # metrics1 は辞書の中に辞書を持つ構造 (metrics1.items() で外側の辞書をiterate)
        if isinstance(value_dict, dict): # Recall@k, Precision@k, NDCG@k の場合 (辞書型)
            for k, v in value_dict.items(): # value_dict.items() で内側の辞書をiterate
                print(f"  {name}@{k}: {v:.4f}")
        else: # MRR の場合 (辞書型ではない)
            print(f"  {name}: {value_dict:.4f}")


    ### Microsoft CodeBERT (microsoft/codebert-base-mlm) の評価 ###
    model_name2 = "microsoft/codebert-base-mlm"
    print(f"\n{model_name2} を評価します (候補プールサイズ: 100)...")
    tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
    model2 = AutoModelForMaskedLM.from_pretrained(model_name2) # AutoModelForMaskedLM を使用
    model2.to(device)
    metrics2 = evaluate_code_search(model2, tokenizer2, subset, device, max_examples=1000, pool_size=100) # max_examples, pool_size を調整
    print(f"CodeBERT (microsoft/codebert-base-mlm) の評価指標 (候補プールサイズ: 100):")
    for name, value_dict in metrics2.items(): # metrics2 は辞書の中に辞書を持つ構造 (metrics2.items() で外側の辞書をiterate)
        if isinstance(value_dict, dict): # Recall@k, Precision@k, NDCG@k の場合 (辞書型)
            for k, v in value_dict.items(): # value_dict.items() で内側の辞書をiterate
                print(f"  {name}@{k}: {v:.4f}")
        else: # MRR の場合 (辞書型ではない)
            print(f"  {name}: {value_dict:.4f}")

使用デバイス: cuda
CodeSearchNet データセットをロードします...

Shuu12121/CodeMorph-BERT を評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算...
コード埋め込み計算完了.
候補コードプールを作成し、類似度行列を計算...
類似度行列計算完了.


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CodeMorph-BERT の評価指標 (候補プールサイズ: 100):
  mrr: 0.8929
  recall@k@1: 0.8520
  recall@k@5: 0.9460
  recall@k@10: 0.9660
  recall@k@50: 0.9910
  recall@k@100: 1.0000
  precision@k@1: 0.8520
  precision@k@5: 0.8885
  precision@k@10: 0.8912
  precision@k@50: 0.8927
  precision@k@100: 0.8929
  ndcg@k@1: 0.8520
  ndcg@k@5: 0.9029
  ndcg@k@10: 0.9094
  ndcg@k@50: 0.9155
  ndcg@k@100: 0.9170
  map: 0.8929
  f1@k@1: 0.8520
  f1@k@5: 0.9036
  f1@k@10: 0.9084
  f1@k@50: 0.9113
  f1@k@100: 0.9115
  r_precision: 0.8520
  success_rate@k@1: 0.8520
  success_rate@k@5: 0.9460
  success_rate@k@10: 0.9660
  success_rate@k@50: 0.9910
  success_rate@k@100: 1.0000
  query_coverage@k@1: 0.8520
  query_coverage@k@5: 0.9460
  query_coverage@k@10: 0.9660
  query_coverage@k@50: 0.9910
  query_coverage@k@100: 1.0000

microsoft/codebert-base-mlm を評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算...
コード埋め込み計算完了.
候補コードプールを作成し、類似度行列を計算...
類似度行列計算完了.
CodeBERT (microsoft/codebert-base-mlm) の評価指標 (候補プールサイズ: 100):
  mrr: 0.8064


In [7]:
import torch
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, RobertaForMaskedLM, BertForMaskedLM
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import random

# デバイス設定
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用デバイス: {device}")

def get_cls_embedding(model, tokenizer, text, device, max_length=256):
    """
    入力テキストから [CLS] トークンの埋め込みを取得する関数。
    ※モデルが BERT 系の場合は model.bert を、RoBERTa 系の場合は model.roberta を利用して出力を取得します。
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    if isinstance(model, BertForMaskedLM):
        outputs = model.bert(**inputs)
    elif isinstance(model, RobertaForMaskedLM):
        outputs = model.roberta(**inputs)
    else:
        raise ValueError("Model must be BertForMaskedLM or RobertaForMaskedLM.")

    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.detach().cpu().numpy()

def calculate_metrics(sim_matrix, k_values=[1, 5, 10, 50, 100]):
    num_queries = sim_matrix.shape[0]
    metrics = {
        "mrr": 0.0,
        "recall@k": {k: 0.0 for k in k_values},
        "precision@k": {k: 0.0 for k in k_values},
        "ndcg@k": {k: 0.0 for k in k_values},
        "map": 0.0,
        "f1@k": {k: 0.0 for k in k_values},
        "r_precision": 0.0,
        "success_rate@k": {k: 0.0 for k in k_values},
        "query_coverage@k": {k: 0.0 for k in k_values},
    }

    for i in range(num_queries):
        sims = sim_matrix[i]
        ranked_indices = np.argsort(-sims)
        correct_rank = np.where(ranked_indices == 0)[0][0] + 1
        is_correct_in_top_k = {k: correct_rank <= k for k in k_values}

        # MRR
        metrics["mrr"] += 1.0 / correct_rank

        # 各 k に対してクエリごとの precision, recall, f1 を計算
        for k in k_values:
            if correct_rank <= k:
                # 正解がトップkに含まれる場合
                prec = 1.0 / correct_rank
                rec = 1.0  # 候補が1件の場合、正解が含まれていれば recall は 1.0
            else:
                prec = 0.0
                rec = 0.0
            f1 = calculate_f1(prec, rec)

            metrics["precision@k"][k] += prec
            metrics["recall@k"][k] += rec
            metrics["f1@k"][k] += f1

            # NDCG
            ideal_ranking = [1.0] + [0.0] * (k - 1)
            actual_ranking = [1.0 if j == 0 else 0.0 for j in ranked_indices[:k]]
            idcg = calculate_dcg(ideal_ranking)
            dcg = calculate_dcg(actual_ranking)
            metrics["ndcg@k"][k] += dcg / idcg if idcg > 0 else 0.0

            # Success Rate, Query Coverage
            metrics["success_rate@k"][k] += 1.0 if is_correct_in_top_k[k] else 0.0
            metrics["query_coverage@k"][k] += 1.0 if is_correct_in_top_k[k] else 0.0

        # MAP, R-Precision
        metrics["map"] += calculate_average_precision(ranked_indices)
        metrics["r_precision"] += calculate_r_precision(ranked_indices)

    # 各指標をクエリ数で平均化
    metrics["mrr"] /= num_queries
    metrics["map"] /= num_queries
    metrics["r_precision"] /= num_queries
    for k in k_values:
        metrics["recall@k"][k] /= num_queries
        metrics["precision@k"][k] /= num_queries
        metrics["ndcg@k"][k] /= num_queries
        metrics["f1@k"][k] /= num_queries
        metrics["success_rate@k"][k] /= num_queries
        metrics["query_coverage@k"][k] /= num_queries

    return metrics

def calculate_average_precision(ranked_indices):
    correct_rank = np.where(ranked_indices == 0)[0][0] + 1
    ap = 0.0
    for k in range(1, len(ranked_indices) + 1):
        if k == correct_rank:
            ap += 1.0 / k
    return ap

def calculate_f1(precision, recall):
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

def calculate_r_precision(ranked_indices):
    r = 1
    correct_in_top_r = 0
    for i in range(r):
        if ranked_indices[i] == 0:
            correct_in_top_r += 1
    return correct_in_top_r / r

def calculate_dcg(ranking):
    dcg = 0.0
    for i, rel in enumerate(ranking):
        dcg += rel / np.log2(i + 2)
    return dcg

def evaluate_code_search(model, tokenizer, dataset, device, max_examples=100, pool_size=100,
                         query_field="func_documentation_string", code_field="func_code_string"):
    """
    クエリとコードのペアから、候補プールを作成し各種評価指標を算出する関数です。
    フィールド名は dataset 内の実際のカラム名に合わせて指定してください。
    """
    num_examples = min(len(dataset), max_examples)
    all_codes = [dataset[i][code_field] for i in range(len(dataset))]
    queries = []
    correct_code_indices = []

    for i in range(num_examples):
        ex = dataset[i]
        query_text = ex[query_field]
        queries.append(query_text)
        correct_code_indices.append(i)

    # コードの埋め込みを事前計算
    all_code_embeddings = []
    print("データセット全体のコード埋め込みを計算中...")
    for code in all_codes[:num_examples]:
        emb = get_cls_embedding(model, tokenizer, code, device)
        all_code_embeddings.append(emb)
    all_code_embeddings = np.concatenate(all_code_embeddings, axis=0)
    print("コード埋め込み計算完了.")

    sim_matrices = []
    print("候補コードプールを作成し、類似度計算中...")
    for i in range(num_examples):
        query_embedding = get_cls_embedding(model, tokenizer, queries[i], device).reshape(1, -1)
        candidate_pool_embeddings = []
        candidate_pool_embeddings.append(all_code_embeddings[correct_code_indices[i]])
        incorrect_code_indices = []
        while len(incorrect_code_indices) < pool_size - 1:
            rand_index = random.randint(0, num_examples - 1)
            if rand_index != correct_code_indices[i] and rand_index not in incorrect_code_indices:
                incorrect_code_indices.append(rand_index)
        for incorrect_index in incorrect_code_indices:
            candidate_pool_embeddings.append(all_code_embeddings[incorrect_index])
        candidate_pool_embeddings = np.stack(candidate_pool_embeddings, axis=0)


        sims = cosine_similarity(query_embedding, candidate_pool_embeddings)[0]
        sim_matrices.append(sims)
    sim_matrix = np.array(sim_matrices)
    print("類似度計算完了.")

    metrics = calculate_metrics(sim_matrix)
    return metrics

if __name__ == "__main__":

    ######################################
    # ② google/code_x_glue_tc_nl_code_search_adv を用いた評価例
    ######################################
    print("\ngoogle/code_x_glue_tc_nl_code_search_adv データセット (Test) をロードします...")
    tc_dataset = load_dataset("google/code_x_glue_tc_nl_code_search_adv", split="test")
    subset_tc = tc_dataset.select(range(10000))  # 必要に応じてサンプル数を調整

    # ここでは、クエリとして "docstring"、コードとして "code" のカラムを利用
    model_name1 = "Shuu12121/CodeMorph-BERT"
    print(f"\n{model_name1} を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...")
    tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
    model1 = AutoModelForMaskedLM.from_pretrained(model_name1)
    model1.to(device)
    metrics_tc1 = evaluate_code_search(model1, tokenizer1, subset_tc, device, max_examples=10000, pool_size=100,
                                       query_field="docstring", code_field="code")
    print(f"{model_name1} の評価指標 (google/code_x_glue_tc_nl_code_search_adv):")
    for name, value in metrics_tc1.items():
        if isinstance(value, dict):
            for k, v in value.items():
                print(f"  {name}@{k}: {v:.4f}")
        else:
            print(f"  {name}: {value:.4f}")

    # 例として、Microsoft CodeBERT でも評価する場合
    model_name2 = "microsoft/codebert-base-mlm"
    print(f"\n{model_name2} を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...")
    tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
    model2 = AutoModelForMaskedLM.from_pretrained(model_name2)
    model2.to(device)
    metrics_tc2 = evaluate_code_search(model2, tokenizer2, subset_tc, device, max_examples=10000, pool_size=100,
                                       query_field="docstring", code_field="code")
    print(f"{model_name2} の評価指標 (google/code_x_glue_tc_nl_code_search_adv):")
    for name, value in metrics_tc2.items():
        if isinstance(value, dict):
            for k, v in value.items():
                print(f"  {name}@{k}: {v:.4f}")
        else:
            print(f"  {name}: {value:.4f}")


使用デバイス: cuda

google/code_x_glue_tc_nl_code_search_adv データセット (Test) をロードします...

Shuu12121/CodeMorph-BERT を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算中...
コード埋め込み計算完了.
候補コードプールを作成し、類似度計算中...
類似度計算完了.
Shuu12121/CodeMorph-BERT の評価指標 (google/code_x_glue_tc_nl_code_search_adv):
  mrr: 0.7788
  recall@k@1: 0.7159
  recall@k@5: 0.8509
  recall@k@10: 0.8990
  recall@k@50: 0.9847
  recall@k@100: 1.0000
  precision@k@1: 0.7159
  precision@k@5: 0.7676
  precision@k@10: 0.7741
  precision@k@50: 0.7786
  precision@k@100: 0.7788
  ndcg@k@1: 0.7159
  ndcg@k@5: 0.7885
  ndcg@k@10: 0.8040
  ndcg@k@50: 0.8236
  ndcg@k@100: 0.8262
  map: 0.7788
  f1@k@1: 0.7159
  f1@k@5: 0.7892
  f1@k@10: 0.8005
  f1@k@50: 0.8090
  f1@k@100: 0.8095
  r_precision: 0.7159
  success_rate@k@1: 0.7159
  success_rate@k@5: 0.8509
  success_rate@k@10: 0.8990
  success_rate@k@50: 0.9847
  success_rate@k@100: 1.0000
  query_coverage@k@1: 0.7159
  query_coverage@k@5: 0.8509
  query_cover

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


データセット全体のコード埋め込みを計算中...
コード埋め込み計算完了.
候補コードプールを作成し、類似度計算中...
類似度計算完了.
microsoft/codebert-base-mlm の評価指標 (google/code_x_glue_tc_nl_code_search_adv):
  mrr: 0.7703
  recall@k@1: 0.7205
  recall@k@5: 0.8193
  recall@k@10: 0.8781
  recall@k@50: 0.9805
  recall@k@100: 1.0000
  precision@k@1: 0.7205
  precision@k@5: 0.7565
  precision@k@10: 0.7644
  precision@k@50: 0.7700
  precision@k@100: 0.7703
  ndcg@k@1: 0.7205
  ndcg@k@5: 0.7721
  ndcg@k@10: 0.7912
  ndcg@k@50: 0.8149
  ndcg@k@100: 0.8180
  map: 0.7703
  f1@k@1: 0.7205
  f1@k@5: 0.7722
  f1@k@10: 0.7861
  f1@k@50: 0.7965
  f1@k@100: 0.7971
  r_precision: 0.7205
  success_rate@k@1: 0.7205
  success_rate@k@5: 0.8193
  success_rate@k@10: 0.8781
  success_rate@k@50: 0.9805
  success_rate@k@100: 1.0000
  query_coverage@k@1: 0.7205
  query_coverage@k@5: 0.8193
  query_coverage@k@10: 0.8781
  query_coverage@k@50: 0.9805
  query_coverage@k@100: 1.0000
