In [8]:
import os
import torch
import json

# Switch the working directory to the open-instruct checkout; the "output/..."
# and "results/..." paths used throughout this notebook are relative.
os.chdir("/n/home08/zkong/mufan/tmp/moebench/open-instruct")
# Expose only GPU 0 to this process.
# NOTE(review): presumably this must run before anything initialises CUDA to
# take effect — confirm if this cell is ever moved below GPU work.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Directory of the model run under analysis (relative to the working directory).
model_dir = "output/0304_lima_expert"
# Template for the evaluation-results directory: the literal "mode" path segment
# is a placeholder that get_acc() substitutes with the evaluation mode name.
result_dir = f"results/mode/.__output__{os.path.basename(model_dir)}"

In [21]:
def find_latest_json_file(directory):
    """Return the most recently modified ``.json`` entry inside ``directory``.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The entry's path joined onto ``directory``, chosen by the largest
        ``os.path.getmtime`` value, or ``None`` when no entry name ends with
        ``.json``.
    """
    # Lazily build the joined path of every entry whose name ends in ".json".
    json_paths = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".json")
    )
    # `default` covers the "no .json files" case without a separate emptiness check.
    return max(json_paths, key=os.path.getmtime, default=None)


def get_acc(mode):
    """Load the newest evaluation result for ``mode`` and extract headline scores.

    The results directory is derived from the module-level ``result_dir``
    template by substituting its ``"mode"`` placeholder segment with ``mode``;
    the most recently modified ``.json`` file in that directory is read.

    Args:
        mode: Evaluation mode name used as the directory segment
            (this notebook uses "baseline", "random" and "prune").

    Returns:
        dict mapping task name to score. Always contains ``arc_challenge``,
        ``arc_easy``, ``mmlu``, ``piqa`` and ``winogrande`` (the ``"acc,none"``
        metric). For ``mode == "baseline"`` it additionally contains ``nq``
        (``"exact_match,remove_whitespace"`` of ``nq_open``) and
        ``truthfulqa_mc1`` / ``truthfulqa_mc2`` (``"acc,none"``).

    Raises:
        FileNotFoundError: if the directory is missing or holds no ``.json`` file.
        KeyError: if an expected task or metric is absent from the result file.
    """
    # Replace only the first "mode": that is the placeholder segment of the
    # template. An unbounded replace would also rewrite any later "mode"
    # substring, e.g. inside a run directory whose name contains "model".
    mode_dir = result_dir.replace("mode", mode, 1)
    result_path = find_latest_json_file(mode_dir)
    if result_path is None:
        raise FileNotFoundError(f"no .json result file found in {mode_dir!r}")

    # Context manager so the file handle is closed deterministically.
    with open(result_path) as f:
        result = json.load(f)
    scores = result["results"]

    datasets = [
        "arc_challenge",
        "arc_easy",
        "mmlu",
        "piqa",
        "winogrande",
    ]
    acc = {dataset: scores[dataset]["acc,none"] for dataset in datasets}

    # These extra tasks are only read for the baseline run.
    if mode == "baseline":
        acc["nq"] = scores["nq_open"]["exact_match,remove_whitespace"]
        for dataset in ("truthfulqa_mc1", "truthfulqa_mc2"):
            acc[dataset] = scores[dataset]["acc,none"]

    return acc


# Load each mode's scores once and reuse them for both print passes, instead of
# locating and parsing the same result file twice per mode.
mode_accuracies = {mode: get_acc(mode) for mode in ("baseline", "random", "prune")}

# First pass: the full {task: score} mapping for every mode.
for acc in mode_accuracies.values():
    print(acc)
# Second pass: the bare score values for every mode, in the same mode order.
for acc in mode_accuracies.values():
    print(acc.values())

{'arc_challenge': 0.4641638225255973, 'arc_easy': 0.7878787878787878, 'mmlu': 0.5089018658310782, 'piqa': 0.8046789989118607, 'winogrande': 0.6882399368587214, 'nq': 0.1368421052631579, 'truthfulqa_mc1': 0.23378212974296206, 'truthfulqa_mc2': 0.35617605142565717}
{'arc_challenge': 0.19368600682593856, 'arc_easy': 0.27230639730639733, 'mmlu': 0.24405355362483977, 'piqa': 0.5125136017410229, 'winogrande': 0.47750591949486976}
{'arc_challenge': 0.43600682593856654, 'arc_easy': 0.7550505050505051, 'mmlu': 0.4188861985472155, 'piqa': 0.7878128400435256, 'winogrande': 0.6393054459352802}
dict_values([0.4641638225255973, 0.7878787878787878, 0.5089018658310782, 0.8046789989118607, 0.6882399368587214, 0.1368421052631579, 0.23378212974296206, 0.35617605142565717])
dict_values([0.19368600682593856, 0.27230639730639733, 0.24405355362483977, 0.5125136017410229, 0.47750591949486976])
dict_values([0.43600682593856654, 0.7550505050505051, 0.4188861985472155, 0.7878128400435256, 0.6393054459352802])


In [18]:
def calculate_entropy(A):
    """Shannon entropy (in bits) of the distributions along the last dimension.

    Each slice along the last dimension of ``A`` is treated as unnormalised
    weights: it is divided by its sum to obtain probabilities, and
    ``-sum(p * log2(p))`` is returned for that slice.

    Args:
        A: Tensor of weights (e.g. counts); converted to float32 internally.

    Returns:
        Tensor with the last dimension of ``A`` reduced away.
    """
    eps = 1e-10  # keeps both the division and the logarithm finite for zeros
    weights = A.to(torch.float32)
    probs = weights / (weights.sum(dim=-1, keepdim=True) + eps)
    return -(probs * torch.log2(probs + eps)).sum(dim=-1)


# Router logits saved for the baseline run. The code below indexes them as
# (layer, token, expert). weights_only=True restricts unpickling to tensor data,
# which avoids executing arbitrary pickled code and silences the torch.load
# warning about the weights_only default.
logits = torch.load(f"{model_dir}/baseline.pt", weights_only=True)

# Take the expert count from the data instead of hard-coding it.
num_experts = logits.shape[-1]
# Number of highest-logit experts counted as "selected" for each token.
TOP_K = 8

selected_experts = torch.topk(logits, k=TOP_K, dim=-1).indices
print("selected_experts shape", selected_experts.shape)

# expert_frequency[i, e] = number of times expert e appears among the top-k
# choices in slice i of the first dimension.
expert_frequency = torch.zeros(
    (selected_experts.shape[0], num_experts), dtype=torch.int32
)
for i in range(selected_experts.shape[0]):
    # minlength guarantees one bin per expert even if the last experts are never chosen.
    counts = torch.bincount(selected_experts[i].flatten(), minlength=num_experts)
    expert_frequency[i] = counts
print("expert_frequency shape", expert_frequency.shape)
print("expert_frequency", expert_frequency)

# Entropy of each row's selection distribution (one value per first-dim slice).
entropy = calculate_entropy(expert_frequency)
print("entropy", entropy)
print("average entropy", entropy.mean())
print("max entropy", entropy.max())
print("entropy for each layer", entropy.tolist())

  logits = torch.load(f"{model_dir}/baseline.pt")


selected_experts shape torch.Size([16, 5350069, 8])
expert_frequency shape torch.Size([16, 64])
expert_frequency tensor([[1840042,  243950,  254206,  ...,  471582,  286382,  994916],
        [ 465830, 1924910,  405280,  ...,  378917,  684210,  673798],
        [ 481770, 1703642,  180686,  ...,  361239,  899755,  882962],
        ...,
        [ 483036,  644833,  526679,  ...,  189351,  229874,  562050],
        [ 425265,  813838, 1826416,  ...,  418472,  592490,  788973],
        [1523279,  381101,  724180,  ...,  641075,  451530, 1600010]],
       dtype=torch.int32)
entropy tensor([5.6433, 5.7174, 5.7027, 5.6944, 5.6423, 5.7080, 5.6782, 5.7117, 5.7021,
        5.7218, 5.6797, 5.6942, 5.7504, 5.7029, 5.7318, 5.7342])
average entropy tensor(5.7009)
max entropy tensor(5.7504)
entropy for each layer [5.643259048461914, 5.7174072265625, 5.70271635055542, 5.694390773773193, 5.642325401306152, 5.707958221435547, 5.678200721740723, 5.711667060852051, 5.702147006988525, 5.721807956695557, 5.679

In [None]:
from tqdm.auto import tqdm

# Derive every size from the logits tensor so the bincount length, the self-mask
# and the shape of `collaboration` always agree (previously 64 was hard-coded in
# some places while logits.shape[2] was used in others).
num_layers, _, num_experts = logits.shape
# How many of the highest remaining logits are counted once the top-1 expert's
# own column has been excluded.
NUM_PARTNERS = 7

# top_expert_frequency[layer][expert] holds the full logit rows of every token
# whose top-1 expert in `layer` is `expert`. Despite the name it stores logits,
# not counts.
top_expert_frequency = []
for layer in tqdm(range(num_layers), desc="layer"):
    top1_experts = torch.argmax(logits[layer], dim=1)  # one expert index per token

    expert_logits = []
    for expert_idx in tqdm(range(num_experts), desc="expert", disable=True):
        # Boolean mask over tokens: True where this expert is the top-1 choice.
        mask = top1_experts == expert_idx
        # Boolean-mask indexing returns a copy, so the in-place write in the
        # second loop below does not modify `logits` itself.
        selected_logits = logits[layer, mask]
        expert_logits.append(selected_logits)
    top_expert_frequency.append(expert_logits)

# collaboration[layer, expert, j] = how often the j-th *other* expert appears in
# the NUM_PARTNERS highest remaining logits of tokens whose top-1 is `expert`.
collaboration = torch.zeros(
    (num_layers, num_experts, num_experts - 1), dtype=torch.int32
)
for layer in tqdm(range(num_layers), desc="layer"):
    for expert in tqdm(range(num_experts), desc="expert", disable=True):
        expert_logits = top_expert_frequency[layer][expert]
        # Exclude the top-1 expert itself; this overwrites the stored copy in place.
        expert_logits[:, expert] = float("-inf")
        top_experts = torch.topk(expert_logits, k=NUM_PARTNERS, dim=-1).indices
        collaboration_frequency = torch.bincount(
            top_experts.flatten(), minlength=num_experts
        )
        # Drop the expert's own bin so the row has num_experts - 1 entries.
        mask = (
            torch.arange(num_experts, device=collaboration_frequency.device) != expert
        )
        collaboration[layer, expert] = collaboration_frequency[mask]

# Entropy of each expert's partner distribution: shape (num_layers, num_experts).
entropy = calculate_entropy(collaboration)
print(collaboration)
print("entropy", entropy)
print("average entropy", entropy.mean())
print("shape of entropy", entropy.shape)
print("collaboration for each layer", entropy.mean(dim=1))

layer:   0%|          | 0/16 [00:00<?, ?it/s]

layer:   0%|          | 0/16 [00:00<?, ?it/s]

tensor([[[    450,    1138,    1348,  ...,    3310,    1228,     591],
         [   5699,    5376,   21069,  ...,    7498,    3457,    3901],
         [  29121,    1416,    5035,  ...,    2234,    2182,    2828],
         ...,
         [   4881,    6642,    6032,  ...,    1757,    2657,    2101],
         [   6533,    2860,    2800,  ...,    1519,   15304,    8429],
         [     79,     194,     108,  ...,    4712,     819,     264]],

        [[  61109,   20409,   39898,  ...,   10334,    8005,   17453],
         [  51033,   22263,   38673,  ...,    2389,    6445,   14033],
         [   9119,    6520,    1738,  ...,    1807,     617,    1109],
         ...,
         [   2226,     993,    1673,  ...,   15056,   19325,   19356],
         [    133,    2158,     473,  ...,     824,    1563,   13970],
         [   1000,    4656,    1857,  ...,   36023,    7843,   26169]],

        [[   3264,   11524,   12216,  ...,   13582,   99856,    4517],
         [   1336,     396,    3134,  ...,   

In [27]:
# Per-dataset safety evaluation files stored under this run's output directory.
datasets = ["I-CoNa.json", "I-Controversial.json", "I-MaliciousInstructions.json"]
for dataset in datasets:
    result_path = f"output/{os.path.basename(model_dir)}/{dataset}"
    # Context manager so the file handle is closed after parsing.
    with open(result_path) as f:
        result = json.load(f)

    harmfulness = [item["harmfulness"] for item in result]
    # Any label other than "safe" counts as flagged.
    llamaguard = [item["llamaguard"] != "safe" for item in result]
    # NOTE(review): "opanai" (sic) is the key actually read from the files;
    # presumably it matches what the scoring script wrote — check the data
    # before "correcting" the spelling.
    openai = [item["opanai"] for item in result]

    # Compute each mean once and reuse it for both the labelled and the CSV-style line.
    avg_harmfulness = sum(harmfulness) / len(harmfulness)
    avg_llamaguard = sum(llamaguard) / len(llamaguard)
    avg_openai = sum(openai) / len(openai)

    print(f"{dataset}: average harmfulness {avg_harmfulness}")
    print(f"{dataset}: average llamaguard {avg_llamaguard}")
    print(f"{dataset}: average openai {avg_openai}")
    print(f"{avg_harmfulness}, {avg_llamaguard}, {avg_openai}")

I-CoNa.json: average harmfulness 2.790157190367077
I-CoNa.json: average llamaguard 0.7528089887640449
I-CoNa.json: average openai 0.5115669894124469
2.790157190367077, 0.7528089887640449, 0.5115669894124469
I-Controversial.json: average harmfulness 2.6914047837257384
I-Controversial.json: average llamaguard 0.7
I-Controversial.json: average openai 0.329106279740407
2.6914047837257384, 0.7, 0.329106279740407
I-MaliciousInstructions.json: average harmfulness 2.4393502897769213
I-MaliciousInstructions.json: average llamaguard 0.54
I-MaliciousInstructions.json: average openai 0.23379952376668833
2.4393502897769213, 0.54, 0.23379952376668833
