## 目录切换

In [None]:
import os
import sys
# Work out the directory one level above the current working directory.
# NOTE: despite its name, `current_dir` holds the *parent* of the cwd, because
# os.path.dirname() strips the last component of the absolute cwd path.
_working_dir = os.path.abspath('.')
current_dir = os.path.dirname(_working_dir)
# Switch into that directory and make it importable. The result depends on
# the working directory at run time, so re-running this cell moves up one
# more level each time.
os.chdir(current_dir)
sys.path.append(current_dir)

## 文件加载

In [None]:
# Result files to load in the next cell (relative paths, resolved against the
# working directory at the time they are opened).
target_files = ["../log/064851_only_acc.json"]

In [None]:
import json

# Parse every listed JSON file; `datas` ends up with one parsed object per
# file, in the same order as `target_files`.
datas = []
for result_path in target_files:
    with open(result_path, "r", encoding="utf-8") as handle:
        datas.append(json.load(handle))


## 绘图分析

In [None]:
# 新增：提取指标并绘图
import numpy as np  # 新增
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Method suffixes recognised at the end of a result key. The list order is
# also used as the categorical (sorting / plotting) order of the methods.
methods = ["entity_agnostic", "manual_erase", "hanlp_ner", "original_query"]
# Metrics copied from each result entry into the table.
metric_keys = ["total_acc", "dsl_acc", "dataset_acc",
               "dimension_acc", "measure_acc", "filter_acc"]

records = []

for d in datas:
    for full_key, value in d.items():
        # Example key: trading__0.95_4_entity_agnostic
        # The method is whichever known suffix the key ends with.
        method = next((m for m in methods if full_key.endswith(m)), None)
        # Skip entries of unknown structure: no known method suffix, or a
        # value that is not a metrics mapping (it would crash on .get()).
        if method is None or not isinstance(value, dict):
            continue

        # Business line: the text before the double underscore; fall back to
        # the text before the first single underscore when there is none.
        separator = "__" if "__" in full_key else "_"
        biz = full_key.split(separator)[0]

        rec = {"business": biz, "method": method}
        # A metric missing from the entry is stored as None.
        rec.update((mk, value.get(mk)) for mk in metric_keys)
        records.append(rec)

# Passing the columns explicitly keeps the frame well-formed even when no
# record matched; otherwise the column lookups below raise KeyError.
df = pd.DataFrame(records, columns=["business", "method", *metric_keys])
# Force numeric metric columns so that missing values become NaN instead of
# leaving an object-dtype column full of None.
df[metric_keys] = df[metric_keys].apply(pd.to_numeric, errors="coerce")

# Ordering: methods follow the predefined list, business lines follow their
# order of first appearance.
df["method"] = pd.Categorical(df["method"], categories=methods, ordered=True)
biz_order = list(dict.fromkeys(df["business"]))  # de-duplicate, keep order
df["business"] = pd.Categorical(
    df["business"], categories=biz_order, ordered=True)

display(df)

# Overall accuracy of each method, grouped by business line.
# The table built above has no "acc" column; the overall accuracy is stored
# under "total_acc", so that is the column plotted here.
plt.figure(figsize=(6, 4))
sns.barplot(data=df, x="business", y="total_acc", hue="method")
plt.title("Overall acc by business & method")
plt.ylabel("acc")
plt.tight_layout()
plt.show()

# Reshape to long format: one row per (business, method, metric) value.
melt_df = df.melt(id_vars=["business", "method"], value_vars=metric_keys,
                  var_name="metric", value_name="value")

plt.figure(figsize=(10, 5))
ax = sns.barplot(data=melt_df, x="metric", y="value", hue="method")
plt.title("Metrics (aggregated across business)")
# Label every bar with its height, rounded to two decimals.
for p in ax.patches:
    h = p.get_height()
    # Skip bars without a value. Also skip zero-width rectangles: with `hue`,
    # seaborn 0.13+ registers zero-sized legend proxy patches on the axes,
    # which would otherwise receive a stray "0.00" label at the origin.
    # NOTE(review): confirm against the installed seaborn version.
    if pd.isna(h) or p.get_width() == 0:
        continue
    ax.annotate(f"{h:.2f}",
                (p.get_x() + p.get_width() / 2, h),
                ha="center", va="bottom",
                fontsize=8,
                xytext=(0, 3),
                textcoords="offset points")
plt.tight_layout()
plt.show()


# Small multiples: one bar chart per (metric row, business column), with
# independent y axes per facet.
_facet_options = dict(
    x="method",
    y="value",
    col="business",
    row="metric",
    kind="bar",
    height=2.2,
    sharey=False,
)
g = sns.catplot(data=melt_df, **_facet_options)
g.set_titles("{row_name} | {col_name}")
for ax in g.axes.flatten():
    ax.tick_params(axis='x', rotation=45)
    # Write the value above each bar; bars whose height is NaN get no label.
    drawn_bars = [bar for bar in ax.patches if not np.isnan(bar.get_height())]
    for bar in drawn_bars:
        top = bar.get_height()
        center_x = bar.get_x() + bar.get_width()/2
        ax.annotate(f"{top:.2f}",
                    (center_x, top),
                    ha="center", va="bottom",
                    fontsize=7,
                    xytext=(0, 2),
                    textcoords="offset points")
plt.tight_layout()
plt.show()

# Wide overview table: one row per business line, one column per method.
# The frame has no "acc" column; overall accuracy is stored as "total_acc".
# pivot_table averages repeated (business, method) pairs, which appear when
# several files or parameter settings are loaded, whereas DataFrame.pivot
# raises on duplicates. The mean matches how the bar plots aggregate.
pivot = df.pivot_table(index="business", columns="method",
                       values="total_acc", aggfunc="mean", observed=False)
display(pivot)

In [None]:
import json
import pandas as pd

# Load the accuracy results file.
with open(r'../log/093653_only_acc (1).json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Datasets and methods that appear in the result keys.
datasets = ['commerce', 'community', 'trading']
# NOTE: `methods` is not read in this cell, but rebinding it changes the value
# seen by any other cell that uses the same name; it is kept for compatibility.
methods = ['original_query', 'hanlp_ner', 'manual_erase', 'entity_agnostic']
# Only these three methods are reported.
selected_methods = ['original_query', 'manual_erase', 'entity_agnostic']


def _accuracy_rows(results, metric, method_names, dataset_names):
    """Build the rows of one accuracy table.

    Args:
        results: Parsed JSON mapping of result key -> metrics mapping.
        metric: Name of the metric to report (e.g. 'total_acc').
        method_names: Methods to report, one table row each.
        dataset_names: Datasets to report, one table column each.

    Returns:
        A list of rows ``[method, cell, cell, ...]``. Each cell is the metric
        as a percentage with two decimals, or "N/A" when the result key, the
        metric, or a numeric value for it is missing. A missing metric is
        deliberately not shown as 0.00, which would read as a real score.
    """
    rows = []
    for method_name in method_names:
        cells = [method_name]
        for dataset_name in dataset_names:
            entry = results.get(f"{dataset_name}__0.95_4_2_{method_name}")
            score = entry.get(metric) if isinstance(entry, dict) else None
            if isinstance(score, (int, float)):
                cells.append(f"{score*100:.2f}")
            else:
                cells.append("N/A")
        rows.append(cells)
    return rows


# Table 1: dataset_acc and total_acc
print("=" * 80)
print("表1: Dataset Accuracy 和 Total Accuracy")
print("=" * 80)

# dataset_acc table
dataset_acc_data = _accuracy_rows(data, 'dataset_acc', selected_methods, datasets)
df_dataset_acc = pd.DataFrame(dataset_acc_data, columns=['Method'] + datasets)
print("\nDataset Accuracy:")
print(df_dataset_acc.to_string(index=False))

# total_acc table
total_acc_data = _accuracy_rows(data, 'total_acc', selected_methods, datasets)
df_total_acc = pd.DataFrame(total_acc_data, columns=['Method'] + datasets)
print("\nTotal Accuracy:")
print(df_total_acc.to_string(index=False))

# Table 2: dimension_acc, measure_acc, filter_acc
print("\n" + "=" * 80)
print("表2: Dimension, Measure 和 Filter Accuracy")
print("=" * 80)

metrics = ['dimension_acc', 'measure_acc', 'filter_acc']

for metric in metrics:
    metric_data = _accuracy_rows(data, metric, selected_methods, datasets)
    df_metric = pd.DataFrame(metric_data, columns=['Method'] + datasets)
    print(f"\n{metric.replace('_', ' ').title()}:")
    print(df_metric.to_string(index=False))

In [None]:
import json
import pandas as pd

# Load the accuracy results file.
with open(r'../log/093653_only_acc (1).json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Datasets, methods and metrics to report.
datasets = ['commerce', 'community', 'trading']
selected_methods = ['original_query', 'manual_erase', 'entity_agnostic']
metrics = ['dimension_acc', 'measure_acc', 'filter_acc']

# Print the formatted rows.
print("=" * 80)
print("每行9个值（dimension_acc, measure_acc, filter_acc × 3个数据集）")
print("=" * 80)
print()

# One output line per method: the three metrics of each dataset in turn,
# joined with " & " so the line can be pasted into a table row.
for method in selected_methods:
    values = []
    for dataset in datasets:
        # The result entry is the same for all metrics of this dataset, so it
        # is looked up once per dataset.
        entry = data.get(f"{dataset}__0.95_4_2_{method}")
        for metric in metrics:
            acc = entry.get(metric) if isinstance(entry, dict) else None
            # A missing key, missing metric or non-numeric value is shown as
            # "N/A" rather than as a misleading 0.00 (or a crash on null).
            if isinstance(acc, (int, float)):
                values.append(f"{acc*100:.2f}")
            else:
                values.append("N/A")

    print(' & '.join(values))