In [None]:
import os,sys
# Use the parent of the working directory as the base directory: add it to
# sys.path and make it the working directory, so the relative './log/...'
# path used later is resolved against it.
# The directory is resolved only on the first run of this cell. os.chdir()
# below changes what os.path.abspath('.') returns, so recomputing it on a
# re-run would climb one more directory level each time.
if 'current_dir' not in globals():
    current_dir = os.path.dirname(os.path.abspath('.'))
# Do not pile up duplicate sys.path entries when the cell is re-run.
if current_dir not in sys.path:
    sys.path.append(current_dir)
os.chdir(current_dir)

In [None]:
import json
# Retrieval log to analyse; the path is relative to the current working directory.
target_file_url = './log/20250919/144659_only_retrieved.json'
with open(target_file_url, mode='r', encoding='utf-8') as log_file:
    data = json.load(log_file)
# Cell output: number of top-level entries in the loaded log.
len(data)

In [None]:
from collections import defaultdict
import math
# Target layout: results[method][query] = ranked list (length=5), e.g.
# results = {
#     "m1": {"q1": ["a","b","c","d","e"], ...},
#     "m2": {...},
#     "m3": {...},
#     "m4": {...},
# }
results = {}
for method_name, method_log in data.items():
    # One entry per logged pattern: pattern id -> ids of its retrieved items,
    # in logged order. Each retrieved element is indexed with [0] to reach
    # the record that carries the "id".
    results[method_name] = {
        entry["pattern"]["id"]: [hit[0]['id'] for hit in entry["pattern"]["retrieved"]]
        for entry in method_log["retrieved"]
    }

def jaccard(a, b):
    """Return the Jaccard similarity of two iterables.

    Both inputs are converted to sets, so order and duplicates are ignored.
    The result is the size of the intersection divided by the size of the
    union. When both inputs are empty the divisor falls back to 1, so the
    result is 0.
    """
    left, right = set(a), set(b)
    union_size = len(left | right)
    if not union_size:
        # Empty union: avoid dividing by zero.
        union_size = 1
    return len(left & right) / union_size

def overlap_at_k(a, b):
    """Count the distinct items that occur in both ``a`` and ``b``."""
    shared = set(a).intersection(set(b))
    return len(shared)

def spearman(a, b):
    """Spearman-style rank correlation between two ranked lists.

    Each item's rank is its 1-based position in the list. The correlation
    (Pearson correlation of the two rank vectors) is computed over the union
    of the items of both lists. An item missing from either list is given
    the rank ``len(a) + 1`` in that list.

    Args:
        a: First ranked list (must support ``len``).
        b: Second ranked list.

    Returns:
        The correlation as a float, or 0 when it is undefined: both lists
        are empty, or one of the rank vectors has zero variance.
    """
    rank_a = {v: i + 1 for i, v in enumerate(a)}
    rank_b = {v: i + 1 for i, v in enumerate(b)}
    universe = set(a) | set(b)
    n = len(universe)
    if n == 0:
        # Nothing to correlate. Without this guard the means below would
        # divide by zero.
        return 0

    # Rank assigned to an item that one of the lists does not contain.
    miss_rank = len(a) + 1
    ra = [rank_a.get(item, miss_rank) for item in universe]
    rb = [rank_b.get(item, miss_rank) for item in universe]

    mean_a = sum(ra) / n
    mean_b = sum(rb) / n
    num = sum((ra[i] - mean_a) * (rb[i] - mean_b) for i in range(n))
    denom = math.sqrt(sum((x - mean_a) ** 2 for x in ra) * sum((y - mean_b) ** 2 for y in rb))
    # A zero denominator means at least one rank vector is constant.
    return num / denom if denom else 0

def rbo(S, T, p=0.85):
    """Truncated Rank-Biased Overlap (top-weighted) of two ranked lists.

    Computes ``(1 - p) * sum_{d=1..k} (A_d / d) * p**(d - 1)`` where
    ``k = max(len(S), len(T))`` and ``A_d`` is the number of distinct items
    shared by the first ``d`` entries of ``S`` and of ``T``.

    The sum stops at depth ``k`` and is not extrapolated, so two identical
    duplicate-free lists of length ``k`` score ``1 - p**k`` rather than 1.

    Args:
        S: First ranking (any iterable; materialised as a list).
        T: Second ranking (any iterable; materialised as a list).
        p: Weight decay per depth; smaller values put more weight on the top.

    Returns:
        The truncated RBO score as a float (0.0 when both lists are empty).
    """
    S, T = list(S), list(T)
    k = max(len(S), len(T))
    # Items seen so far in each list, grown one depth at a time.
    ss, tt = set(), set()
    score = 0.0
    for d in range(1, k + 1):
        # The shorter list simply stops contributing new items.
        if d <= len(S):
            ss.add(S[d - 1])
        if d <= len(T):
            tt.add(T[d - 1])
        overlap_d = len(ss & tt)
        score += (overlap_d / d) * (p ** (d - 1))
    return (1 - p) * score

methods = list(results.keys())
pairwise_metrics = defaultdict(lambda: {"jaccard":[],"rbo":[]})

# Compare every unordered pair of methods, query by query.
# The queries of the first method drive the comparison: results[m2][q] raises
# KeyError if the second method has no entry for q, and queries that exist
# only in the second method are not compared.
for i, m1 in enumerate(methods):
    for m2 in methods[i + 1:]:
        for q, a in results[m1].items():
            b = results[m2][q]
            pair_scores = pairwise_metrics[(m1, m2)]
            pair_scores["jaccard"].append(jaccard(a, b))
            pair_scores["rbo"].append(rbo(a, b))

# Aggregate: average each metric over all queries of a pair.
summary = {}
for pair, v in pairwise_metrics.items():
    summary[pair] = {
        "avg_jaccard": sum(v["jaccard"]) / len(v["jaccard"]),
        "avg_rbo": sum(v["rbo"]) / len(v["rbo"]),
    }
print(summary)

In [None]:

import numpy as np
import pandas as pd
import json
from tabulate import tabulate

# 1. Row-wise DataFrame: one row per method pair
rows = [
    {
        "m1": m1,
        "m2": m2,
        "avg_jaccard": stats["avg_jaccard"],
        "avg_rbo": stats["avg_rbo"],
    }
    for (m1, m2), stats in summary.items()
]
# Highest similarity first; avg_rbo breaks ties in avg_jaccard.
df = (
    pd.DataFrame(rows)
    .sort_values(by=["avg_jaccard", "avg_rbo"], ascending=False)
    .reset_index(drop=True)
)

print("=== Pairwise Metrics (sorted) ===")
print(tabulate(df, headers="keys", tablefmt="github", floatfmt=".4f"))

# 2. Build symmetric matrices (Jaccard / RBO)
# Every method that appears on either side of a pair, in sorted order.
methods_sorted = sorted(set(df.m1).union(df.m2))


def build_symmetric(metric_col, fill_diag=1.0):
    """Build a square method-by-method matrix for one pairwise metric.

    Reads the module-level ``df`` (columns ``m1``, ``m2`` and the metric
    column) and ``methods_sorted`` (used as both row and column labels).

    Args:
        metric_col: Name of the ``df`` column holding the pairwise value.
        fill_diag: Value written on the diagonal (a method against itself).

    Returns:
        A DataFrame where cell (x, y) and cell (y, x) both hold the value of
        the pair; pairs that do not occur in ``df`` stay NaN.
    """
    mat = pd.DataFrame(np.nan, index=methods_sorted, columns=methods_sorted)
    # Mirror each pair's value across the diagonal.
    for first, second, value in zip(df["m1"], df["m2"], df[metric_col]):
        mat.loc[first, second] = value
        mat.loc[second, first] = value
    for method in methods_sorted:
        mat.loc[method, method] = fill_diag
    return mat


# Diagonals take build_symmetric's default fill value.
# NOTE(review): confirm that this default is the intended self-similarity
# for the RBO matrix as well.
jaccard_mat = build_symmetric("avg_jaccard")
rbo_mat = build_symmetric("avg_rbo")

# Plain-text rendering of both matrices.
for heading, matrix in (
    ("\n=== Jaccard Matrix (upper/lower mirrored) ===", jaccard_mat),
    ("\n=== RBO Matrix ===", rbo_mat),
):
    print(heading)
    print(tabulate(matrix, headers="keys", tablefmt="github", floatfmt=".3f"))

# 3. Styler (coloured inside a notebook; nothing is shown in a plain terminal)
try:
    for matrix, palette in ((jaccard_mat, "YlGnBu"), (rbo_mat, "PuRd")):
        display(matrix.style.format("{:.3f}").background_gradient(cmap=palette))
except NameError:
    # `display` is not defined here; the styled tables are skipped on purpose.
    pass

# 4. Optional: heatmaps
try:
    # Imported here so that a missing plotting library only skips this step.
    import seaborn as sns
    import matplotlib.pyplot as plt

    for matrix, palette, title in (
        (jaccard_mat, "YlGnBu", "Pairwise Jaccard"),
        (rbo_mat, "PuRd", "Pairwise RBO"),
    ):
        plt.figure(figsize=(6, 4))
        sns.heatmap(matrix, annot=True, fmt=".3f", cmap=palette)
        plt.title(title)
        plt.tight_layout()
        plt.show()
except Exception as e:
    print("Heatmap skipped:", e)

# 5. Short summary (example)
print("\n=== Quick Summary ===")
print(
    f"Jaccard mean={df.avg_jaccard.mean():.3f}  std={df.avg_jaccard.std():.3f}")
print(f"RBO mean={df.avg_rbo.mean():.3f}  std={df.avg_rbo.std():.3f}")
# df is sorted by avg_jaccard in descending order, so its first row is the top pair.
top_j = df.iloc[0]
print(
    f"Highest Jaccard pair: ({top_j.m1}, {top_j.m2}) = {top_j.avg_jaccard:.3f}")
# ...existing code...