In [22]:
import json
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy.signal import find_peaks
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive

# ✅ Mount Google Drive at /content/drive (Colab only)
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:

# ✅ Parameters: the sample index range selects which input file is read
# and how the output PDF is named.
start_index, end_index = 700, 731
range_tag = "{}-{}".format(start_index, end_index)

# All inputs and outputs live under this Google Drive project folder.
BASE_PATH = "/content/drive/MyDrive/Cluster-proj"
LOGITS_PATH = f"{BASE_PATH}/output/llm_steps/whole_logits/deepseek7b-gsm-{range_tag}-hidden.json"
PDF_OUTPUT_PATH = f"{BASE_PATH}/output/visualization/seman_turning_{range_tag}.pdf"

In [31]:
# Load the per-sample records. The file is decoded as UTF-8 explicitly so that
# non-ASCII token strings are read the same way whatever the locale default is.
with open(LOGITS_PATH, "r", encoding="utf-8") as f:
    logits_data = json.load(f)

# ✅ Create the output directory if it does not exist yet
os.makedirs(os.path.dirname(PDF_OUTPUT_PATH), exist_ok=True)

In [32]:
# ✅ Cosine similarity between adjacent tokens
def compute_token_similarities(token_level):
    """Compute the cosine similarity of every pair of consecutive hidden vectors.

    Entries of ``token_level`` that have no ``"hidden_vector"`` key are skipped;
    adjacency is defined over the entries that remain.

    Args:
        token_level: iterable of dicts. Entries that are used carry a
            ``"hidden_vector"`` (numeric sequence; every vector is flattened and
            all must have the same length) and a ``"token"``.

    Returns:
        tuple ``(similarities, tokens)`` where ``similarities[i]`` is the cosine
        similarity (a Python float) between kept vectors ``i`` and ``i + 1`` and
        ``tokens[i]`` is the token of kept entry ``i``. Both lists have the same
        length; they are empty when fewer than two vectors are available.

    Raises:
        KeyError: an entry has ``"hidden_vector"`` but no ``"token"``.
        ValueError: vectors differ in length or contain NaN/inf.
    """
    kept = [entry for entry in token_level if "hidden_vector" in entry]
    vectors = [entry["hidden_vector"] for entry in kept]
    tokens = [entry["token"] for entry in kept]

    # Nothing to compare: same result the pairwise loop would have produced.
    if len(vectors) < 2:
        return [], tokens[:-1]

    # One (n_tokens, dim) matrix instead of one sklearn call per pair.
    # dtype=float makes ragged input fail loudly instead of building an object array.
    matrix = np.asarray(vectors, dtype=float).reshape(len(vectors), -1)
    if not np.isfinite(matrix).all():
        # sklearn's input validation rejected NaN/inf as well; keep failing loudly.
        raise ValueError("hidden vectors contain NaN or infinity")

    norms = np.linalg.norm(matrix, axis=1)
    # A zero vector has no direction; dividing by 1 leaves it at zero so its
    # similarity to any neighbour is 0.0 (matches sklearn's cosine_similarity).
    norms[norms == 0.0] = 1.0
    unit = matrix / norms[:, np.newaxis]

    # Row-wise dot product of each unit vector with its successor.
    similarities = np.sum(unit[:-1] * unit[1:], axis=1).tolist()
    return similarities, tokens[:-1]  # tokens[i] labels similarities[i]

# ✅ Use find_peaks to locate "semantic valleys" as break points
# (the lower the similarity, the more likely it is a split point)
def find_semantic_boundaries(similarities, distance=2, prominence=0.05, height=None):
    """Return the indices of local minima in ``similarities``.

    The series is negated so that valleys become peaks, then handed to
    ``scipy.signal.find_peaks``.

    Args:
        similarities: 1-D sequence of similarity values.
        distance: forwarded to ``find_peaks`` as ``distance``.
        prominence: forwarded to ``find_peaks`` as ``prominence``.
        height: forwarded to ``find_peaks`` as ``height``; note that it is
            applied to the negated series, not to the raw similarities.

    Returns:
        list of int positions into ``similarities``.
    """
    flipped = np.negative(np.array(similarities))
    valley_indices = find_peaks(
        flipped,
        height=height,
        distance=distance,
        prominence=prominence,
    )[0]
    return valley_indices.tolist()

# ✅ Plotting helper
def plot_similarity(ax, similarities, tokens, boundaries, title):
    """Draw the adjacent-token similarity curve and mark boundaries on ``ax``.

    Args:
        ax: matplotlib Axes (or compatible object) to draw on.
        similarities: sequence of similarity values, plotted against 0..n-1.
        tokens: tick labels for the x axis, one per position.
        boundaries: x positions at which a dashed red vertical line is drawn.
        title: axes title.

    Returns:
        None. Everything is drawn onto ``ax``.
    """
    ax.plot(range(len(similarities)), similarities, marker='o', label="Cosine Similarity", alpha=0.8)
    ax.set_xticks(range(len(tokens)))
    ax.set_xticklabels(tokens, rotation=90, fontsize=6)

    # Cosine similarity can be negative. A fixed lower limit of 0 would clip
    # exactly the deepest valleys, so extend the axis only when the data need it.
    valid = [float(s) for s in similarities if s == s]  # s == s drops NaN
    lowest = min(valid, default=0.0)
    y_min = lowest - 0.05 if lowest < 0.0 else 0.0
    ax.set_ylim(y_min, 1.05)

    ax.set_ylabel("Cosine Similarity")
    ax.set_title(title, fontsize=10)
    ax.grid(True)
    for b in boundaries:
        ax.axvline(x=b, color="red", linestyle="--", alpha=0.5)
    ax.legend()


In [33]:

# ✅ Main logic: one PDF page per (question id, sampling run)
with PdfPages(PDF_OUTPUT_PATH) as pdf:
    for qid, sample in logits_data.items():
        for sid in ["sampling0", "sampling1", "sampling2"]:
            # Not every sample has every sampling run or token-level data.
            if sid not in sample or "token_probs" not in sample[sid]:
                continue

            token_level = sample[sid]["token_probs"]

            try:
                similarities, tokens = compute_token_similarities(token_level)
                if len(similarities) < 2:
                    continue  # skip generations that are too short
                boundaries = find_semantic_boundaries(
                    similarities,
                    distance=5,
                    prominence=0.15
                )

                # Draw and save the page for this run.
                fig, ax = plt.subplots(figsize=(16, 4))
                try:
                    title = f"{qid} - {sid}"
                    plot_similarity(ax, similarities, tokens, boundaries, title)
                    pdf.savefig(fig)
                finally:
                    # Always release the figure, even when plotting or saving
                    # fails; otherwise failed pages pile up in memory.
                    plt.close(fig)
            except Exception as e:
                # Best effort: report the failing run and move on to the next one.
                print(f"[ERROR] Skipped {qid}-{sid}: {e}")
                continue

print(f"✅ All similarity plots saved to: {PDF_OUTPUT_PATH}")

✅ All similarity plots saved to: /content/drive/MyDrive/Cluster-proj/output/visualization/seman_turning_700-731.pdf
