In [None]:
import os
import sys
# Make the parent of the notebook's working directory the project root:
# put it on sys.path and switch the working directory to it, so the relative
# paths used by the following cells resolve against it.
#
# The root is resolved only once per kernel. Without this guard, every re-run
# of the cell would climb one more directory level (the working directory has
# already been moved) and append yet another entry to sys.path.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.dirname(os.path.abspath('.'))

# NOTE: despite its name, `current_dir` is the *parent* of the directory the
# kernel started in; the name is kept for compatibility.
current_dir = _PROJECT_ROOT
if current_dir not in sys.path:
    sys.path.append(current_dir)
os.chdir(current_dir)

In [None]:
import json
# Read the JSON records and split them into two parallel lists: the query
# text and the dataset id of every record, in the order they appear.
# Assumes the top-level JSON value is a list of objects that each carry
# "query" and "dataset_id" keys.
datafile = "./data/20250916/commerce_test_0.95_4.json"
with open(datafile, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())
query_list = list(map(lambda record: record["query"], data))
dataset_list = list(map(lambda record: record["dataset_id"], data))
# Cell output: the two lengths as a tuple.
(len(query_list), len(dataset_list))

In [None]:
# Map every original dataset id to a dense index 1..N, numbered by order of
# first appearance (dict.fromkeys de-duplicates while keeping insertion order).
unique_dataset_ids = [*dict.fromkeys(dataset_list)]
dataset_id2idx = {
    original_id: position
    for position, original_id in enumerate(unique_dataset_ids, start=1)
}
# Per-record dense index, parallel to dataset_list.
dataset_idx_list = list(map(dataset_id2idx.__getitem__, dataset_list))

In [None]:
import torch
from entity_agnostic_embedding_model.contrastive_learning import ContrastiveMARLOEmbeddingModel
# Build the embedding model on top of the Qwen3 base model and restore the
# trained weights from the checkpoint file.
CHECKPOINT_PATH = "./model/Qwen3-Embedding-0.6B/contrastive_aimicorrect_ex0916_0930_nerfarneg_pw1.0_nw1.0_ep10.pth"

model = ContrastiveMARLOEmbeddingModel(
    base_model="Qwen/Qwen3-Embedding-0.6B", device="cuda:1")

# map_location="cpu": without it torch.load restores every tensor onto the
# device it was saved from, which fails (or wastes memory on another GPU) when
# that device differs from the one used here. load_state_dict copies the
# values into the model's own parameters afterwards.
# weights_only=True: restrict unpickling to tensors and plain containers so a
# tampered checkpoint cannot execute arbitrary code on load.
# Assumes the file holds a plain state dict (it is passed straight to
# load_state_dict) — TODO confirm.
state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu", weights_only=True)
# Kept as the last expression so the cell displays the key-matching report.
model.load_state_dict(state_dict)

In [None]:
from template_construct.embed_tool import embedding_L2_normalization, get_embeddings_by_model,get_embeddings

# Embed the same queries through two paths and pass each result through
# embedding_L2_normalization:
#   embeddings1 <- get_embeddings(query_list)             (no model argument)
#   embeddings2 <- get_embeddings_by_model(query_list, model)
baseline_raw = get_embeddings(query_list)
embeddings1 = embedding_L2_normalization(baseline_raw)

custom_raw = get_embeddings_by_model(query_list, model)
embeddings2 = embedding_L2_normalization(custom_raw)

In [None]:
# 各自独立 PCA 并排可视化（坐标轴不可直接数值对比）
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Side-by-side 2D PCA of both embedding sets. Each panel has its own PCA fit,
# so coordinates are not numerically comparable between the two panels
# (which is why no axis labels are set).
E1, E2 = embeddings1, embeddings2

pca_options = dict(n_components=2, random_state=42)
pca1 = PCA(**pca_options)
pca2 = PCA(**pca_options)

E1_2d = pca1.fit_transform(E1)
E2_2d = pca2.fit_transform(E2)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6), constrained_layout=True)
left_ax, right_ax = axes

# Shared marker styling; points are coloured by the dense 1..N dataset index.
point_style = dict(c=dataset_idx_list, cmap='tab20', alpha=0.7, s=18)

# Left panel: embeddings1.
sc1 = left_ax.scatter(E1_2d[:, 0], E1_2d[:, 1], **point_style)
left_ax.set_title("Qwen3-Embedding-0.6B")
left_ax.grid(True, alpha=0.3)

# Right panel: embeddings2.
sc2 = right_ax.scatter(E2_2d[:, 0], E2_2d[:, 1], **point_style)
right_ax.set_title("Entity Agnostic Embedding Model")
right_ax.grid(True, alpha=0.3)

# One colorbar shared by both panels; ticks sit on the dense indices but are
# labelled with the original dataset ids.
cbar = fig.colorbar(sc2, ax=list(axes.flat), shrink=0.85, pad=0.02)
tick_positions = range(1, len(unique_dataset_ids) + 1)
cbar.set_ticks(tick_positions)
cbar.set_ticklabels(list(map(str, unique_dataset_ids)))
plt.show()

In [None]:
# Save the figure currently bound to `fig` as a PDF.
# NOTE(review): `fig` is rebound by the 3D cell further down, so run this cell
# right after the 2D cell or the file will contain the 3D figure.
output_path = "./outputs/pca_2d_compare.pdf"
# savefig does not create missing directories; make sure the target exists so
# a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path, format="pdf", bbox_inches="tight")

In [None]:
# 各自独立 PCA 3D 并排可视化
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Side-by-side 3D PCA of both embedding sets. Each panel has its own PCA fit,
# so coordinates are not numerically comparable between the two panels.
E1 = embeddings1
E2 = embeddings2

pca1 = PCA(n_components=3, random_state=42)
pca2 = PCA(n_components=3, random_state=42)

E1_3d = pca1.fit_transform(E1)
E2_3d = pca2.fit_transform(E2)

# constrained_layout (as in the 2D figure) replaces a trailing
# plt.tight_layout(): tight_layout warns about, and may misplace, a colorbar
# that is attached to several axes at once.
fig = plt.figure(figsize=(16, 6), constrained_layout=True)
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
ax2 = fig.add_subplot(1, 2, 2, projection='3d')

# Colour by the dense 1..N index, consistent with the 2D figure. Raw dataset
# ids are only usable for `c=` when they are numeric, and widely spaced ids
# would squeeze most points into a narrow part of the colour range.
sc1 = ax1.scatter(E1_3d[:, 0], E1_3d[:, 1], E1_3d[:, 2],
                  c=dataset_idx_list, cmap='tab20', alpha=0.75, s=18)
ax1.set_title("Embeddings1 独立PCA 3D")
ax1.set_xlabel("主成分1")
ax1.set_ylabel("主成分2")
ax1.set_zlabel("主成分3")

sc2 = ax2.scatter(E2_3d[:, 0], E2_3d[:, 1], E2_3d[:, 2],
                  c=dataset_idx_list, cmap='tab20', alpha=0.75, s=18)
ax2.set_title("Embeddings2 独立PCA 3D")
ax2.set_xlabel("主成分1")
ax2.set_ylabel("主成分2")
ax2.set_zlabel("主成分3")

# Same camera angle on both panels (tune elev/azim as needed).
for ax in (ax1, ax2):
    ax.view_init(elev=18, azim=35)

# One colorbar shared by both panels; ticks sit on the dense indices but are
# labelled with the original dataset ids.
cbar = fig.colorbar(sc2, ax=[ax1, ax2], shrink=0.75, pad=0.05)
cbar.set_label("数据集编号")
cbar.set_ticks(range(1, len(unique_dataset_ids)+1))
cbar.set_ticklabels([str(d) for d in unique_dataset_ids])

plt.show()