---
# PCA Visualization
---
Loads saved embeddings and plots a 2D PCA colored by category.

In [1]:
# Load npz
import numpy as np
npz = np.load('hepex-embeddings.npz', allow_pickle=True)
embeddings = npz['embeddings']
papers = npz['papers'].tolist()
success = npz['success']
len(papers), embeddings.shape

(36294, (36294, 768))

In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import torch

embs_np = embeddings.detach().cpu().numpy() if isinstance(embeddings, torch.Tensor) else np.array(embeddings)
if embs_np.ndim != 2:
    raise ValueError(f'embeddings must be 2D, got {embs_np.shape}')

pca = PCA(n_components=2)
emb_2d = pca.fit_transform(embs_np)

cats = [p.get('categories', ['unknown']) for p in papers]
unique_cats = sorted(set(cats))
cmap = plt.get_cmap('tab10')
color_map = {cat: cmap(i) for i, cat in enumerate(unique_cats)}

plt.figure(figsize=(12, 9))
for cat in unique_cats:
    idxs = [i for i, c in enumerate(cats) if c == cat]
    if not idxs: continue
    pts = emb_2d[idxs]
    plt.scatter(pts[:, 0], pts[:, 1], s=60, alpha=0.9, label=cat, color=color_map[cat])
plt.xlabel('PC 1'); plt.ylabel('PC 2')
plt.title('Papers embeddings (PCA 2D) by arXiv category')
plt.legend(title='Category')
plt.grid(True); plt.tight_layout(); plt.show()

TypeError: unhashable type: 'list'