# Analyzing Embeddings from the Pipeline

This notebook demonstrates how to load the HDF5 embeddings and metadata, and perform basic exploratory analysis.

In [1]:
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Set style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams["figure.dpi"] = 100

## 1. Load Embeddings and Metadata

In [8]:
# Path to your HDF5 file
h5_path = "T:/users/altp/data/embeddings/thyroid_embeddings.h5"

print("Loading embeddings...")
with h5py.File(h5_path, "r") as f:
    print(f.keys())

Loading embeddings...
<KeysViewHDF5 []>


## 2. Data Summary

In [None]:
print("=" * 80)
print("Dataset Summary")
print("=" * 80)

# Per-sample counts
sample_counts = metadata["sample_code"].value_counts()
print(f"\nPatches per sample:")
print(sample_counts.head(10))
print(f"\nStats: min={sample_counts.min()}, max={sample_counts.max()}, mean={sample_counts.mean():.1f}")

# Tissue distribution
print(f"\nTissue content:")
print(f"  Mean: {metadata['tissue_pct'].mean():.1f}%")
print(f"  Median: {metadata['tissue_pct'].median():.1f}%")
print(f"  Min: {metadata['tissue_pct'].min():.1f}%")
print(f"  Max: {metadata['tissue_pct'].max():.1f}%")

# Invasion distribution
print(f"\nInvasion statistics:")
print(f"  Patches with invasion: {metadata['has_invasion'].sum():,} ({metadata['has_invasion'].mean()*100:.1f}%)")
print(f"\nInvasion types:")
print(metadata["invasion_type"].value_counts().sort_index())

## 3. Visualization: Metadata Distributions

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Tissue percentage histogram
axes[0, 0].hist(metadata["tissue_pct"], bins=50, edgecolor="black", alpha=0.7)
axes[0, 0].set_xlabel("Tissue Percentage")
axes[0, 0].set_ylabel("Count")
axes[0, 0].set_title("Distribution of Tissue Content")

# Invasion percentage histogram (only patches with invasion)
invasion_patches = metadata[metadata["has_invasion"]]
if len(invasion_patches) > 0:
    axes[0, 1].hist(invasion_patches["invasion_pct"], bins=50, edgecolor="black", alpha=0.7, color="orange")
    axes[0, 1].set_xlabel("Invasion Percentage")
    axes[0, 1].set_ylabel("Count")
    axes[0, 1].set_title("Distribution of Invasion Content")
else:
    axes[0, 1].text(0.5, 0.5, "No invasion patches", ha="center", va="center")
    axes[0, 1].set_title("Distribution of Invasion Content")

# Invasion type distribution
invasion_counts = metadata["invasion_type"].value_counts().sort_index()
axes[0, 2].bar(invasion_counts.index, invasion_counts.values, edgecolor="black", alpha=0.7, color="red")
axes[0, 2].set_xlabel("Invasion Type")
axes[0, 2].set_ylabel("Count")
axes[0, 2].set_title("Invasion Type Distribution")
axes[0, 2].set_xticks(invasion_counts.index)

# Spatial distribution (normalized coordinates)
axes[1, 0].scatter(
    metadata["coord_h_norm"],
    metadata["coord_w_norm"],
    c=metadata["tissue_pct"],
    cmap="viridis",
    alpha=0.3,
    s=1,
)
axes[1, 0].set_xlabel("Normalized Height")
axes[1, 0].set_ylabel("Normalized Width")
axes[1, 0].set_title("Spatial Distribution (colored by tissue %)")
axes[1, 0].set_aspect("equal")

# Depth distribution
axes[1, 1].hist(metadata["coord_d_norm"], bins=50, edgecolor="black", alpha=0.7, color="green")
axes[1, 1].set_xlabel("Normalized Depth")
axes[1, 1].set_ylabel("Count")
axes[1, 1].set_title("Depth Distribution")

# Patches per sample
top_samples = sample_counts.head(15)
axes[1, 2].barh(range(len(top_samples)), top_samples.values)
axes[1, 2].set_yticks(range(len(top_samples)))
axes[1, 2].set_yticklabels([s[:15] + "..." if len(s) > 15 else s for s in top_samples.index])
axes[1, 2].set_xlabel("Number of Patches")
axes[1, 2].set_title("Top 15 Samples by Patch Count")
axes[1, 2].invert_yaxis()

plt.tight_layout()
plt.savefig("embedding_metadata_distributions.png", dpi=150, bbox_inches="tight")
plt.show()

## 4. Dimensionality Reduction: PCA

In [None]:
print("=" * 80)
print("Computing PCA...")
print("=" * 80)

# Subsample for faster computation (optional)
n_samples_pca = min(10000, len(embeddings))
indices = np.random.choice(len(embeddings), n_samples_pca, replace=False)

embeddings_subset = embeddings[indices]
metadata_subset = metadata.iloc[indices].copy()

# Compute PCA
pca = PCA(n_components=50)
embeddings_pca = pca.fit_transform(embeddings_subset)

print(f"Explained variance (first 10 components): {pca.explained_variance_ratio_[:10].sum():.1%}")

In [None]:
# Visualize PCA
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# PC1 vs PC2, colored by tissue percentage
scatter1 = axes[0].scatter(
    embeddings_pca[:, 0],
    embeddings_pca[:, 1],
    c=metadata_subset["tissue_pct"],
    cmap="viridis",
    alpha=0.5,
    s=10,
)
axes[0].set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%})")
axes[0].set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%})")
axes[0].set_title("PCA: Colored by Tissue %")
plt.colorbar(scatter1, ax=axes[0], label="Tissue %")

# PC1 vs PC2, colored by invasion presence
scatter2 = axes[1].scatter(
    embeddings_pca[:, 0],
    embeddings_pca[:, 1],
    c=metadata_subset["has_invasion"].astype(int),
    cmap="RdYlGn_r",
    alpha=0.5,
    s=10,
)
axes[1].set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%})")
axes[1].set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%})")
axes[1].set_title("PCA: Colored by Invasion Presence")
plt.colorbar(scatter2, ax=axes[1], label="Has Invasion", ticks=[0, 1])

# PC1 vs PC2, colored by invasion type
scatter3 = axes[2].scatter(
    embeddings_pca[:, 0],
    embeddings_pca[:, 1],
    c=metadata_subset["invasion_type"],
    cmap="tab10",
    alpha=0.5,
    s=10,
)
axes[2].set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%})")
axes[2].set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%})")
axes[2].set_title("PCA: Colored by Invasion Type")
plt.colorbar(scatter3, ax=axes[2], label="Invasion Type")

plt.tight_layout()
plt.savefig("embedding_pca.png", dpi=150, bbox_inches="tight")
plt.show()

## 5. Sample-Level Analysis

In [None]:
print("=" * 80)
print("Sample-Level Analysis")
print("=" * 80)

# Aggregate by sample
sample_stats = metadata.groupby("sample_code").agg({
    "tissue_pct": ["mean", "std", "min", "max"],
    "has_invasion": "mean",  # Fraction of patches with invasion
    "invasion_type": "max",  # Maximum invasion type seen
    "invasion_pct": "mean",
}).round(2)

sample_stats.columns = ["_".join(col).strip() for col in sample_stats.columns.values]
sample_stats["num_patches"] = metadata.groupby("sample_code").size()

print("\nSample statistics:")
print(sample_stats.head(10))

# Save to CSV
sample_stats.to_csv("sample_level_statistics.csv")
print(f"\nSaved sample statistics to: sample_level_statistics.csv")

In [None]:
# Visualize sample-level invasion rates
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Invasion rate per sample
invasion_rate = sample_stats["has_invasion_mean"].sort_values(ascending=False)
axes[0].barh(range(min(20, len(invasion_rate))), invasion_rate.head(20).values)
axes[0].set_yticks(range(min(20, len(invasion_rate))))
axes[0].set_yticklabels([s[:20] + "..." if len(s) > 20 else s for s in invasion_rate.head(20).index])
axes[0].set_xlabel("Fraction of Patches with Invasion")
axes[0].set_title("Top 20 Samples by Invasion Rate")
axes[0].invert_yaxis()

# Scatter: tissue % vs invasion rate
axes[1].scatter(
    sample_stats["tissue_pct_mean"],
    sample_stats["has_invasion_mean"],
    s=sample_stats["num_patches"] / 10,
    alpha=0.6,
    edgecolors="black",
)
axes[1].set_xlabel("Mean Tissue %")
axes[1].set_ylabel("Fraction with Invasion")
axes[1].set_title("Tissue Content vs Invasion Rate\n(size = # patches)")

plt.tight_layout()
plt.savefig("sample_level_analysis.png", dpi=150, bbox_inches="tight")
plt.show()

## 6. Export Specific Subsets

In [None]:
# Example: Save embeddings for patches with high invasion
high_invasion_mask = metadata["invasion_pct"] > 50.0
high_invasion_embeddings = embeddings[high_invasion_mask]
high_invasion_metadata = metadata[high_invasion_mask]

print(f"\nHigh invasion patches (>50%): {len(high_invasion_embeddings):,}")

# Save as numpy arrays
np.save("high_invasion_embeddings.npy", high_invasion_embeddings)
high_invasion_metadata.to_csv("high_invasion_metadata.csv", index=False)

print("Saved high invasion subset to:")
print("  - high_invasion_embeddings.npy")
print("  - high_invasion_metadata.csv")

In [None]:
print("=" * 80)
print("Analysis complete!")
print("=" * 80)