# Cellular Neighborhood Analysis Pipeline

This notebook runs two complementary analyses:

1. **Unified CN Detection** (`cn_unified_kmeans.py`): Performs unified cellular neighborhood detection across multiple tiles
2. **Group-based CN Analysis** (`cn_unified_kmeans_selected.py`): Analyzes pre-computed CN results by groups (adjacent_tissue, center, margin)

## Workflow:

### Part 1: Unified CN Detection
- Loads multiple tiles into a unified dataset
- Performs k-means clustering on all cells together
- Generates unified CN composition heatmap
- Generates individual spatial CN maps for each tile
- Saves processed h5ad files with CN annotations

### Part 2: Group-based Analysis
- Reads processed h5ad files from unified CN detection
- Loads tile categorization from JSON file
- Generates group-specific visualizations (heatmaps, frequency distributions)
- Highlights group tiles in per-tile frequency plots


## Step 1: Import Required Libraries


In [23]:
import json
import sys
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

# Add the current directory to path to import the modules
sys.path.insert(0, str(Path.cwd()))

from cn_unified_kmeans import UnifiedCellularNeighborhoodDetector
from cn_unified_kmeans_selected import GroupCNAnalyzer
from IPython.display import Image, display
import matplotlib.pyplot as plt

# Reached only if every import in this cell resolved without raising.
print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


## Step 2: Configure Parameters for Unified CN Detection

Set the paths and parameters for unified CN detection.


In [None]:
# ⚠️ UPDATE THESE PATHS AND PARAMETERS FOR YOUR ANALYSIS
# All tunable inputs for the unified CN detection step are defined in this cell
# and echoed at the end so the run configuration is recorded in the output.

# Directory containing h5ad tile files
TILES_DIRECTORY = "/mnt/j/HandE/results/SOW1885_n=201_AT2 40X/JN_TS_023/manual_2mm_17/selected_h5ad/adjacent_tissue"
# Alternative paths (uncomment to use):
# TILES_DIRECTORY = "/mnt/c/ProgramData/github_repo/image_analysis_scripts/neighborhood_composition/spatial_contexts/selected_h5ad_tiles/processed_h5ad"
# TILES_DIRECTORY = "/mnt/j/GDC-TCGA-LUAD/00a0b174-1eab-446a-ba8c-7c6e3acd7f0c/pred/h5ad"

# Output directory for results (relative, so it resolves against the working directory)
OUTPUT_DIR = "cn_unified_results"

# Analysis parameters
K_NEIGHBORS = 20  # Number of nearest neighbors for spatial graph
N_CLUSTERS = 7    # Number of cellular neighborhoods to detect
CELLTYPE_KEY = "cell_type"  # Column name for cell types
RANDOM_STATE = 220705  # Random seed for reproducibility

# Optional: Limit number of tiles for testing (set to None to process all)
MAX_TILES = None  # e.g., 5 for quick testing

# File pattern to match
FILE_PATTERN = "*.h5ad"

# Spatial coordinate offsetting
COORD_OFFSET = True  # Set to False to disable coordinate offsetting between tiles

print("Configuration for Unified CN Detection:")
print(f"  Tiles directory: {TILES_DIRECTORY}")
print(f"  File pattern: {FILE_PATTERN}")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  k (nearest neighbors): {K_NEIGHBORS}")
print(f"  n_clusters (CNs): {N_CLUSTERS}")
print(f"  Cell type key: {CELLTYPE_KEY}")
print(f"  Random state: {RANDOM_STATE}")
# Only None means "no limit"; test identity so a numeric cap of 0 is not reported as 'All'.
print(f"  Max tiles: {'All' if MAX_TILES is None else MAX_TILES}")
print(f"  Coordinate offset: {COORD_OFFSET}")


## Step 3: Run Unified CN Detection Pipeline


In [None]:
# Initialize the unified CN detector with the configured input and output locations
detector = UnifiedCellularNeighborhoodDetector(
    tiles_directory=TILES_DIRECTORY,
    output_dir=OUTPUT_DIR
)

# Discover tile files matching FILE_PATTERN, passing MAX_TILES through as the cap
tile_files = detector.discover_tiles(pattern=FILE_PATTERN, max_tiles=MAX_TILES)

# How many tile names to list before summarising the remainder
preview_limit = 5

if not tile_files:
    print("❌ No tiles found! Please check the TILES_DIRECTORY path.")
else:
    print(f"\n✅ Found {len(tile_files)} tile(s) to process")
    print("\nFirst few tiles:")
    for position, tile_path in enumerate(tile_files[:preview_limit], start=1):
        print(f"  {position}. {tile_path.name}")
    # Tiles beyond the preview are counted rather than listed.
    remaining = len(tile_files) - preview_limit
    if remaining > 0:
        print(f"  ... and {remaining} more")


In [None]:
# Run the complete unified CN detection pipeline (skipped when no tiles were discovered)
if tile_files:
    detector.run_full_pipeline(
        tile_files=tile_files,
        k=K_NEIGHBORS,
        n_clusters=N_CLUSTERS,
        celltype_key=CELLTYPE_KEY,
        random_state=RANDOM_STATE,
        coord_offset=COORD_OFFSET
    )

    print("\n✅ Unified CN detection pipeline completed successfully!")

    # Display the unified CN composition heatmap.
    # NOTE(review): this location is assumed from the output layout; the pipeline
    # call does not return it here — confirm against cn_unified_kmeans.
    heatmap_path = Path(OUTPUT_DIR) / 'unified_analysis' / 'unified_cn_composition_heatmap.png'
    if heatmap_path.exists():
        print("\n📊 Unified CN Composition Heatmap:")
        display(Image(str(heatmap_path)))
    else:
        # Report the miss instead of silently showing nothing.
        print(f"\n⚠️ Heatmap not found at: {heatmap_path}")
else:
    print("❌ Cannot run pipeline: No tiles found")


---

## Step 4: Configure Parameters for Group-based CN Analysis

Set the paths and parameters for group-based analysis of pre-computed CN results.


In [24]:
# ⚠️ UPDATE THESE PATHS FOR GROUP-BASED ANALYSIS
# The processed h5ad directory and the tile-categories JSON must come from the
# same unified CN run, so both are derived from a single run name below.

# Root folder holding the unified CN results and the group-analysis output
SPATIAL_CONTEXTS_DIR = "/mnt/c/ProgramData/github_repo/image_analysis_scripts/neighborhood_composition/spatial_contexts"

# Name of the unified CN run to analyze
CN_RUN_NAME = "2mm_all_17_clusters=7"
# Alternative runs (uncomment to use):
# CN_RUN_NAME = "3mm_all_9_clusters=7"
# CN_RUN_NAME = "4mm_all_5_clusters=7"

# Folder of the selected run inside the unified CN results
CN_RUN_DIR = f"{SPATIAL_CONTEXTS_DIR}/cn_unified_results/{CN_RUN_NAME}"

# Directory containing processed h5ad files with CN annotations
# This should be the 'processed_h5ad' directory from the unified CN detection output
PROCESSED_H5AD_DIR = f"{CN_RUN_DIR}/processed_h5ad"

# Path to tile categories JSON file
CATEGORIES_JSON = f"{CN_RUN_DIR}/tile_categories.json"

# Output directory for group-specific results
# Results will be saved in a subfolder named after tile size (e.g., 2mm, 3mm, 4mm)
GROUP_OUTPUT_DIR = f"{SPATIAL_CONTEXTS_DIR}/cn_unified_results_selected"

# Analysis parameters
GROUP_CN_KEY = "cn_celltype"  # Column name for CN labels
GROUP_CELLTYPE_KEY = "cell_type"  # Column name for cell types
GROUP_COLOR_PALETTE = "Set2"  # Color palette for visualizations

# Optional: Analyze specific group only (set to None to analyze all groups)
# Options: 'adjacent_tissue', 'center', 'margin', or None for all
ANALYZE_GROUP = None  # e.g., 'center' for center group only

print("Configuration for Group-based CN Analysis:")
print(f"  Processed h5ad directory: {PROCESSED_H5AD_DIR}")
print(f"  Categories JSON: {CATEGORIES_JSON}")
print(f"  Output directory: {GROUP_OUTPUT_DIR}")
print(f"  CN key: {GROUP_CN_KEY}")
print(f"  Cell type key: {GROUP_CELLTYPE_KEY}")
print(f"  Color palette: {GROUP_COLOR_PALETTE}")
print(f"  Analyze group: {ANALYZE_GROUP if ANALYZE_GROUP else 'All groups'}")


Configuration for Group-based CN Analysis:
  Processed h5ad directory: /mnt/c/ProgramData/github_repo/image_analysis_scripts/neighborhood_composition/spatial_contexts/cn_unified_results/2mm_all_17_clusters=7/processed_h5ad
  Categories JSON: /mnt/c/ProgramData/github_repo/image_analysis_scripts/neighborhood_composition/spatial_contexts/cn_unified_results/2mm_all_17_clusters=7/tile_categories.json
  Output directory: /mnt/c/ProgramData/github_repo/image_analysis_scripts/neighborhood_composition/spatial_contexts/cn_unified_results_selected
  CN key: cn_celltype
  Cell type key: cell_type
  Color palette: Set2
  Analyze group: All groups


## Step 5: Run Group-based CN Analysis


In [25]:
# Initialize the group CN analyzer
# NOTE(review): the saved output of this cell is a FileExistsError for the
# '<GROUP_OUTPUT_DIR>/2mm' folder. The failing call is not visible in this
# notebook; presumably a directory is created inside cn_unified_kmeans_selected
# without tolerating an existing path — confirm and fix there so re-runs work.
group_analyzer = GroupCNAnalyzer(
    processed_h5ad_dir=PROCESSED_H5AD_DIR,
    categories_json=CATEGORIES_JSON,
    output_dir=GROUP_OUTPUT_DIR
)

# Keyword arguments shared by the single-group and all-groups entry points
analysis_options = {
    "cn_key": GROUP_CN_KEY,
    "celltype_key": GROUP_CELLTYPE_KEY,
    "color_palette": GROUP_COLOR_PALETTE,
}

# Run analysis
if ANALYZE_GROUP:
    # Analyze specific group only
    print(f"\nAnalyzing group: {ANALYZE_GROUP}")
    group_analyzer.analyze_group(ANALYZE_GROUP, **analysis_options)
else:
    # Analyze all groups
    group_analyzer.analyze_all_groups(**analysis_options)

print("\n✅ Group-based CN analysis completed successfully!")


FileExistsError: [Errno 17] File exists: '/mnt/c/ProgramData/github_repo/image_analysis_scripts/neighborhood_composition/spatial_contexts/cn_unified_results_selected/2mm'

## Step 6: View Results Summary

Display summary of generated files and view visualizations.


In [None]:
# Display output file locations.
# This cell only prints the paths where results are expected; apart from the
# categories JSON lookup at the end it does not check that the files exist.
print("\n📁 Output Files:")
print("=" * 80)

# Unified CN detection results
unified_dir = Path(OUTPUT_DIR)
unified_analysis_dir = unified_dir / 'unified_analysis'
print("\n1. Unified CN Detection Results:")
print(f"   - Directory: {unified_dir.absolute()}")
print(f"   - Heatmap: {unified_analysis_dir / 'unified_cn_composition_heatmap.png'}")
print(f"   - Frequency (overall): {unified_analysis_dir / 'neighborhood_frequency_overall.png'}")
print(f"   - Frequency (per tile): {unified_analysis_dir / 'neighborhood_frequency_per_tile.png'}")
print(f"   - Individual tile maps: {unified_dir / 'individual_tiles'}")
print(f"   - Processed h5ad files: {unified_dir / 'processed_h5ad'}")

# Group-based analysis results ("{group}" is a literal placeholder in the file names)
print("\n2. Group-based CN Analysis Results:")
print(f"   - Directory: {Path(GROUP_OUTPUT_DIR).absolute()}")
print("   - Tile size subfolder: Auto-created based on JSON metadata (e.g., 2mm, 3mm, 4mm)")
print("   - For each group, generates:")
print("     * cell_fraction_difference_{group}.png (heatmap showing difference from overall)")
print("     * cn_cell_fraction_{group}.csv (composition data)")
print("     * neighborhood_frequency_{group}.png")
print("     * neighborhood_frequency_per_tile_{group}.png (with highlighted tiles)")

print("\n" + "=" * 80)
print("✅ All analyses complete!")

# Display group analysis output directory if available
categories_path = Path(CATEGORIES_JSON)
if categories_path.exists():
    # Explicit encoding so the JSON is read the same way on every platform.
    with categories_path.open('r', encoding='utf-8') as f:
        categories = json.load(f)
    # Falls back to 2.0 when the metadata carries no tile size.
    tile_size = categories.get('metadata', {}).get('tile_size_mm2', 2.0)
    # NOTE(review): int() truncates, so e.g. 2.5 becomes "2mm"; presumably this
    # mirrors the subfolder naming used by GroupCNAnalyzer — verify.
    tile_size_folder = f"{int(tile_size)}mm"
    group_results_dir = Path(GROUP_OUTPUT_DIR) / tile_size_folder
    print(f"\nGroup analysis results saved to: {group_results_dir.absolute()}")
else:
    print(f"\nCategories JSON not found, cannot resolve tile size subfolder: {categories_path}")
