In [1]:
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader

from utils import load_test_data, visualize_test_region, generate_training_samples
from dataset import STDataset
from baseline import RandomRegionBaseline, TissueSpecificRandomRegionBaseline
from evaluate import Evaluator

In [2]:
# Build the evaluation set with the project helper. The evaluation loop further
# down reads `.adata`, `.test_area` and `.ground_truth` from each returned item.
# NOTE(review): `num_holes=10` presumably controls how many held-out regions are
# cut out of the data -- confirm the exact meaning in utils.load_test_data.
all_test_items = load_test_data(num_holes=10)

Seeding all randomness with seed=2024
Donor_id: MsBrainAgingSpatialDonor_1
Slice_id: 0
Donor_id: MsBrainAgingSpatialDonor_2
Slice_id: 0
Slice_id: 1
Donor_id: MsBrainAgingSpatialDonor_3
Slice_id: 0
Slice_id: 1
Donor_id: MsBrainAgingSpatialDonor_4
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_5
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_6
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_7
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_8
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_9
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_10
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_11
Slice_id: 0
Slice_id: 1
Slice_id: 2
Donor_id: MsBrainAgingSpatialDonor_12
Slice_id: 0
Slice_id: 1


In [4]:
# Per-baseline score accumulators: metrics[baseline name][metric name] is a
# list that receives one value per evaluated test area. Every list is a fresh,
# independent object.
metrics = {
    method: {
        metric: []
        for metric in ('mse', 'f1', 'cosine_sim', 'chamfer_dist', 'emd')
    }
    for method in ('RandomRegionBaseline', 'TissueSpecificRandomRegionBaseline')
}

In [5]:
# Baselines under evaluation, keyed by the name their scores are stored under
# in `metrics`. Evaluating another baseline only needs a new entry here.
baseline_classes = {
    'RandomRegionBaseline': RandomRegionBaseline,
    'TissueSpecificRandomRegionBaseline': TissueSpecificRandomRegionBaseline,
}
metric_names = ('mse', 'f1', 'cosine_sim', 'chamfer_dist', 'emd')

# Start every baseline from empty score lists so that re-running this cell
# replaces its results instead of appending a second copy of each score.
for method in baseline_classes:
    metrics[method] = {metric: [] for metric in metric_names}

for i, test_item in enumerate(all_test_items):
    print(f"Test Area {i+1}:")
    print(f"  Dominant Tissue: {test_item.test_area.dominant_tissue}")
    print(f"  Number of cells in ground truth: {len(test_item.ground_truth.hole_cells)}")

    # Fill the held-out region with every baseline before scoring any of them;
    # this keeps the fill_region() calls in their original order, ahead of all
    # Evaluator calls.
    predictions = {
        method: baseline_cls(test_item.adata, test_item.test_area).fill_region()
        for method, baseline_cls in baseline_classes.items()
    }

    # Ground truth shared by all baselines for this test area.
    true_coords = test_item.ground_truth.hole_cells[['center_x', 'center_y']].values
    true_gene_expressions = test_item.ground_truth.gene_expression

    # Score each prediction: three expression metrics plus two distances that
    # are computed from the coordinates alone.
    scores_by_method = {}
    for method, (pred_coords, pred_gene_expressions) in predictions.items():
        mse, f1, cosine_sim = Evaluator.evaluate_expression(
            true_coords, true_gene_expressions, pred_coords, pred_gene_expressions
        )
        scores_by_method[method] = {
            'mse': mse,
            'f1': f1,
            'cosine_sim': cosine_sim,
            'chamfer_dist': Evaluator.chamfer_distance(true_coords, pred_coords),
            'emd': Evaluator.calculate_emd(true_coords, pred_coords),
        }

    # Record the scores only after every baseline has been scored, so all
    # per-metric lists keep the same length even if an evaluation raises.
    for method, scores in scores_by_method.items():
        for metric, value in scores.items():
            metrics[method][metric].append(value)

Test Area 1:
  Dominant Tissue: striatum
  Number of cells in ground truth: 50
Test Area 2:
  Dominant Tissue: cortical layer VI
  Number of cells in ground truth: 50
Test Area 3:
  Dominant Tissue: cortical layer VI
  Number of cells in ground truth: 50
Test Area 4:
  Dominant Tissue: cortical layer VI
  Number of cells in ground truth: 50
Test Area 5:
  Dominant Tissue: cortical layer VI
  Number of cells in ground truth: 50
Test Area 6:
  Dominant Tissue: striatum
  Number of cells in ground truth: 50
Test Area 7:
  Dominant Tissue: striatum
  Number of cells in ground truth: 50
Test Area 8:
  Dominant Tissue: striatum
  Number of cells in ground truth: 50
Test Area 9:
  Dominant Tissue: cortical layer VI
  Number of cells in ground truth: 50
Test Area 10:
  Dominant Tissue: cortical layer VI
  Number of cells in ground truth: 50
Test Area 11:
  Dominant Tissue: striatum
  Number of cells in ground truth: 50
Test Area 12:
  Dominant Tissue: corpus callosum
  Number of cells in groun

In [6]:
# Report, for every method, the mean and standard deviation of each metric
# over all evaluated test areas (np.std with its default arguments).
for method, per_metric in metrics.items():
    print(f"Results for {method}:")
    for metric, values in per_metric.items():
        mean_value, std_value = np.mean(values), np.std(values)
        print(f"  {metric.capitalize()}: Mean = {mean_value:.4f}, Std = {std_value:.4f}")

Results for RandomRegionBaseline:
  Mse: Mean = 1.9030, Std = 0.2837
  F1: Mean = 0.8143, Std = 0.0359
  Cosine_sim: Mean = 0.0329, Std = 0.0402
  Chamfer_dist: Mean = 26.7087, Std = 3.0385
  Emd: Mean = 21.8310, Std = 3.6684
Results for TissueSpecificRandomRegionBaseline:
  Mse: Mean = 1.8188, Std = 0.3385
  F1: Mean = 0.8236, Std = 0.0366
  Cosine_sim: Mean = 0.0725, Std = 0.0544
  Chamfer_dist: Mean = 27.0084, Std = 3.2562
  Emd: Mean = 22.2538, Std = 4.1351


In [2]:
# Generate training samples with the project helper; they are wrapped in an
# STDataset in a later cell.
# NOTE(review): the recorded run shows len(training_samples) == 3 even though
# num_samples_per_slice=3 -- confirm that drawing from a single slice (or
# returning only 3 samples in total) is intended.
training_samples = generate_training_samples(num_samples_per_slice=3)

Seeding all randomness with seed=2024


In [6]:
# Sanity check: display how many training samples were generated.
len(training_samples)

3

In [3]:
# Wrap the generated samples in the project's STDataset so that a DataLoader
# can batch them; the batches consumed further down are indexed by
# 'positions', 'expressions' and 'metadata'.
dataset = STDataset(training_samples)

In [4]:
# Batch the dataset: up to 10 samples per batch, reshuffled on every pass and
# loaded by 4 worker processes.
dataloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=4)

# Walk through the loader once to show what a batch looks like.
for batch in dataloader:
    # Positions and expressions are the model inputs; metadata is pulled out
    # only for optional logging, tracking, or conditioning.
    positions, expressions, metadata = (
        batch[key] for key in ('positions', 'expressions', 'metadata')
    )
    print(positions.shape, expressions.shape)

torch.Size([3, 50, 2]) torch.Size([3, 50, 374])
