In [1]:
import pandas as pd

from torch.utils.data import Dataset, DataLoader

from utils import load_test_data, visualize_test_region, generate_training_samples
from dataset import STDataset
from baseline import RandomRegionBaseline
from evaluate import Evaluator


In [7]:
# Generate the training samples; the helper prints the seed it applies
# (see the cell output).
training_samples = generate_training_samples(
    num_samples_per_slice=10,
    minimum_cell=40,
)

Seeding all randomness with seed=2024


In [20]:
# Echo the generated samples. The recorded output shows a list of dicts whose
# first entry carries a 'normalized_positions' array.
# NOTE(review): this renders the entire list; consider showing a slice
# (e.g. training_samples[:1]) to keep the notebook output small.
training_samples

[{'normalized_positions': array([[0.67797913, 0.81402073],
         [0.77662413, 0.83990823],
         [0.89788663, 0.99468823],
         [0.83902663, 0.58076074],
         [0.65835913, 0.50555074],
         [0.85919163, 0.67395573],
         [0.65290914, 0.45323074],
         [0.78370913, 0.95108823],
         [0.99571413, 0.61727574],
         [0.91123913, 0.87124573],
         [0.68288413, 0.32959072],
         [0.72866413, 0.53770574],
         [0.95810913, 0.94018823],
         [0.93957913, 0.53825074],
         [0.84202413, 0.50010074],
         [0.69678163, 0.66605323],
         [0.65508913, 0.61618574],
         [0.89189163, 0.35758324],
         [0.75536913, 0.44641824],
         [0.97173413, 0.16990572],
         [0.98045413, 0.28762572],
         [0.75373413, 0.28571822],
         [0.82458413, 0.15273822],
         [0.90578913, 0.07943572],
         [0.65917663, 0.18407572],
         [0.62783914, 0.24593322],
         [0.91696163, 0.20533072],
         [0.51537161, 0.2312182

In [21]:
# Wrap the generated samples in the project's STDataset.
# NOTE(review): the recorded run of this cell raised NameError for STDataset,
# which suggests the import cell was not (re-)executed in the kernel that ran
# this cell. Run the import cell first, or Restart & Run All.
dataset = STDataset(training_samples)

NameError: name 'STDataset' is not defined

In [None]:
# Batch the dataset: shuffled batches of 10 samples, loaded with 4 workers
dataloader = DataLoader(
    dataset,
    batch_size=10,
    shuffle=True,
    num_workers=4,
)

# Walk the loader once to show what each batch contains
for batch in dataloader:
    positions, expressions, metadata = (
        batch['positions'],
        batch['expressions'],
        batch['metadata'],
    )

    # positions and expressions are the model-training inputs; metadata is
    # kept available for logging, tracking, or conditioning if needed
    print(positions.shape, expressions.shape)

In [None]:
# Load every test item through the project's utils.load_test_data helper.
# Each item is later read for its .test_area, .ground_truth and .adata attributes.
all_test_items = load_test_data()

In [None]:
# Summarise the first test item only: the loop exits after a single pass,
# which leaves `i` and `test_item` bound for the cells that follow.
for i, test_item in enumerate(all_test_items):
    area = test_item.test_area
    truth = test_item.ground_truth

    # Bounding box and dominant tissue of the test area
    print(f"Test Area {i+1}:")
    print(f"  Min X: {area.hole_min_x}")
    print(f"  Max X: {area.hole_max_x}")
    print(f"  Min Y: {area.hole_min_y}")
    print(f"  Max Y: {area.hole_max_y}")
    print(f"  Dominant Tissue: {area.dominant_tissue}")

    # Sizes of the ground truth versus the remaining AnnData
    print(f"  Number of cells in ground truth: {len(truth.hole_cells)}")
    print(f"  Number of cells in adata after masking: {test_item.adata.shape[0]}")
    print(f"  Gene expression shape: {truth.gene_expression.shape}")

    # Per-tissue share, formatted as a percentage with two decimals
    print("  Tissue Percentages in Ground Truth:")
    for tissue, percentage in truth.tissue_percentages.items():
        print(f"    {tissue}: {percentage:.2%}")

    break
        

In [None]:
# Plot the selected test item's observations together with its test area.
# Relies on `i` and `test_item` left over from the summary loop.
visualize_test_region(
    pd.DataFrame(test_item.adata.obs),
    test_item.test_area,
    title=f'Test Region {i+1}',
)

In [None]:
# Run the random-region baseline on the selected test item and collect the
# coordinates and gene expressions it produces for the test area
baseline = RandomRegionBaseline(
    test_item.adata,
    test_item.test_area,
)
pred_coords, pred_gene_expressions = baseline.fill_region()

In [None]:
# Evaluate the baseline's predictions against the ground truth of the
# selected test item (`test_item`, `pred_coords` and `pred_gene_expressions`
# come from the preceding cells).
true_coords = test_item.ground_truth.hole_cells[['center_x', 'center_y']].values
true_gene_expressions = test_item.ground_truth.gene_expression

# Expression metrics use coordinates and expressions; the two distance
# metrics below use coordinates only.
mse, f1, cosine_sim = Evaluator.evaluate_expression(
    true_coords, true_gene_expressions, pred_coords, pred_gene_expressions
)
chamfer_dist = Evaluator.chamfer_distance(true_coords, pred_coords)
emd = Evaluator.calculate_emd(true_coords, pred_coords)

# The header has nothing to interpolate, so it is a plain string, not an f-string
print("  Evaluation Metrics for Random Region Baseline:")
print(f"    MSE: {mse}")
print(f"    F1 Score: {f1}")
print(f"    Cosine Similarity: {cosine_sim}")
print(f"    Chamfer Distance: {chamfer_dist}")
print(f"    EMD: {emd}")

In [None]:
# Re-draw the test region, this time passing the baseline's generated coordinates
visualize_test_region(
    pd.DataFrame(test_item.adata.obs),
    test_item.test_area,
    title=f'Test Region {i+1}',
    new_coords=pred_coords,
)