In [1]:
import scanpy as sc
import TOSICA
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import os
import shutil
import torch
import warnings

  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import get_distribution, DistributionNotFound


In [None]:
# Silence every warning so the cell output stays readable
warnings.filterwarnings(action='ignore')

# ==========================================
# 1. Configuration (UPDATE THESE)
# ==========================================
# --- Input / output ---
FILE_PATH = "./human_immune.h5ad"           # .h5ad file to load
LABEL_KEY = "final_annotation"              # .obs column holding the ground-truth labels
OUTPUT_CSV = "TOSICA_CV_predictions.csv"    # output filename

# --- Training / cross-validation ---
N_SPLITS = 5       # 5-Fold CV
N_EPOCHS = 8       # number of training epochs; the last epoch index used downstream is N_EPOCHS - 1
BATCH_SIZE = 16    # kept at 16 to avoid Out-Of-Memory (OOM) errors

# ==========================================
# 2. Environment Check
# ==========================================
# Report whether PyTorch can see a CUDA device, framed by two 50-character rules.
print("=" * 50, "System Check:", sep="\n")
print(
    # The device name is only queried when CUDA is actually available.
    f"✅ GPU Detected: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "⚠️ WARNING: No GPU detected. Running on CPU (will be slow)."
)
print("=" * 50)

System Check:
✅ GPU Detected: Quadro RTX 5000


In [5]:
# ==========================================
# 3. Data Loading
# ==========================================
# Read the AnnData object from FILE_PATH and report how many cells and genes it holds.
print(f"Loading data from: {FILE_PATH}...")
adata = sc.read_h5ad(FILE_PATH)
n_cells, n_genes = adata.shape
print(f"Data shape: {n_cells} cells, {n_genes} genes")

# Repeated cell identifiers could cause errors during splitting,
# so make them unique before going any further.
has_duplicate_names = not adata.obs_names.is_unique
if has_duplicate_names:
    print("Making index unique...")
    adata.obs_names_make_unique()

# ==========================================
# 4. Initialize Cross-Validation
# ==========================================
# Shuffled, stratified splitter with a fixed random_state.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=2024)

# Labels used for stratification, plus one positional index per cell.
y_labels = adata.obs[LABEL_KEY].values
X_indices = np.arange(adata.shape[0])

# One slot per cell for the final prediction, pre-filled with a sentinel string
# so that cells which never receive a prediction can be detected afterwards.
full_predictions = np.full(adata.shape[0], 'Unpredicted', dtype=object)

print(f"\nStarting {N_SPLITS}-Fold Cross-Validation...")

Loading data from: ./Lung_atlas_public.h5ad...
Data shape: 32472 cells, 15148 genes

Starting 5-Fold Cross-Validation...


In [None]:
# ==========================================
# 5. Main Loop
# ==========================================
# For every stratified fold: train a TOSICA model on the training cells, locate
# the checkpoint of the final epoch, predict the held-out cells, and write those
# predictions into `full_predictions` at the held-out positions.
for fold, (train_idx, test_idx) in enumerate(skf.split(X_indices, y_labels)):
    print(f"\n{'='*20} Processing Fold {fold + 1} / {N_SPLITS} {'='*20}")
    
    # Materialize the train / test subsets of this fold as independent copies.
    print("--> Splitting data...")
    adata_train = adata[train_idx].copy()
    adata_test = adata[test_idx].copy()
    
    # --- C. Define Project Name ---
    # Uses the zero-based fold index ('fold_0', 'fold_1', ...), while the progress
    # messages print the one-based fold number. The name is used as a relative
    # path, i.e. a folder in the current directory.
    project_name = f'fold_{fold}'
    
    # Clean up old folder if it exists to ensure a fresh start
    if os.path.exists(project_name):
        shutil.rmtree(project_name)
    
    # --- D. Training ---
    print(f"--> Training on {len(train_idx)} cells (Project: {project_name})...")
    
    TOSICA.train(
        adata=adata_train,
        gmt_path=None,          # No gene-set file supplied (no pre-defined mask)
        label_name=LABEL_KEY,   # The column name for labels
        project=project_name,   # Project folder for this fold (./fold_X/)
        epochs=N_EPOCHS,
        batch_size=BATCH_SIZE
    )
    
    # --- E. Prediction ---
    print(f"--> Predicting on {len(test_idx)} cells...")
    
    # Construct model path.
    # Assumes TOSICA writes one checkpoint per epoch named 'model-N.pth', with N
    # the zero-based epoch index, so the final one is model-(N_EPOCHS - 1).pth.
    # The existence check below guards this assumption.
    last_epoch_index = N_EPOCHS - 1
    model_filename = f"model-{last_epoch_index}.pth"
    model_path = f"./{project_name}/{model_filename}"
    
    # Verification: if the expected checkpoint is absent, list what the project
    # folder does contain and abort the whole loop. The cells of this fold and of
    # all later folds are then left untouched in `full_predictions`.
    if not os.path.exists(model_path):
        print(f"❌ CRITICAL ERROR: Model file not found at {model_path}")
        print(f"   Contents of {project_name}: {os.listdir(project_name)}")
        break

    # Run Prediction
    # Predict the held-out cells with the checkpoint trained in this fold.
    pred_adata = TOSICA.pre(
        adata_test,
        model_weight_path=model_path,
        project=project_name   # Must match training project name
    )
    
    # --- F. Store Results ---
    # Extract predictions from the returned AnnData object.
    # NOTE(review): the positional assignment assumes pred_adata keeps the row
    # order of adata_test — confirm against TOSICA.pre.
    # If the 'Prediction' column is missing, only an error is printed and the
    # loop moves on, leaving this fold's cells untouched in `full_predictions`.
    if 'Prediction' in pred_adata.obs:
        current_preds = pred_adata.obs['Prediction'].values
        full_predictions[test_idx] = current_preds
        print(f"✅ Fold {fold + 1} completed successfully.")
    else:
        print("❌ Error: 'Prediction' column missing in returned AnnData.")
    
    # --- G. Cleanup (Optional) ---
    # Remove the temporary project folder to save disk space. This step is not
    # reached when the loop exits through the `break` above, so that fold's
    # folder stays on disk.
    if os.path.exists(project_name):
        shutil.rmtree(project_name)


--> Splitting data...
--> Training on 25977 cells (Project: fold_0)...
cuda:0
Full connection!
Model builded!


[train epoch 0] loss: 2.119, acc: 0.251: 100%|██████████| 4457/4457 [05:41<00:00, 13.05it/s]
[valid epoch 0] loss: 0.970, acc: 0.743: 100%|██████████| 1910/1910 [00:45<00:00, 41.64it/s]
[train epoch 1] loss: 0.515, acc: 0.859: 100%|██████████| 4457/4457 [05:42<00:00, 13.02it/s]
[valid epoch 1] loss: 0.236, acc: 0.940: 100%|██████████| 1910/1910 [00:45<00:00, 41.96it/s]
[train epoch 2] loss: 0.224, acc: 0.945: 100%|██████████| 4457/4457 [05:42<00:00, 13.02it/s]
[valid epoch 2] loss: 0.109, acc: 0.973: 100%|██████████| 1910/1910 [00:45<00:00, 41.88it/s]
[train epoch 3] loss: 0.139, acc: 0.968: 100%|██████████| 4457/4457 [05:42<00:00, 13.01it/s]
[valid epoch 3] loss: 0.076, acc: 0.981: 100%|██████████| 1910/1910 [00:45<00:00, 41.90it/s]
[train epoch 4] loss: 0.091, acc: 0.981: 100%|██████████| 4457/4457 [05:42<00:00, 13.01it/s]
[valid epoch 4] loss: 0.063, acc: 0.985: 100%|██████████| 1910/1910 [00:45<00:00, 41.97it/s]
[train epoch 5] loss: 0.066, acc: 0.988: 100%|██████████| 4457/4457 [0

Training finished!
--> Predicting on 6495 cells...
cuda:0
0
6495
✅ Fold 1 completed successfully.

--> Splitting data...
--> Training on 25977 cells (Project: fold_1)...
cuda:0
Full connection!
Model builded!


[train epoch 0] loss: 2.319, acc: 0.195: 100%|██████████| 4458/4458 [05:41<00:00, 13.05it/s]
[valid epoch 0] loss: 1.185, acc: 0.543: 100%|██████████| 1910/1910 [00:45<00:00, 41.77it/s]
[train epoch 1] loss: 1.164, acc: 0.564:  44%|████▎     | 1944/4458 [02:29<03:13, 13.02it/s]

In [None]:
# ==========================================
# 6. Finalizing
# ==========================================
print("\n" + "=" * 50)
print("Cross-Validation Finished!")

# Count the cells that still carry the 'Unpredicted' sentinel,
# i.e. positions that no fold ever overwrote with a prediction.
missing_count = (full_predictions == 'Unpredicted').sum()
if missing_count == 0:
    print("✅ All cells predicted successfully.")
else:
    print(f"⚠️ Warning: {missing_count} cells were not predicted.")

# Save results to CSV
# We save the Ground Truth and the Prediction for comparison
# result_df = pd.DataFrame({
#     'cell_id': adata.obs_names,
#     'ground_truth': adata.obs[LABEL_KEY].values,
#     'tosica_prediction': full_predictions
# })

# result_df.to_csv(OUTPUT_CSV, index=False)
# print(f"Results saved to: {OUTPUT_CSV}")

# Optional: Add to original AnnData
# adata.obs['tosica_cv_pred'] = full_predictions

In [None]:
# Attach the cross-validated predictions to the cell metadata and export the
# whole .obs table (existing columns plus 'tosica_labels') as a CSV file.
adata.obs['tosica_labels'] = full_predictions

# DataFrame.to_csv does not create missing parent directories, so make sure
# './labels' exists before writing; otherwise the export fails only after the
# entire cross-validation run has finished. No-op when the folder already exists.
labels_csv_path = './labels/TOSICA_label_human_immune_obs.csv'
os.makedirs(os.path.dirname(labels_csv_path), exist_ok=True)
adata.obs.to_csv(labels_csv_path)