# Testing Trained Models on New Single Cell Dataset

This notebook demonstrates how to load a new single-cell RNA dataset and run it through the trained RNA-to-ADT transformer mapping models to obtain predicted ADT embeddings.

## Overview
1. Load new single cell RNA data
2. Load pre-trained models
3. Preprocess new data
4. Extract embeddings and make predictions
5. Evaluate performance (if ground truth available)
6. Visualize results


## 1. Setup and Imports


In [None]:
import sys, os, importlib

# --- Autoreload ---
%load_ext autoreload
%autoreload 2

# --- Paths ---
# Work out the project root so that the `scripts` directory can be imported
# whether the kernel was started in the project root or in its `Notebooks/`
# sub-folder.
current_dir = os.getcwd()

# Compare only the final path component. A substring test on the whole path
# would also match unrelated ancestors (e.g. /home/me/Notebooks_old/project)
# and then select the wrong parent directory.
if os.path.basename(current_dir) == 'Notebooks':
    parent_dir = os.path.dirname(current_dir)
else:
    parent_dir = current_dir
scripts_path = os.path.join(parent_dir, 'scripts')

# Append (rather than prepend) so already-importable packages keep priority;
# skip entries that are already present so re-running the cell is a no-op.
for path_entry in (parent_dir, scripts_path):
    if path_entry not in sys.path:
        sys.path.append(path_entry)

print("Added to Python path:")
print(f"- Parent directory: {parent_dir}")
print(f"- Scripts directory: {scripts_path}")


Added to Python path:
- Parent directory: /projects/vanaja_lab/satya/DeepOMAPNet
- Scripts directory: /projects/vanaja_lab/satya/DeepOMAPNet/scripts


In [None]:
# Import required libraries
import torch
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import anndata as ad
from datetime import datetime
import json

# Import custom modules
import scripts.Embeddings_extract as Embeddings_extract
import scripts.GATmodel as GATmodel
import scripts.TransformerMap as TransformerMap
import scripts.Predictions

print("All imports successful!")


  from .autonotebook import tqdm as notebook_tqdm


All imports successful!


## 2. Load New Single Cell Data


In [None]:
# Load the new single-cell RNA dataset.
# Edit `new_rna_path` to point at a different .h5ad file; it is the only place
# the input location is defined.
new_rna_path = "/projects/vanaja_lab/satya/DeepOMAPNet/scripts/GSE116256_combined.h5ad"

# Read from the variable (not a second copy of the literal) so that changing
# the path above actually changes which file is loaded.
GSE116256 = sc.read_h5ad(new_rna_path)

# Quick sanity check of what was loaded (cells x genes).
print(f"Data shape: {GSE116256.shape}")
print(f"Number of observations: {GSE116256.n_obs}")
print(f"Number of variables: {GSE116256.n_vars}")


(AnnData object with n_obs × n_vars = 5997 × 27899
     obs: 'Cell', 'NumberOfReads', 'AlignedToGenome', 'AlignedToTranscriptome', 'TranscriptomeUMIs', 'NumberOfGenes', 'CyclingScore', 'CyclingBinary', 'MutTranscripts', 'WtTranscripts', 'PredictionRF2', 'PredictionRefined', 'CellType', 'Score_HSC', 'Score_Prog', 'Score_GMP', 'Score_ProMono', 'Score_Mono', 'Score_cDC', 'Score_pDC', 'Score_earlyEry', 'Score_lateEry', 'Score_ProB', 'Score_B', 'Score_Plasma', 'Score_T', 'Score_CTL', 'Score_NK', 'sample_id', 'gsm_id', 'cell_barcode', 'cell_id', 'n_genes', 'total_counts'
     var: 'gene_id', 'feature_type'
     uns: 'dataset', 'description',)

In [None]:
# Use the package-qualified module path, matching the `import scripts.Predictions`
# in the imports cell. Importing the bare name `Predictions` would load the same
# file a second time as a separate module object.
from scripts.Predictions import ADTPredictor

# Directory holding the trained per-modality model checkpoints.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
individual_models_dir = "/projects/vanaja_lab/satya/DeepOMAPNet/Notebooks/trained_models/individual_models_20250922_115253"

# Create predictor
predictor = ADTPredictor(individual_models_dir=individual_models_dir)

# Make predictions
print("Making predictions...")
rna_embeddings_np, predicted_adt_embeddings_np = predictor.predict_adt_embeddings(GSE116256)

# Validate before touching the AnnData object: every stored array must have one
# row per cell, and the ADT predictions must be 2-D because they are split into
# per-column `.obs` entries below. Checking up front avoids leaving `.obs`
# partially modified when a later `.obsm` assignment would be rejected.
# (Assumes both results expose a NumPy-style `.shape` -- TODO confirm.)
n_cells = GSE116256.n_obs
if rna_embeddings_np.shape[0] != n_cells:
    raise ValueError(
        f"RNA embeddings have {rna_embeddings_np.shape[0]} rows but the dataset has {n_cells} cells"
    )
if len(predicted_adt_embeddings_np.shape) != 2 or predicted_adt_embeddings_np.shape[0] != n_cells:
    raise ValueError(
        f"Predicted ADT embeddings have shape {predicted_adt_embeddings_np.shape}; "
        f"expected ({n_cells}, n_features)"
    )

# Add predictions to obs: one column per predicted ADT embedding dimension.
adt_marker_names = [f'predicted_adt_{i}' for i in range(predicted_adt_embeddings_np.shape[1])]
for marker_name, marker_values in zip(adt_marker_names, predicted_adt_embeddings_np.T):
    GSE116256.obs[marker_name] = marker_values

# Add embeddings to obsm
GSE116256.obsm['X_rna_embeddings'] = rna_embeddings_np
GSE116256.obsm['X_predicted_adt_embeddings'] = predicted_adt_embeddings_np

# Add metadata describing this prediction run
GSE116256.uns['prediction_info'] = {
    'adt_embedding_dim': predicted_adt_embeddings_np.shape[1],
    'adt_marker_names': adt_marker_names,
    'prediction_timestamp': datetime.now().isoformat()
}

print("✅ Predictions completed successfully!")
print(f"Data shape: {GSE116256.shape}")
print(f"Number of predicted ADT features: {len(adt_marker_names)}")
print(f"First 5 ADT features: {adt_marker_names[:5]}")

# Check the predictions
print("\nSample predictions (first 5 cells, first 5 ADT features):")
sample_predictions = GSE116256.obs[adt_marker_names[:5]].head()
print(sample_predictions.round(4))