In [1]:
import numpy as np
import umap
from sklearn.preprocessing import StandardScaler

# Reduce the signal and background embeddings jointly with UMAP, then split the
# result back into the two datasets and save each one.
#
# Both inputs are 2-D arrays of shape (n_samples, n_features) and must share the
# same number of features. Row counts are read from the loaded arrays rather than
# assumed, so the cell works for whatever files are on disk.

# Load data (Modify file paths)
X_signal = np.load("embedding.npy")           # (n_signal, n_features) (scRNA-seq or bulk)
X_background = np.load("embedding_bulk.npy")  # (n_background, n_features) (Unwanted variation)

# Fail early with a readable message instead of an opaque vstack error.
if X_signal.ndim != 2 or X_background.ndim != 2:
    raise ValueError(
        f"Expected 2-D arrays, got shapes {X_signal.shape} and {X_background.shape}"
    )
if X_signal.shape[1] != X_background.shape[1]:
    raise ValueError(
        "Signal and background embeddings must have the same number of features: "
        f"{X_signal.shape[1]} != {X_background.shape[1]}"
    )

n_signal = X_signal.shape[0]
n_background = X_background.shape[0]

# Combine datasets (signal rows first, background rows last; the split below
# relies on this order).
X_combined = np.vstack([X_signal, X_background])
y_labels = np.concatenate(
    [np.ones(n_signal, dtype=int), np.zeros(n_background, dtype=int)]
)  # 1 = Signal, 0 = Background

# Standardize the data (statistics are computed over the combined matrix).
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Apply UMAP with 64 components.
# UMAP is stochastic; fixing random_state makes Restart & Run All reproduce the
# same embedding. Per the umap-learn docs this trades away some parallelism.
# NOTE(review): passing y= makes this a *supervised* fit on the signal/background
# label, which pulls the two groups apart in the embedding -- confirm that is the
# intended behaviour for downstream analysis.
SEED = 42
n_components = 64  # Desired reduced dimension
mapper = umap.UMAP(
    n_components=n_components,
    metric="euclidean",
    n_neighbors=15,
    min_dist=0.3,
    random_state=SEED,
).fit(X_combined_scaled, y=y_labels)

# Extract embeddings
X_transformed = mapper.embedding_

# Split back into separate datasets using the same row order as the vstack above.
X_signal_transformed = X_transformed[:n_signal, :]      # (n_signal, n_components)
X_background_transformed = X_transformed[n_signal:, :]  # (n_background, n_components)

# Sanity-check the split before anything is written to disk.
assert X_signal_transformed.shape == (n_signal, n_components)
assert X_background_transformed.shape == (n_background, n_components)

# Save embeddings for further analysis
np.save("reduced_embedding_sc.npy", X_signal_transformed)
np.save("reduced_embedding_bulk.npy", X_background_transformed)

# Print output shapes
print("Signal Embedding Shape:", X_signal_transformed.shape)  # (n_signal, n_components)
print("Background Embedding Shape:", X_background_transformed.shape)  # (n_background, n_components)




Signal Embedding Shape: (18614, 64)
Background Embedding Shape: (104, 64)
