In [1]:
import os

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
import umap
from sklearn.metrics import calinski_harabasz_score


In [3]:
# Location of the simulation files.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make
# it configurable) before running the notebook on another machine.
exp_id = "final_final"
sim_input_dir = f"/Users/apple/Desktop/KB/data/LarryData/Larry_simulation/Larry_simulation_4scCL/{exp_id}/"

# One .h5ad file per simulation setting; the order here is the order of
# every downstream result list.
sim_files = [
    f'Larry_Simulation_01_{exp_id}.h5ad',
    f'Larry_Simulation_03_{exp_id}.h5ad',
    f'Larry_Simulation_05_{exp_id}.h5ad',
    f'Larry_Simulation_07_{exp_id}.h5ad',
    f'Larry_Simulation_09_{exp_id}.h5ad'
]

# Load every simulation into an AnnData object.
# os.path.join is used instead of f'{dir}/{file}' because sim_input_dir already
# ends with '/', so plain concatenation produced a doubled separator; join
# yields a clean path whether or not the directory has a trailing slash.
sim_data = [ad.read_h5ad(os.path.join(sim_input_dir, file)) for file in sim_files]

In [4]:
# Supervised UMAP embedding and Calinski-Harabasz score for every dataset in
# sim_data. Both result lists follow the order of sim_data.

# UMAP settings, named so they can be tuned in one place.
UMAP_N_NEIGHBORS = 15   # same value as UMAP's default neighborhood size
UMAP_N_COMPONENTS = 10  # dimensionality of each embedding
# UMAP is stochastic; a fixed seed makes the embeddings (and therefore the
# scores) repeatable across reruns. Seeded results will not exactly match
# numbers recorded from earlier unseeded runs, and UMAP runs single-threaded
# when random_state is set, so this is slower than an unseeded run.
UMAP_SEED = 42

embeddings_list = []
calinski_scores_list = []

for adata in sim_data:
    # Data matrix and the 'clone_id' column of adata.obs used as labels
    data = adata.X
    labels = adata.obs['clone_id'].values

    # A fresh, seeded reducer for each dataset
    reducer = umap.UMAP(
        n_neighbors=UMAP_N_NEIGHBORS,
        n_components=UMAP_N_COMPONENTS,
        random_state=UMAP_SEED,
    )

    # Passing y makes the embedding supervised by the clone labels
    embedding = reducer.fit_transform(data, y=labels)
    embeddings_list.append(embedding)

    # NOTE(review): the same labels both supervise the embedding and are used
    # for scoring, so the score is likely optimistic - confirm this is intended.
    score = calinski_harabasz_score(embedding, labels)
    calinski_scores_list.append(score)

# embeddings_list now holds one embedding per dataset and calinski_scores_list
# the matching Calinski-Harabasz scores.


In [5]:
# Show one Calinski-Harabasz score per line, in dataset order
for ch_score in calinski_scores_list:
    print(ch_score)

1323.1478213266537
1758.5884080309918
2291.238617503212
3157.5489004410033
4925.97944441121


In [6]:
# Sanity check: print the shape of each stored embedding, in dataset order
for emb in embeddings_list:
    print(emb.shape)

(41093, 10)
(41093, 10)
(41093, 10)
(41093, 10)
(41093, 10)
