In [1]:
import numpy as np
import anndata as ad
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix

In [2]:
def simulation_data_generator(
    beta,
    data_path="/Users/apple/Desktop/KB/data/LarryData/Larry_41201_2000.h5ad",
    n_clusters=2000,
    min_clone_size=5,
    random_state=42,
    n_init=10,
):
    """Build a simulated-lineage dataset by K-means clustering the most variable genes.

    The AnnData file at ``data_path`` is loaded, its columns (genes) are ranked
    by variance in descending order, and the top ``beta`` fraction of them is
    clustered with K-means. Each cluster label is stored as a synthetic
    ``clone_id`` in ``adata.obs``; cells whose clone has fewer than
    ``min_clone_size`` members are dropped.

    Parameters
    ----------
    beta : float
        Fraction of columns (highest variance first) fed to K-means. Must be
        positive and large enough to select at least one column; values above
        1 select every column.
    data_path : str or path-like, optional
        Location of the ``.h5ad`` file to load. Defaults to the path that was
        previously hard-coded in this function.
    n_clusters : int, optional
        Number of K-means clusters (candidate clones). Default 2000.
    min_clone_size : int, optional
        Clones with fewer cells than this are removed. Default 5.
    random_state : int, optional
        Seed passed to ``KMeans``. Default 42.
    n_init : int or str, optional
        Passed to ``KMeans``. Set explicitly (default 10) so the result does
        not depend on the installed scikit-learn's default and no
        FutureWarning about that default is emitted.

    Returns
    -------
    anndata.AnnData
        A copy of the loaded data restricted to the retained cells, with the
        cluster labels in ``obs['clone_id']``.

    Raises
    ------
    ValueError
        If ``beta`` is not positive, or selects zero columns.
    """
    # `not 0 < beta` also rejects NaN, which would otherwise reach int().
    if not 0 < beta:
        raise ValueError(f"beta must be a positive fraction, got {beta!r}")

    adata = ad.read_h5ad(data_path)

    count_matrix = adata.X
    # Densify once and reuse it; fall back to np.asarray when X is already dense.
    if hasattr(count_matrix, "toarray"):
        dense_counts = count_matrix.toarray()
    else:
        dense_counts = np.asarray(count_matrix)

    variances = np.var(dense_counts, axis=0)
    # Column indices ordered by descending variance.
    sorted_indices = np.argsort(-variances)

    n_genes = dense_counts.shape[1]
    n_selected = min(n_genes, int(beta * n_genes))
    if n_selected < 1:
        raise ValueError(
            f"beta={beta!r} selects no columns out of {n_genes}; increase beta"
        )

    # only input first beta*num_genes gene into the K-means cluster
    matrix_for_cluster = np.ascontiguousarray(
        dense_counts[:, sorted_indices[:n_selected]]
    )

    # Sanity check: the first selected column must carry the largest variance.
    # Compared with a tolerance instead of exact float equality, because the
    # recomputed variance may be accumulated in a different order.
    leading_variance = np.var(matrix_for_cluster, axis=0)[0]
    if np.isclose(leading_variance, np.max(variances), rtol=1e-3):
        print("reorder process done")
    else:
        print("reorder process went wrong")

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=n_init)
    kmeans.fit(matrix_for_cluster)

    cluster_labels = kmeans.labels_
    # Add cluster labels to adata.obs
    adata.obs['clone_id'] = cluster_labels

    # Count the frequency of each label
    label_counts = pd.Series(cluster_labels).value_counts()
    # Keep only labels that appear at least min_clone_size times
    labels_to_keep = label_counts[label_counts >= min_clone_size].index

    # Filter out cells whose label appears fewer than min_clone_size times
    cells_to_keep = adata.obs['clone_id'].isin(labels_to_keep)
    filtered_adata = adata[cells_to_keep].copy()

    return filtered_adata

In [3]:
# One simulated dataset per beta (fraction of top-variance genes used for
# clustering); the calls run in the listed order and bind the same names.
(
    adata_beta_01,
    adata_beta_03,
    adata_beta_05,
    adata_beta_07,
    adata_beta_09,
) = (simulation_data_generator(beta) for beta in (0.1, 0.3, 0.5, 0.7, 0.9))

reorder process done


  super()._check_params_vs_input(X, default_n_init=10)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



reorder process done


  super()._check_params_vs_input(X, default_n_init=10)


reorder process done


  super()._check_params_vs_input(X, default_n_init=10)


reorder process done


  super()._check_params_vs_input(X, default_n_init=10)


reorder process done


  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# Persist every simulated dataset under its beta-tagged file name.
simulation_outputs = {
    "Larry_Simulation_01.h5ad": adata_beta_01,
    "Larry_Simulation_03.h5ad": adata_beta_03,
    "Larry_Simulation_05.h5ad": adata_beta_05,
    "Larry_Simulation_07.h5ad": adata_beta_07,
    "Larry_Simulation_09.h5ad": adata_beta_09,
}
for output_name, simulated_adata in simulation_outputs.items():
    simulated_adata.write(output_name)

In [24]:
# Per-beta summary: retained cells, number of distinct clones, and the
# largest / mean clone size. Re-run this cell to refresh the printed output.
summary_inputs = {
    0.1: adata_beta_01,
    0.3: adata_beta_03,
    0.5: adata_beta_05,
    0.7: adata_beta_07,
    0.9: adata_beta_09,
}
for summary_beta, summary_adata in summary_inputs.items():
    clone_ids = summary_adata.obs["clone_id"]
    # Cells per clone, computed once and reused for both max and mean.
    clone_sizes = clone_ids.value_counts()
    print(
        f"beta={summary_beta}, number of cells:", summary_adata.obs.shape[0],
        ", number of lineages: ", len(clone_ids.unique()),
        ", max lineage frequency", clone_sizes.max(),
        ", avg lineage frequency", round(clone_sizes.mean(), 2),
    )

beta=0.1, number of cells: 39795 , number of linegae:  1271 , max lineage frequency 497 , avg lineage frequency 31.31
beta=0.3, number of cells: 39729 , number of linegae:  1239 , max lineage frequency 435 , avg lineage frequency 32.07
beta=0.5, number of cells: 39700 , number of linegae:  1229 , max lineage frequency 390 , avg lineage frequency 32.3
beta=0.7, number of cells: 39767 , number of linegae:  1242 , max lineage frequency 416 , avg lineage frequency 32.02
beta=0.9, number of cells: 39754 , number of linegae:  1231 , max lineage frequency 479 , avg lineage frequency 32.29
