In [1]:
import os
from copy import deepcopy

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import scipy.sparse as sp
import seaborn as sns
from scipy.stats import binom
from sklearn.cluster import KMeans

In [2]:
# Experiment tag; it is interpolated into the names of the .h5ad files this notebook writes.
Exp_id = "exp_8"

In [3]:
# Raise scanpy's logging verbosity so preprocessing steps report what they did
# (these messages appear in the outputs of later cells).
sc.settings.verbosity = 3 

In [4]:
# Location of the raw LARRY dataset (with clone ids). The default is the original
# machine-specific path; set the LARRY_H5AD_PATH environment variable to run the
# notebook on another machine without editing this cell.
LARRY_H5AD_PATH = os.environ.get(
    "LARRY_H5AD_PATH",
    "/Users/apple/Desktop/KB/data/LarryData/larry_raw_w_clone_id.h5ad",
)
adata_raw = ad.read_h5ad(LARRY_H5AD_PATH)

In [5]:
# Number of distinct clone ids (a missing value, if any, counts as one id).
adata_raw.obs["clone_id"].nunique(dropna=False)

5864

In [6]:
# Number of cells per clone, largest clones first.
adata_raw.obs['clone_id'].value_counts()

clone_id
1261    177
2370    165
5714    142
292     134
5209    130
       ... 
5594      2
866       2
4884      2
5101      2
2251      2
Name: count, Length: 5864, dtype: int64

### Filter genes and cells

In [7]:
# Dimensions of the raw data before any filtering.
adata_raw.shape

(49302, 23420)

In [8]:
# Quality filter, applied in place on adata_raw. Order matters: cells expressing
# fewer than 200 genes are dropped first, then genes detected in fewer than 3 of
# the remaining cells are dropped.
sc.pp.filter_cells(adata_raw, min_genes=200)
sc.pp.filter_genes(adata_raw, min_cells=3)

filtered out 124 cells that have less than 200 genes expressed
filtered out 8773 genes that are detected in less than 3 cells


In [9]:
# Dimensions after the cell and gene filters.
adata_raw.shape

(49178, 14647)

In [10]:
# Group clone ids by clone size: {number of cells in the clone: [clone_id, ...]}.
value_counts = adata_raw.obs['clone_id'].value_counts()
frequency_dict = {}
for value, count in value_counts.items():
    # setdefault creates the bucket the first time a given size is seen.
    frequency_dict.setdefault(count, []).append(value)



In [11]:
# Keep only clones with at least MIN_CLONE_SIZE cells.
# frequency_dict maps clone size -> list of clone ids. A size that no clone has
# is simply absent from the dict, so use .get() rather than indexing, which
# would raise KeyError for a missing size.
MIN_CLONE_SIZE = 5
clone_for_remove = [
    clone
    for size in range(1, MIN_CLONE_SIZE)
    for clone in frequency_dict.get(size, [])
]
# Boolean mask over cells; the result is a subset of adata_raw without the small clones.
adata_subset = adata_raw[~adata_raw.obs['clone_id'].isin(clone_for_remove)]

In [12]:
# Dimensions after removing cells that belong to clones with fewer than 5 cells.
adata_subset.shape

(41093, 14647)

In [13]:
# Clone sizes after the small-clone filter (sanity check on the smallest remaining clone).
adata_subset.obs['clone_id'].value_counts()

clone_id
1261    177
2370    165
5714    141
292     134
5209    129
       ... 
513       5
5629      5
1014      5
3998      5
4329      5
Name: count, Length: 2813, dtype: int64

#### Downsample the count matrix

In [14]:
def downsample_adata(adata, beta, random_state=None):
    """Binomially thin the count matrix of an AnnData-like object.

    Every entry ``c`` of ``adata.X`` is replaced by one draw from
    ``Binomial(int(c), beta)``, i.e. each count is kept independently with
    probability ``beta``. The input object is left untouched; the thinned
    matrix is stored on a copy obtained through ``adata.copy()``.

    Parameters
    ----------
    adata : object exposing ``.X`` and ``.copy()``
        ``.X`` may be a dense array or any scipy sparse matrix format.
        Values are truncated to integers before sampling.
    beta : float
        Per-count retention probability, in ``[0, 1]``.
    random_state : None, int, or numpy random generator, optional
        Forwarded to ``scipy.stats.binom.rvs``. ``None`` (the default) keeps
        the previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    A copy of ``adata`` whose ``.X`` holds the thinned integer counts
    (CSR when the input was sparse, a dense array otherwise).

    Raises
    ------
    ValueError
        If ``beta`` lies outside ``[0, 1]``.
    """
    beta_values = np.asarray(beta)
    if not np.all((beta_values >= 0) & (beta_values <= 1)):
        raise ValueError(f"beta must lie in [0, 1], got {beta!r}")

    # Work on a copy so the caller's object is never modified.
    new_adata = adata.copy()
    matrix = new_adata.X

    if sp.issparse(matrix):
        # Normalise to CSR before reusing indices/indptr: for a CSC (or other
        # format) input those arrays do not describe a CSR layout.
        csr = sp.csr_matrix(matrix)
        counts = csr.data.astype(int)  # binomial needs integer trial counts
        thinned = binom.rvs(counts, beta, random_state=random_state)
        # Copy the structure arrays so eliminate_zeros() below cannot mutate
        # arrays that are shared with another matrix.
        downsampled_matrix = sp.csr_matrix(
            (thinned, csr.indices.copy(), csr.indptr.copy()), shape=csr.shape
        )
        # Thinning turns many stored counts into 0; drop them so nnz stays
        # consistent with the number of actually non-zero entries.
        downsampled_matrix.eliminate_zeros()
    else:
        counts = np.asarray(matrix).astype(int)  # binomial needs integer trial counts
        downsampled_matrix = binom.rvs(counts, beta, random_state=random_state)

    # Store the thinned counts on the copy.
    new_adata.X = downsampled_matrix

    return new_adata



In [15]:
# Binomial downsampling is stochastic. Seed NumPy's global random state first so
# that the simulated datasets (and the files written from them) are reproducible
# on a fresh Restart & Run All.
np.random.seed(0)

# One simulated dataset per retention probability (0.1 ... 0.9).
adata_ds_sim01 = downsample_adata(adata_subset,0.1)
adata_ds_sim03 = downsample_adata(adata_subset,0.3)
adata_ds_sim05 = downsample_adata(adata_subset,0.5)
adata_ds_sim07 = downsample_adata(adata_subset,0.7)
adata_ds_sim09 = downsample_adata(adata_subset,0.9)


#### Simulation for scCL

In [16]:
def scCL_sim(adata, n_top_genes=2000):
    """Prepare a downsampled dataset as scCL input.

    Works on a copy of ``adata``: normalises each cell to a total of 1e4,
    applies ``log1p``, flags the ``n_top_genes`` most highly variable genes
    and returns the data restricted to those genes.

    Parameters
    ----------
    adata : AnnData
        Count data; it is not modified.
    n_top_genes : int, optional
        Number of highly variable genes to keep (default 2000, the value
        that was previously hard-coded).

    Returns
    -------
    AnnData
        A standalone copy (not a view) containing only the highly variable
        genes, with normalised and log-transformed values in ``.X``.
    """
    adata_scCL = adata.copy()
    sc.pp.normalize_total(adata_scCL, target_sum=1e4)
    sc.pp.log1p(adata_scCL)
    sc.pp.highly_variable_genes(adata_scCL, n_top_genes=n_top_genes)
    print("adata_scCL.shape: ", adata_scCL.shape)

    # 'highly_variable' is already a boolean column; use it directly as a mask.
    hv_mask = adata_scCL.var["highly_variable"].to_numpy(dtype=bool)
    hvgene = adata_scCL.var.index[hv_mask]
    print("number of the highly variable genes:", len(hvgene))

    # Materialise the subset: a bare slice would be a view that keeps the full
    # normalised matrix of this function-local object alive.
    adata_hvgene = adata_scCL[:, hvgene].copy()
    print("adata_hvgene: ", adata_hvgene.shape)

    return adata_hvgene

In [17]:
# Build the scCL input for every downsampled dataset, in increasing order of
# retention probability (same call order as one call per line).
(
    adata_scCl_sim01,
    adata_scCl_sim03,
    adata_scCl_sim05,
    adata_scCl_sim07,
    adata_scCl_sim09,
) = map(
    scCL_sim,
    (adata_ds_sim01, adata_ds_sim03, adata_ds_sim05, adata_ds_sim07, adata_ds_sim09),
)

normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


  disp_grouped = df.groupby('mean_bin')['dispersions']


adata_scCL.shape:  (41093, 14647)
number of the highly variable genes: 2000
adata_hvgene:  (41093, 2000)
normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes


  disp_grouped = df.groupby('mean_bin')['dispersions']


    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
adata_scCL.shape:  (41093, 14647)
number of the highly variable genes: 2000
adata_hvgene:  (41093, 2000)
normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


  disp_grouped = df.groupby('mean_bin')['dispersions']


adata_scCL.shape:  (41093, 14647)
number of the highly variable genes: 2000
adata_hvgene:  (41093, 2000)
normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


  disp_grouped = df.groupby('mean_bin')['dispersions']


adata_scCL.shape:  (41093, 14647)
number of the highly variable genes: 2000
adata_hvgene:  (41093, 2000)
normalizing counts per cell
    finished (0:00:00)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes


  disp_grouped = df.groupby('mean_bin')['dispersions']


    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
adata_scCL.shape:  (41093, 14647)
number of the highly variable genes: 2000
adata_hvgene:  (41093, 2000)


In [18]:
# Persist the scCL inputs; file names carry the downsampling rate and the experiment tag.
scCL_outputs = {
    f"adata_scCL_sim01_{Exp_id}.h5ad": adata_scCl_sim01,
    f"adata_scCL_sim03_{Exp_id}.h5ad": adata_scCl_sim03,
    f"adata_scCL_sim05_{Exp_id}.h5ad": adata_scCl_sim05,
    f"adata_scCL_sim07_{Exp_id}.h5ad": adata_scCl_sim07,
    f"adata_scCL_sim09_{Exp_id}.h5ad": adata_scCl_sim09,
}
for out_path, adata_out in scCL_outputs.items():
    adata_out.write(out_path)

#### Simulation for scvi

In [19]:
def scvi_sim(adata_ds, adata_scCL):
    """Build the scvi input matching an scCL dataset.

    Takes the values of ``adata_ds.X`` for exactly the cells and genes present
    in ``adata_scCL`` and wraps them in a new AnnData that reuses the ``obs``
    and ``var`` tables of ``adata_scCL``.

    Parameters
    ----------
    adata_ds : AnnData
        Source of the matrix values (the callers in this notebook pass the
        downsampled datasets).
    adata_scCL : AnnData
        Defines which cells and genes to keep and supplies ``obs`` and ``var``.

    Returns
    -------
    AnnData
        New object with ``X`` from ``adata_ds`` and annotations from ``adata_scCL``.
    """
    print("adata_ds.shape: ", adata_ds.shape)
    print("adata_scCL.shape: ", adata_scCL.shape)

    # Slice by the cell index and gene names of the scCL dataset, then take the matrix.
    selected_counts = adata_ds[adata_scCL.obs.index, adata_scCL.var_names].X

    return ad.AnnData(X=selected_counts, obs=adata_scCL.obs, var=adata_scCL.var)

In [20]:
# Pair every downsampled dataset with its scCL counterpart and build the scvi
# input, in increasing order of retention probability.
(
    adata_scvi_sim01,
    adata_scvi_sim03,
    adata_scvi_sim05,
    adata_scvi_sim07,
    adata_scvi_sim09,
) = map(
    scvi_sim,
    (adata_ds_sim01, adata_ds_sim03, adata_ds_sim05, adata_ds_sim07, adata_ds_sim09),
    (adata_scCl_sim01, adata_scCl_sim03, adata_scCl_sim05, adata_scCl_sim07, adata_scCl_sim09),
)

adata_ds.shape:  (41093, 14647)
adata_scCL.shape:  (41093, 2000)
adata_ds.shape:  (41093, 14647)
adata_scCL.shape:  (41093, 2000)
adata_ds.shape:  (41093, 14647)
adata_scCL.shape:  (41093, 2000)
adata_ds.shape:  (41093, 14647)
adata_scCL.shape:  (41093, 2000)
adata_ds.shape:  (41093, 14647)
adata_scCL.shape:  (41093, 2000)


In [21]:
# Persist the scvi inputs; file names carry the downsampling rate and the experiment tag.
scvi_outputs = {
    f"adata_scvi_sim01_{Exp_id}.h5ad": adata_scvi_sim01,
    f"adata_scvi_sim03_{Exp_id}.h5ad": adata_scvi_sim03,
    f"adata_scvi_sim05_{Exp_id}.h5ad": adata_scvi_sim05,
    f"adata_scvi_sim07_{Exp_id}.h5ad": adata_scvi_sim07,
    f"adata_scvi_sim09_{Exp_id}.h5ad": adata_scvi_sim09,
}
for out_path, adata_out in scvi_outputs.items():
    adata_out.write(out_path)