This notebook filters the LARRY dataset for use with the scvi package (for the two-page abstract).\
Only cells are filtered: a cell is kept only if its lineage (clone_id) contains at least 5 cells.

In [1]:
import anndata as ad
import numpy as np
import scipy
import pandas as pd
import scanpy as sc


In [2]:
# ----------------------------- Load the matrices -----------------------------

# Input files. The trailing comments name the matching snakemake.input keys.
normed_counts = "/Users/apple/Desktop/KB/Larry_Dataset_original/stateFate_inVitro_normed_counts.mtx.gz"  # 'normed_counts'
gene_names = "/Users/apple/Desktop/KB/Larry_Dataset_original/stateFate_inVitro_gene_names.txt.gz"  # 'gene_names'
clone_matrix = "/Users/apple/Desktop/KB/Larry_Dataset_original/stateFate_inVitro_clone_matrix.mtx.gz"  # 'clone_matrix'
metadata = "/Users/apple/Desktop/KB/Larry_Dataset_original/stateFate_inVitro_metadata.txt.gz"  # 'metadata'

# Both Matrix Market files are converted to CSR; the two text files are
# tab-separated (the gene list has no header row and is flattened to 1-D).
normed_counts_mat = scipy.io.mmread(normed_counts).tocsr()
genes = pd.read_csv(gene_names, sep='\t', header=None).to_numpy().flatten()
clone_mat = scipy.io.mmread(clone_matrix).tocsr()
meta_df = pd.read_csv(metadata, sep='\t')


# ------------- Full AnnData object: every cell and every gene -------------
adata = ad.AnnData(
    normed_counts_mat,
    obs=meta_df,
    var=pd.DataFrame(index=genes),
    dtype=np.float32,
)

# Give the annotation columns compact dtypes.
for _column, _dtype in (
    ('Library', 'category'),
    ('Time point', int),
    ('Starting population', 'category'),
    ('Cell type annotation', 'category'),
    ('Well', int),
):
    adata.obs[_column] = adata.obs[_column].astype(_dtype)

# clone_id = (row of clone_mat) . (1, 2, ..., n_clones) - 1, so a row with a
# single 1 in column j gets clone_id j, and an all-zero row gets -1.
# NOTE(review): presumably each cell belongs to at most one clone; a row with
# several non-zero entries would get the sum of their column numbers - confirm.
_clone_numbers = np.arange(1, 1 + clone_mat.shape[1])
adata.obs['clone_id'] = (clone_mat @ _clone_numbers) - 1
print("number of lineages: ", adata.obs['clone_id'].unique().size)



number of lineages:  5865




In [3]:
# Optional step (disabled): get 2000 highly variable genes from all 130887 cells.
# sc.pp.log1p(adata)
# sc.pp.highly_variable_genes(adata, n_top_genes=2000)


# ------ Create the subset AnnData with a trimmed number of cells (and, optionally, genes) ------

# A clone (lineage) is kept only if it contains at least this many cells.
MIN_CELLS_PER_CLONE = 5
# clone_id given to cells whose clone_mat row is all zeros (0 - 1 == -1).
UNASSIGNED_CLONE_ID = -1

# Build a fresh full AnnData so that the filtering below never touches `adata`.
adata_cp = ad.AnnData(normed_counts_mat, obs=meta_df, var=pd.DataFrame(index=genes), dtype=np.float32)

# Optimize dtypes.
adata_cp.obs['Library'] = adata_cp.obs['Library'].astype('category')
adata_cp.obs['Time point'] = adata_cp.obs['Time point'].astype(int)
adata_cp.obs['Starting population'] = adata_cp.obs['Starting population'].astype('category')
adata_cp.obs['Cell type annotation'] = adata_cp.obs['Cell type annotation'].astype('category')
adata_cp.obs['Well'] = adata_cp.obs['Well'].astype(int)

# Assign clone_id: the 0-based clone column of each cell, or -1 for cells
# without any clone entry.
adata_cp.obs['clone_id'] = (clone_mat @ np.arange(1, 1 + clone_mat.shape[1])) - 1

# Number of cells per clone_id.
value_counts = adata_cp.obs['clone_id'].value_counts()

# Clones to drop: the "unassigned" pseudo-clone plus every clone smaller than
# the threshold. Selecting by threshold (rather than by looking up specific
# cell counts) works for any dataset and also covers single-cell clones.
clone_for_remove = [UNASSIGNED_CLONE_ID]
clone_for_remove += value_counts.index[value_counts < MIN_CELLS_PER_CLONE].tolist()

# .copy() turns the view returned by the boolean subset into a standalone
# AnnData, so later in-place operations do not trigger an implicit copy.
adata_subset = adata_cp[~adata_cp.obs['clone_id'].isin(clone_for_remove)].copy()
print("adata_subset.obs.shape:", adata_subset.obs.shape)

# Optional step (disabled): trim the genes to the highly variable genes
# calculated from the original adata (all cells).
# hvgene = (adata.var.highly_variable[adata.var.highly_variable==True]).index
# print("number of the highly variable genes:", len(hvgene))
# adata_subset = adata_subset[:,hvgene]
print("adata_subset.X.shape:", adata_subset.X.shape)



adata_subset.obs.shape: (41201, 9)
adata_subset.X.shape: (41201, 25289)


In [4]:
# Log-transform the expression matrix of adata_subset in place: X <- log(1 + X).
sc.pp.log1p(adata_subset)

  view_to_actual(adata)


In [5]:
import numpy as np
import scipy.sparse as sp

# Rescale every stored value of adata_subset.X to an integer in [1, 1000].
# Implicit zeros of the sparse matrix are left untouched and stay zero.

# Work on a CSR representation so that data / indices / indptr have the
# layout expected when the matrix is rebuilt below.
sparse_matrix = sp.csr_matrix(adata_subset.X)

# Largest value in the matrix; it is mapped to the top of the new scale.
max_val = sparse_matrix.max()
if not max_val > 0:
    raise ValueError(
        f"cannot rescale: the maximum of adata_subset.X must be positive, got {max_val!r}"
    )


def transform(x, max_val, target_max=1000):
    """Map values from [0, max_val] exponentially onto integers in [1, target_max].

    Computes ``round(target_max ** (x / max_val))``, i.e.
    ``round(exp(log(target_max) / max_val * x))``, so 0 maps to 1 and
    ``max_val`` maps to ``target_max``. Rounding is half-to-even.

    Args:
        x: A scalar or an array-like of values to rescale.
        max_val: The value that should map to ``target_max``; must be non-zero.
        target_max: Upper end of the output scale (default 1000).

    Returns:
        A Python ``int`` for scalar input, otherwise an ``int64`` NumPy array
        with the same shape as ``x``.
    """
    # Compute in float64 regardless of the input dtype, so the vectorised
    # result matches an element-by-element evaluation.
    values = np.asarray(x, dtype=np.float64)
    rescaled = np.rint(np.exp((np.log(target_max) / max_val) * values)).astype(np.int64)
    return int(rescaled) if rescaled.ndim == 0 else rescaled


# Stored values and CSR structure of the matrix.
data = sparse_matrix.data
indices = sparse_matrix.indices
indptr = sparse_matrix.indptr

# Rescale all stored values in one vectorised call.
# NOTE(review): explicitly stored zeros, if there are any, become 1 here
# (transform(0) == 1) - confirm that the matrix has none.
transformed_data = transform(data, max_val)

# Rebuild a CSR matrix with the same sparsity pattern and the new values.
sparse_transformed_matrix = sp.csr_matrix((transformed_data, indices, indptr), shape=sparse_matrix.shape)

# Assign the transformed sparse matrix back to adata_subset.X
adata_subset.X = sparse_transformed_matrix

# Check the transformed matrix
print(adata_subset.X)


  (0, 1)	1
  (0, 5)	1
  (0, 12)	1
  (0, 23)	1
  (0, 25)	1
  (0, 27)	1
  (0, 28)	2
  (0, 29)	3
  (0, 31)	1
  (0, 44)	2
  (0, 45)	2
  (0, 50)	1
  (0, 61)	1
  (0, 63)	1
  (0, 65)	1
  (0, 181)	1
  (0, 204)	1
  (0, 309)	1
  (0, 398)	1
  (0, 433)	1
  (0, 483)	1
  (0, 501)	2
  (0, 509)	1
  (0, 516)	2
  (0, 520)	2
  :	:
  (41200, 24642)	4
  (41200, 24664)	7
  (41200, 24667)	2
  (41200, 24716)	2
  (41200, 24741)	2
  (41200, 24771)	2
  (41200, 24789)	2
  (41200, 24798)	2
  (41200, 24897)	2
  (41200, 24904)	5
  (41200, 25021)	2
  (41200, 25044)	2
  (41200, 25218)	2
  (41200, 25227)	4
  (41200, 25271)	2
  (41200, 25276)	37
  (41200, 25278)	40
  (41200, 25279)	71
  (41200, 25280)	10
  (41200, 25281)	21
  (41200, 25282)	35
  (41200, 25283)	11
  (41200, 25284)	4
  (41200, 25285)	26
  (41200, 25287)	4


In [6]:
# Display the shape of the expression matrix (rows = cells, columns = genes).
adata_subset.X.shape

(41201, 25289)

In [8]:
# Sum of X over all genes for every cell.
# Summing a SciPy sparse matrix along axis 1 gives an (n_cells, 1) np.matrix;
# flatten it to a plain 1-D array before storing it as an obs column.
row_sums = np.asarray(adata_subset.X.sum(axis=1)).ravel()
adata_subset.obs['row_sums'] = row_sums

# Summary statistics (count, mean, std, min, quartiles, max) of the per-cell sums.
row_sums_summary = adata_subset.obs['row_sums'].describe()

print(row_sums_summary)

count    41201.000000
mean      4796.319458
std        741.608455
min       3624.000000
25%       4211.000000
50%       4588.000000
75%       5344.000000
max       7915.000000
Name: row_sums, dtype: float64


In [25]:
# Sum of X over all genes for every cell, stored under 'row_sums_log'.
# NOTE(review): this is the same computation as the 'row_sums' column; whether
# X holds log1p values or rescaled counts at this point depends on when the
# cell is executed - confirm which matrix this summary is meant to describe.
# Summing a SciPy sparse matrix along axis 1 gives an (n_cells, 1) np.matrix;
# flatten it to a plain 1-D array before storing it as an obs column.
row_sums_log = np.asarray(adata_subset.X.sum(axis=1)).ravel()
adata_subset.obs['row_sums_log'] = row_sums_log
row_sums_summary_log = adata_subset.obs['row_sums_log'].describe()
print(row_sums_summary_log)

In [7]:
# Save adata_subset (X, obs and var) to an h5ad file in the current working
# directory. The file name carries the matrix shape reported earlier in the
# notebook (41201 cells x 25289 genes); it is a fixed string and is not
# derived from adata_subset.
adata_subset.write('Larry_41201_25289_scvi.h5ad')