## Data Preparation: Filter peaks by variance or by cell counts for the Sox10KD and blood2K datasets.

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import sys
sys.path.append('../Scarp/')
from data_preprocessing import Mat_normalization
import warnings
warnings.filterwarnings("ignore")

### 1. Filter according to counts

In [2]:
# Load the benchmark blood2K dataset once; each filtered variant below is
# derived from a fresh copy of it.
adata_blood2k = sc.read_h5ad('../Exp1_Benchmark/Processed data/blood2K.h5ad')

# One output file per `min_cells` threshold (30, 40, 50, 60, 70).
for filter_cells in np.arange(30, 71, 10):
    # The filter result is not captured, so it is applied to the copy itself;
    # the loaded object stays untouched for the next threshold.
    adata1_blood2k = adata_blood2k.copy()
    sc.pp.filter_genes(adata1_blood2k, min_cells=filter_cells)
    adata1_blood2k.write(f'./Processed data/blood2K_filter{filter_cells}.h5ad')

In [3]:
# Load the benchmark Sox10KD dataset once; each filtered variant below is
# derived from a fresh copy of it.
adata_Sox10KD = sc.read_h5ad('../Exp1_Benchmark/Processed data/Sox10KD.h5ad')

# One output file per `min_cells` threshold (10, 20, 30, 40, 50).
for filter_cells in np.arange(10, 51, 10):
    # The filter result is not captured, so it is applied to the copy itself;
    # the loaded object stays untouched for the next threshold.
    adata1_Sox10KD = adata_Sox10KD.copy()
    sc.pp.filter_genes(adata1_Sox10KD, min_cells=filter_cells)
    adata1_Sox10KD.write(f'./Processed data/Sox10KD_filter{filter_cells}.h5ad')

### 2. Filter according to variance

In [4]:
# Handles on the blood2K data used by the variance filter.
sparse_matrix_blood2k = adata_blood2k.X  # cells x peaks; expected to be sparse

# Cell identifiers and their cell-type annotation (stored as a categorical).
Cells_blood2k = adata_blood2k.obs_names
labels_blood2k = adata_blood2k.obs.loc[:, 'celltype'].astype('category')

In [5]:
# Per-peak standard deviation across cells (axis 0 runs over cells).
# NOTE: the matrix is densified for this computation, so peak memory is
# n_cells x n_peaks values.
peak_std = np.std(sparse_matrix_blood2k.toarray(), axis=0)

# For each proportion, drop the lowest-variance peaks and save the remainder.
for remove_prop in np.around(np.arange(0.5, 0.91, 0.1), 2):
    # Keep peaks whose std is at or above the `remove_prop` quantile. Because
    # the comparison is `>=`, ties at the threshold are all kept.
    keep_peak_index = np.flatnonzero(peak_std >= np.quantile(peak_std, remove_prop))

    # Build the AnnData directly from the column-sliced sparse matrix. Index-only
    # obs/var frames carry the cell and peak names, so no dense placeholder
    # table is allocated, and the slice already yields a new matrix.
    filter_data = sc.AnnData(
        X=sparse_matrix_blood2k[:, keep_peak_index],
        obs=pd.DataFrame(index=Cells_blood2k),
        var=pd.DataFrame(index=adata_blood2k.var.index[keep_peak_index]),
    )
    filter_data.var_names_make_unique()
    filter_data.obs['celltype'] = labels_blood2k

    # NOTE(review): this prefix is 'blood2k' (lower-case k) while the
    # count-filtered files use 'blood2K'; left unchanged because downstream
    # readers may depend on the existing file names.
    filter_data.write('./Processed data/blood2k_filter_var' + str(remove_prop) + '.h5ad')

In [6]:
# Handles on the Sox10KD data used by the variance filter.
sparse_matrix_Sox10KD = adata_Sox10KD.X  # cells x peaks; expected to be sparse

# Cell identifiers and their cell-type annotation (stored as a categorical).
Cells_Sox10KD = adata_Sox10KD.obs_names
labels_Sox10KD = adata_Sox10KD.obs.loc[:, 'celltype'].astype('category')

In [7]:
# Per-peak standard deviation across cells (axis 0 runs over cells).
# NOTE: the matrix is densified for this computation, so peak memory is
# n_cells x n_peaks values.
peak_std = np.std(sparse_matrix_Sox10KD.toarray(), axis=0)

# For each proportion, drop the lowest-variance peaks and save the remainder.
for remove_prop in np.around(np.arange(0.5, 0.91, 0.1), 2):
    # Keep peaks whose std is at or above the `remove_prop` quantile. Because
    # the comparison is `>=`, ties at the threshold are all kept.
    keep_peak_index = np.flatnonzero(peak_std >= np.quantile(peak_std, remove_prop))

    # Build the AnnData directly from the column-sliced sparse matrix. Index-only
    # obs/var frames carry the cell and peak names, so no dense placeholder
    # table is allocated, and the slice already yields a new matrix.
    filter_data = sc.AnnData(
        X=sparse_matrix_Sox10KD[:, keep_peak_index],
        obs=pd.DataFrame(index=Cells_Sox10KD),
        var=pd.DataFrame(index=adata_Sox10KD.var.index[keep_peak_index]),
    )
    filter_data.var_names_make_unique()
    filter_data.obs['celltype'] = labels_Sox10KD

    filter_data.write('./Processed data/Sox10KD_filter_var' + str(remove_prop) + '.h5ad')