In [1]:
import pandas as pd
import anndata as ad
import scanpy as sc
import numpy as np
import seaborn as sns
import random
import pickle as pkl

from matplotlib_venn import venn3
import matplotlib.pyplot as plt

In [2]:
# Load the full AnnData object from disk so it can be inspected in the notebook.
adata = ad.read_h5ad("../../../data/sciplex_qc_ann_nobalancing.h5ad")

In [3]:
# Show the AnnData summary: dimensions plus the available obs/var columns.
adata

AnnData object with n_obs × n_vars = 401917 × 27544
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'SMILES', 'fmfp', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'drug_celltype_dose', 'match_index'
    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'

In [3]:
def save_adata_hvg_seurat(adata_qc_path, adata_out_path, n_genes=None):
    """Write a copy of an AnnData file restricted to its highly variable genes.

    The input file is read, normalized per cell with ``sc.pp.normalize_total``,
    log1p-transformed, and then reduced to the genes flagged by
    ``sc.pp.highly_variable_genes(flavor='seurat')``. The expression values
    written to disk are therefore the normalized, log-transformed ones, not
    the values stored in the input file.

    Parameters
    ----------
    adata_qc_path : str or path-like
        Path of the ``.h5ad`` file to read.
    adata_out_path : str or path-like
        Path of the ``.h5ad`` file to write. Missing parent directories are
        created.
    n_genes : int, optional
        Forwarded to scanpy as ``n_top_genes``. When ``None``, scanpy's
        default cutoff-based selection applies instead of a fixed gene count.

    Returns
    -------
    None
        The result is only written to ``adata_out_path``.
    """
    # Local import: the notebook's import cell does not bring in pathlib.
    from pathlib import Path

    # Local name chosen so the notebook-level ``adata`` is not shadowed.
    adata_full = ad.read_h5ad(adata_qc_path)

    # The 'seurat' flavor expects log-transformed data, so normalize and
    # log1p before selecting genes.
    sc.pp.normalize_total(adata_full, inplace=True)
    sc.pp.log1p(adata_full)
    sc.pp.highly_variable_genes(adata_full, flavor='seurat', n_top_genes=n_genes)

    # .copy() materializes the subset instead of keeping a view of the full object.
    adata_hvg = adata_full[:, adata_full.var['highly_variable']].copy()

    # Make sure the target directory exists so the write does not fail after
    # all of the expensive preprocessing has already run.
    Path(adata_out_path).parent.mkdir(parents=True, exist_ok=True)
    adata_hvg.write_h5ad(adata_out_path)

## Creating datasets with the top 500, 1000, 2000, 3500, 5000, and 7500 highly variable genes (HVGs), selected with the Seurat flavor



In [4]:
# Write one HVG-subset file per feature-set size. Only the gene count and the
# output file name differ between runs, so a single loop replaces the
# copy-pasted cells.
qc_adata_path = "../../../data/sciplex_qc_ann_nobalancing.h5ad"
for n_hvg in (500, 1000, 2000, 3500, 5000, 7500):
    save_adata_hvg_seurat(
        qc_adata_path,
        f"../../../data/feature_number/sciplex_hvg_{n_hvg}.h5ad",
        n_genes=n_hvg,
    )