In [None]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
from scipy.stats import median_abs_deviation
import seaborn as sns
import celltypist
from celltypist import models
import scrublet as scr
from scipy.sparse import csr_matrix
import os

def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose `metric` lies far from the median.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that contains the column `metric`.
    metric
        Name of the ``adata.obs`` column to test.
    nmads
        Number of median absolute deviations (MADs) tolerated on either side
        of the median.

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``; True where the value is
    strictly below ``median - nmads * MAD`` or strictly above
    ``median + nmads * MAD``.
    """
    values = adata.obs[metric]
    centre = np.median(values)
    tolerance = nmads * median_abs_deviation(values)
    too_low = values < centre - tolerance
    too_high = centre + tolerance < values
    return too_low | too_high

# Identifiers of all human samples.
# The preprocessing loops test `sample_id in human_datasets`: a match is handled as human
# ("MT-" mitochondrial prefix, `hen2sy` Ensembl lookup); every other sample is handled as mouse.
# Most entries are GEO sample accessions (GSM...); a few are series accessions (GSE...),
# presumably for datasets whose file names start with the series ID - TODO confirm.
human_datasets = ["GSM4837523","GSM4837524","GSM4837525","GSM4837526","GSM4837527","GSM4837528",
                  
                  "GSM8029950","GSM8029951","GSM8029952","GSM8029953","GSM8029954","GSM8029955",
                  "GSM8029956","GSM8029957","GSM8029958","GSM8029959","GSM8029960","GSM8029961",
                  "GSM8029962","GSM8029963","GSM8029964","GSM8029965","GSM8029966","GSM8029967",
                  "GSM8029968","GSM8029969","GSM8029970","GSM8029971","GSM8029972","GSM8029973",
                  
                  "GSM8121894","GSM8121895","GSM8121896","GSM8121897","GSM8121898","GSM8121899",
                  "GSM8121900","GSM8121901","GSM8121902","GSM8121903","GSM8121904","GSM8121905",
                  "GSM8121906","GSM8121907","GSM8121908","GSM5392187","GSM5392188","GSM5392189",
                  
                  "GSM5410851","GSM5410853","GSM5410854","GSM5410855","GSM5410856","GSM6976304",
                  "GSM6976305","GSM6976306","GSM6976307","GSM6976308","GSM6976309","GSM6976310",
                  "GSM6976311","GSM6976312","GSM6976313",
                  
                  "GSM6422822","GSM6422823","GSM6422824","GSM6422825","GSM6422826","GSM6422827",
                  
                  "GSM3449613","GSM3449614","GSM3449619","GSM3449620",
                  
                  "GSM4307515","GSM4307516","GSM4307517","GSM4307518","GSM4307519","GSM4307530",
                  "GSM4307531","GSM4307532","GSM4307533","GSM4307534","GSM4307535","GSM4307536",
                  "GSM4307537","GSM4307538","GSM4307539","GSM4307540","GSM4307541","GSM4307542",
                  "GSM4307543","GSM4307544","GSM4307545","GSM4307551","GSM4307552",
                  
                  'GSM4705589','GSM4705590','GSM4705591',
                  
                  "GSM5577199","GSM5577200","GSE131778","GSE145154","GSE155512","GSE196943",
                  
                  "GSM3819856","GSM3819857","GSM3819858","GSM3819859","GSM3819860","GSM3819861",
                  "GSM3819862","GSM3819863",

                  "GSM5905363","GSM5905364","GSM5905365","GSM5905366","GSM5905367","GSM5905368",
                  "GSM5905370","GSM5905371","GSM5905372","GSM5905373","GSM5905375","GSM5905377"
]

# Set up the Python -> R bridge (rpy2) used for doublet detection with scDblFinder.
import rpy2.rinterface_lib.callbacks as rcb  # NOTE(review): `rcb` is not used in the visible cells
import rpy2.robjects as ro
import anndata2ri
# Activate the global converters so values crossing the Python/R boundary are converted automatically.
ro.pandas2ri.activate()
anndata2ri.activate()
# Load scDblFinder inside the embedded R session; the R package must already be installed.
ro.r('library(scDblFinder)')

# Root folder that holds every dataset directory ("GSEXXXXXX", "E-MTAB-XXXX", etc.).
# **The path must end with "/"** because file paths are built by plain string concatenation.
main_path = "path/to/all/datasets/"

# Ensembl gene ID -> gene symbol lookup tables, one for human and one for mouse.
# Symbols may be missing (NaN) in the annotation files; callers check for that.
hen2sy = pd.read_table(main_path + "human_annotation.txt")
hen2sy = {
    ensembl_id: symbol
    for ensembl_id, symbol in zip(hen2sy["Gene stable ID"], hen2sy["Gene name"])
}
men2sy = pd.read_table(main_path + "mouse_annotation.txt")
men2sy = {
    ensembl_id: symbol
    for ensembl_id, symbol in zip(men2sy["Gene stable ID"], men2sy["Gene name"])
}

# Datasets grouped by the file format of their raw expression matrices.
# Each ID names a folder "<ID>_RAW/" under `main_path`. Every list is consumed by
# its own reader loop, so an ID must appear only once (a repeated ID would simply
# be processed twice and overwrite its own output).
raw_10x_mtx = ["GSE214611","GSE159677","GSE205930","GSE150140","GSE169332","GSE197853"]  # 6 datasets
raw_csv = ["GSE185265","GSE227088"]  # 2 datasets
raw_txt = ["GSE260656","GSE260657","GSE235275","GSE179159","GSE155512","GSE155513"]  # 6 datasets
# NOTE(review): "E-MTAB-9817" used to be listed twice here; the repeat was dropped.
# If the second occurrence was meant to be a different accession, add that accession instead.
raw_tiled_10x_mtx = ["GSE178469","GSE201947","GSE216211","GSE253902","GSE253903","GSE246779","GSE210159","GSE106472","E-MTAB-9816","E-MTAB-9817",
                     "GSE119355","GSE128509","GSE135310","GSE145154","GSE153480","GSE163465","GSE157244","GSE184073","GSE130699",
                     "GSE193426"]  # 20 datasets
raw_h5 = ["GSE206787","GSE236609","GSE132144","GSE163129"]  # 4 datasets
raw_h5ad = ["GSE146285","GSE121893","GSE210152","E-MTAB-7376","GSE196943","GSE207275","GSE135296","GSE131776","GSE131778","E-MTAB-9583","GSE197441"
            ]  # 11 datasets


  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
cffi mode is CFFI_MODE.ANY
R home found: C:\Program Files\R\R-4.3.3
Default options to initialize R: rpy2, --quiet, --no-save
R[write to console]: 载入需要的程辑包：SingleCellExperiment

R[write to console]: 载入需要的程辑包：SummarizedExperiment

R[write to console]: 载入需要的程辑包：MatrixGenerics

R[write to console]: 载入需要的程辑包：matrixStats

R[write to console]: 
载入程辑包：'MatrixGenerics'


R[write to console]: The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, co

In [None]:
# Preprocess every sample of every dataset: harmonise gene names, drop low-quality
# cells and predicted doublets, then write one .h5ad file per sample.
# The per-sample steps are identical for every input format, so they live in the
# helpers below; each reader loop only differs in how the raw matrix is loaded.

# Mouse mitochondrial gene names without their "mt-" prefix.
MOUSE_MT_SUFFIXES = ["Tf","Rnr1","Tv","Rnr2","Tl1","Nd1","Ti","Tq","Tm","Nd2","Tw","Ta","Tn",
                     "Tc","Ty","Co1","Ts1","Td","Co2","Tk","Atp8","Atp6","Co3","Tg","Nd3","Tr",
                     "Nd4l","Nd4","Th","Ts2","Tl2","Nd5","Nd6","Te","Cytb","Tt","Tp"]
MOUSE_MT_UPPER = [name.upper() for name in MOUSE_MT_SUFFIXES]
# Human mitochondrial protein-coding and rRNA gene names without their "MT-" prefix.
# tRNA names are left out on purpose: without a separator they can collide with
# nuclear genes (for example "MTTP").
HUMAN_MT_SUFFIXES = {"ND1","ND2","ND3","ND4","ND4L","ND5","ND6","CO1","CO2","CO3",
                     "ATP6","ATP8","CYB","RNR1","RNR2"}

MIN_GENES = 200          # cells with fewer detected genes are discarded
NMADS = 5                # MAD threshold used by every outlier test
SCDBLFINDER_SEED = 0     # R seed set before each scDblFinder run, for reproducibility
OUTPUT_DIR = main_path + "datanew/"


def _add_mt_hyphen(name, prefix, known_suffixes):
    """Return `name` with a hyphen inserted after a mitochondrial `prefix`.

    The previous implementation rewrote every name that started with the prefix
    (so "MTOR" became "MT-OR" and was then counted as mitochondrial). A name is
    now rewritten only when
      * the prefix is followed by "." or "_" (e.g. "MT.ND1" -> "MT-ND1"), or
      * the remainder, compared case-insensitively, is in `known_suffixes`
        (e.g. "MTND1" -> "MT-ND1").
    Every other name, including names already containing "<prefix>-", is
    returned unchanged.
    """
    if not name.startswith(prefix) or name.startswith(prefix + "-"):
        return name
    rest = name[len(prefix):]
    if rest[:1] in (".", "_"):
        return prefix + "-" + rest[1:]
    if rest.upper() in known_suffixes:
        return prefix + "-" + rest
    return name


def _ensembl_to_symbol(name, id_prefix, id2symbol):
    """Translate an Ensembl gene ID to its symbol when a usable one is known.

    `name` is returned unchanged when it does not start with `id_prefix`, when
    it is missing from `id2symbol`, or when the mapped symbol is NaN.
    """
    if name.startswith(id_prefix):
        symbol = id2symbol.get(name)
        if symbol is not None and not pd.isna(symbol):
            return symbol
    return name


def _harmonise_gene_names(names, is_human):
    """Return the gene names with a uniform mitochondrial prefix and Ensembl IDs mapped to symbols."""
    if is_human:
        return [
            _ensembl_to_symbol(_add_mt_hyphen(gene, "MT", HUMAN_MT_SUFFIXES), "ENSG", hen2sy)
            for gene in names
        ]
    mouse_upper = set(MOUSE_MT_UPPER)
    harmonised = []
    for gene in names:
        gene = _add_mt_hyphen(gene, "mt", mouse_upper)
        # Bare upper-case mouse mitochondrial symbols ("ND1") become "mt-Nd1".
        # NOTE(review): short names such as "TH" or "TF" are also symbols of nuclear
        # genes in upper-case annotations - confirm this rule is wanted for them.
        if gene in mouse_upper:
            gene = "mt-" + gene[0] + gene[1:].lower()
        harmonised.append(_ensembl_to_symbol(gene, "ENSMUSG", men2sy))
    return harmonised


def _raw_dir(dataset_id):
    """Return the folder that holds the raw files of `dataset_id`."""
    return main_path + dataset_id + "_RAW/"


def _list_sample_files(dataset_id):
    """Return the sorted file names found directly inside the dataset's raw folder.

    Raises FileNotFoundError when the folder does not exist.
    """
    for _, _, files in os.walk(_raw_dir(dataset_id)):
        return sorted(files)
    raise FileNotFoundError(_raw_dir(dataset_id))


def _flag_doublets(adata):
    """Run scDblFinder on `adata.X` and store its result in `adata.obs`.

    Adds the columns "doublet_score", "doublet" ("doublet"/"singlet") and
    "is_doublet" (bool). The matrix is passed to R as a dense genes x cells matrix.
    """
    counts = adata.X.T
    # Sparse matrices are densified; dense arrays (e.g. from read_text) are passed through.
    ro.globalenv["adata"] = counts.todense() if hasattr(counts, "todense") else counts
    ro.r("set.seed({})".format(SCDBLFINDER_SEED))
    ro.r('doublets <- scDblFinder(sce = as.matrix(adata), verbose=FALSE)')
    scores = np.asarray(ro.r('doublets$scDblFinder.score')).ravel()
    # Fetch the class as text so the result does not depend on how the active
    # rpy2 converter represents R factors (integer codes vs. labels).
    classes = np.asarray(ro.r('as.character(doublets$scDblFinder.class)')).ravel()
    flagged = classes == "doublet"
    adata.obs["doublet_score"] = scores
    adata.obs["doublet"] = ["doublet" if hit else "singlet" for hit in flagged]
    adata.obs["is_doublet"] = [bool(hit) for hit in flagged]


def preprocess_and_save(adata, sample_id, plot_doublets=False):
    """Clean one sample and write it to ``OUTPUT_DIR + sample_id + ".h5ad"``.

    Steps: harmonise gene names, drop cells with fewer than MIN_GENES genes,
    compute QC metrics, drop MAD-based outliers (library size, detected genes,
    top-20 gene fraction, mitochondrial fraction), drop scDblFinder doublets.

    Parameters
    ----------
    adata
        Raw cells x genes AnnData; it is modified in place before filtering.
    sample_id
        Sample identifier; decides human vs. mouse handling via `human_datasets`
        and names the output file.
    plot_doublets
        Also draw QC violins grouped by doublet call.

    Returns
    -------
    The filtered AnnData that was written to disk.
    """
    is_human = sample_id in human_datasets
    adata.var_names = pd.Index(_harmonise_gene_names(list(adata.var_names), is_human))

    # remove low sequencing quality cells
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    sc.pp.filter_cells(adata, min_genes=MIN_GENES)
    if is_human:
        adata.var["mt"] = adata.var_names.str.startswith("MT-")
    else:
        adata.var["mt"] = adata.var_names.str.startswith("mt-") | adata.var_names.isin(MOUSE_MT_UPPER)
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt"], inplace=True, percent_top=[20], log1p=True
    )
    adata.obs["outlier"] = (
        is_outlier(adata, "log1p_total_counts", NMADS)
        | is_outlier(adata, "log1p_n_genes_by_counts", NMADS)
        | is_outlier(adata, "pct_counts_in_top_20_genes", NMADS)
    )
    adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", NMADS)
    adata.obs["is_outlier"] = ["outlier" if flag else "non-outlier" for flag in adata.obs["outlier"]]
    adata.obs["is_mt_outlier"] = ["mt_outlier" if flag else "non-mt_outlier" for flag in adata.obs["mt_outlier"]]
    sc.pl.violin(adata, ["pct_counts_mt"], groupby="is_mt_outlier")

    print(f"Total number of cells: {adata.n_obs}")
    adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
    print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

    _flag_doublets(adata)
    if plot_doublets:
        sc.pl.violin(adata, ["log1p_total_counts","log1p_n_genes_by_counts","pct_counts_in_top_20_genes"], groupby="doublet")
    print(f"Total number of cells: {adata.n_obs}")
    adata = adata[(~adata.obs.is_doublet)].copy()
    print(f"Number of cells after filtering of doublets: {adata.n_obs}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    adata.write(OUTPUT_DIR + sample_id + ".h5ad")
    return adata


# 10x format, one sub-folder per sample
for dataset_id in raw_10x_mtx:
    print("\033[33mReading dataset {} ...\033[0m".format(dataset_id))
    sample_dirs = [dirpath for dirpath, _, _ in os.walk(_raw_dir(dataset_id))][1:]
    for sample_dir in sample_dirs:
        # NOTE(review): this call used to pass the string "False", which is truthy in
        # Python and therefore presumably behaved like gex_only=True. That behaviour is
        # kept; switch to False only if non gene-expression features are really wanted.
        adata = sc.read_10x_mtx(sample_dir, prefix="", gex_only=True)
        sample_id = os.path.basename(os.path.normpath(sample_dir))
        print("\033[32mReading sample {} ...\033[0m".format(sample_id))
        adata = preprocess_and_save(adata, sample_id)

# CSV format, one genes x cells file per sample
for dataset_id in raw_csv:
    print("\033[33mReading dataset {} ...\033[0m".format(dataset_id))
    for file_name in _list_sample_files(dataset_id):
        print("\033[33mReading sample {} ...\033[0m".format(file_name))
        adata = sc.read_text(_raw_dir(dataset_id) + file_name, delimiter=",").T
        sample_id = file_name.split("_")[0].split(".")[0]
        adata = preprocess_and_save(adata, sample_id)

# TXT format, one genes x cells file per sample
for dataset_id in raw_txt:
    print("\033[33mReading dataset {} ...\033[0m".format(dataset_id))
    for file_name in _list_sample_files(dataset_id):
        print("\033[33mReading sample {} ...\033[0m".format(file_name))
        adata = sc.read_text(_raw_dir(dataset_id) + file_name).T
        sample_id = file_name.split("_")[0].split(".")[0]
        adata = preprocess_and_save(adata, sample_id)

# "Tiled" 10x format: all samples in one folder, files named "<prefix>barcodes...",
# "<prefix>features..."/"<prefix>genes..." and "<prefix>matrix..."
for dataset_id in raw_tiled_10x_mtx:
    print("\033[33mReading dataset {} ...\033[0m".format(dataset_id))
    # Take the prefixes from the barcode files instead of assuming the directory
    # listing comes back in ordered groups of three.
    stems = [file_name.split(".")[0] for file_name in _list_sample_files(dataset_id)]
    prefixes = sorted({stem[:-8] for stem in stems if stem.endswith("barcodes")})
    for prefix in prefixes:
        # See the NOTE(review) on gex_only in the first loop.
        adata = sc.read_10x_mtx(_raw_dir(dataset_id), prefix=prefix, gex_only=True)
        sample_id = prefix.split("_")[0]
        print("\033[33mReading sample {} ...\033[0m".format(sample_id))
        adata = preprocess_and_save(adata, sample_id)

# 10x HDF5 format
for dataset_id in raw_h5:
    print("\033[33mReading dataset {} ...\033[0m".format(dataset_id))
    for file_name in _list_sample_files(dataset_id):
        print("\033[33mReading sample {} ...\033[0m".format(file_name))
        adata = sc.read_10x_h5(_raw_dir(dataset_id) + file_name)
        sample_id = file_name.split("_")[0]
        adata = preprocess_and_save(adata, sample_id, plot_doublets=True)

# h5ad format
for dataset_id in raw_h5ad:
    print("\033[33mReading dataset {} ...\033[0m".format(dataset_id))
    for file_name in _list_sample_files(dataset_id):
        print("\033[33mReading sample {} ...\033[0m".format(file_name))
        adata = sc.read_h5ad(_raw_dir(dataset_id) + file_name)
        sample_id = file_name.split("_")[0]
        adata = preprocess_and_save(adata, sample_id)