In [None]:
import scipy.io
import pandas as pd
import scanpy as sc

In [None]:
from pathlib import Path

# Locations used throughout the notebook: raw 10x sample folders (input),
# a general output directory, and the directory that receives .h5ad files.
path = Path("/mnt/f/pvn/outer/raw")
output_path = Path("/mnt/f/pvn/outer/step1/output/")
h5ad_path = Path("/mnt/f/pvn/outer/step1/")

# Fail fast when the raw-data directory is missing; otherwise list every
# sub-directory (one per sample) and display the list as the cell output.
if path.exists():
    sample_folders = list(filter(Path.is_dir, path.iterdir()))
else:
    raise FileNotFoundError(f"path {path} not exist")
sample_folders

In [None]:
import mygene
# Root directory holding one sub-folder per sample (same location as `path`
# in the configuration cell).
main_folder_path = Path("/mnt/f/pvn/outer/raw/")

if not main_folder_path.exists():
    raise FileNotFoundError(f"path {main_folder_path} not exist")

# Path.iterdir() yields entries in arbitrary order. This list decides the
# order in which samples are loaded and concatenated, so sort it to make the
# merged object reproducible across runs and filesystems.
sample_folders = sorted(
    folder for folder in main_folder_path.iterdir() if folder.is_dir()
)

# Shared MyGene.info client used by convert_ensembl_to_symbol().
mg = mygene.MyGeneInfo()

def convert_ensembl_to_symbol(ensembl_ids):
    """Translate Ensembl gene IDs to gene symbols via MyGene.info.

    Parameters
    ----------
    ensembl_ids : iterable of str
        Identifiers to look up (a list, tuple, pandas Index, ...). Entries
        that cannot be resolved are returned unchanged.

    Returns
    -------
    list of str
        One entry per input, in the same order: the gene symbol when the
        service returned one, otherwise the original identifier.

    Notes
    -----
    Uses the module-level ``mg`` client and performs a network request
    (mouse only, scope ``ensembl.gene``) unless the input is empty.
    """
    ensembl_ids = list(ensembl_ids)
    if not ensembl_ids:
        # Nothing to translate: skip the network round-trip entirely.
        return []

    # Query every distinct identifier only once, keeping first-seen order.
    unique_ids = list(dict.fromkeys(ensembl_ids))
    results = mg.querymany(unique_ids, scopes='ensembl.gene', fields='symbol', species='mouse')

    # A single query may come back with several hits, and some hits carry no
    # 'symbol' (e.g. not-found records). Keep the first hit that has a symbol
    # so a later, symbol-less hit cannot overwrite a valid mapping.
    # NOTE(review): this assumes hits for one query are listed best-first.
    ensembl_to_symbol = {}
    for result in results:
        query = result['query']
        symbol = result.get('symbol')
        if symbol and query not in ensembl_to_symbol:
            ensembl_to_symbol[query] = symbol

    # Unresolved identifiers fall back to themselves.
    symbols = [ensembl_to_symbol.get(x, x) for x in ensembl_ids]
    return symbols

# Load every sample's filtered 10x matrix into its own AnnData object.
adata_list = []

for sample_folder in sample_folders:
    filtered_matrix_folder = sample_folder / 'filtered_feature_bc_matrix'

    # Folders without a filtered matrix are skipped rather than aborting the run.
    if not filtered_matrix_folder.exists():
        print(f"warning: {filtered_matrix_folder} do not exist,skip")
        continue

    adata = sc.read_10x_mtx(
        str(filtered_matrix_folder),
        var_names='gene_symbols',
        cache=True
    )

    adata.var_names_make_unique()

    # Some features carry an Ensembl ID in place of a symbol. Look up only
    # names that *start* with 'ENS': a substring match would also fire on
    # ordinary symbols that merely contain those letters, and sending every
    # feature name to the web service is needlessly slow.
    var_names = list(adata.var_names)
    ensembl_names = [name for name in var_names if name.startswith('ENS')]
    if ensembl_names:
        converted = dict(zip(ensembl_names, convert_ensembl_to_symbol(ensembl_names)))
        adata.var_names = [converted.get(name, name) for name in var_names]

    # Different Ensembl IDs may resolve to the same symbol, so de-duplicate again.
    adata.var_names_make_unique()

    # Prefix barcodes with the sample name and record the sample of origin.
    sample_name = sample_folder.name
    adata.obs_names = [f"{sample_name}_{cell}" for cell in adata.obs_names]
    adata.obs['sample'] = sample_name
    adata_list.append(adata)


In [None]:
# The loading loop silently skips folders without a filtered matrix; stop
# with a clear message if that left nothing to merge.
if not adata_list:
    raise ValueError("adata_list is empty: no sample contained a filtered_feature_bc_matrix folder")

# Merge all samples; join='outer' keeps the union of genes across samples.
adata_concatenated = sc.concat(adata_list, join='outer')

# Cells per sample, as a quick sanity check of the merge.
print(adata_concatenated.obs['sample'].value_counts())
adata_concatenated.obs_names_make_unique()

# Make sure the target directory exists before writing the merged object.
h5ad_path.mkdir(parents=True, exist_ok=True)
adata_concatenated.write(str(h5ad_path / "merged_data.h5ad"))

In [None]:
# Run Scrublet doublet detection on the merged object. batch_key='sample'
# names the obs column that separates the batches; the call annotates
# adata_concatenated in place (the 'predicted_doublet' obs column is read
# by the filtering step that follows).
sc.pp.scrublet(
    adata_concatenated,
    expected_doublet_rate=0.05,
    threshold=0.25,
    batch_key='sample',
)

In [None]:
# Save the full object first, with the Scrublet annotations attached.
adata_concatenated.write(f"{h5ad_path}/merged_data_scrublet.h5ad")

# Drop the cells flagged as doublets and save the remainder.
# NOTE: adata_concatenated is rebound to the filtered subset here, so
# re-running this cell would write the already-filtered object to
# merged_data_scrublet.h5ad as well.
is_doublet = adata_concatenated.obs['predicted_doublet']
adata_concatenated = adata_concatenated[~is_doublet]
adata_concatenated.write(f"{h5ad_path}/merged_data_scrublet_filtered.h5ad")