# Notebook to run basic `scanpy` QC and doublet detection with `scrublet` for SRA Project - PRJEB39602

- **Developed by**: Srivalli Kolla

- **Created date** : 14 October, 2024

- **Modification date** : 31 October, 2024

- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

Env : Scanpy(Python 3.12.4)

# Import Packages

In [16]:
import anndata
import logging
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sb
import scrublet as scr
import os
import time
import matplotlib.pyplot as plt
from statsmodels.robust.scale import mad as median_abs_deviation
from matplotlib import colors
from matplotlib import rcParams

In [17]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')
timestamp = time.strftime("%d_%m_%Y")

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
asttokens           NA
attr                23.2.0
cffi                1.16.0
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.2
decorator           5.1.1
defusedxml          0.7.1
distutils           3.12.4
django              5.0.6
executing           2.0.1
h5py                3.11.0
igraph              0.11.5
ipykernel           6.29.5
ipython_genutils    0.2.0
ipywidgets          8.1.3
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.43.0
louvain             0.8.2
matplotlib          3.8.4
matplotlib_inline   0.1.7
mpl_toolkits        NA
natsort             8.4.0
numba               0.60.0
numexpr             2.10.1
numpy               1.26.4
packaging           24.1
pandas              2.2.2
parso  

# Import sample_names

In [18]:
path = '/mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/'
files = ['ERR6449801_GeneFull_raw.h5ad',
'ERR6449803_GeneFull_raw.h5ad',
'ERR6449820_GeneFull_raw.h5ad',
'ERR6449847_GeneFull_raw.h5ad',
'ERR6449908_GeneFull_raw.h5ad',
'ERR6449913_GeneFull_raw.h5ad',
'ERR6449914_GeneFull_raw.h5ad',
'ERR6449915_GeneFull_raw.h5ad',
'ERR6449920_GeneFull_raw.h5ad',
'ERR6449932_GeneFull_raw.h5ad',
'ERR6449933_GeneFull_raw.h5ad',
'ERR6449960_GeneFull_raw.h5ad',
'ERR6449969_GeneFull_raw.h5ad',
'ERR6449986_GeneFull_raw.h5ad',
'ERR6449990_GeneFull_raw.h5ad',
'ERR7423245_GeneFull_raw.h5ad',
'ERR7423246_GeneFull_raw.h5ad',
'ERR7423252_GeneFull_raw.h5ad',
'ERR7423268_GeneFull_raw.h5ad',
'ERR7423284_GeneFull_raw.h5ad',
'ERR7423293_GeneFull_raw.h5ad',
'ERR7423294_GeneFull_raw.h5ad',
'ERR7423297_GeneFull_raw.h5ad',
'ERR7423304_GeneFull_raw.h5ad',
'ERR7423313_GeneFull_raw.h5ad',
'ERR7423322_GeneFull_raw.h5ad',
'ERR7423335_GeneFull_raw.h5ad',
'ERR7423342_GeneFull_raw.h5ad',
'ERR7423350_GeneFull_raw.h5ad',
'ERR7423357_GeneFull_raw.h5ad',
'ERR7423362_GeneFull_raw.h5ad',
'ERR7423372_GeneFull_raw.h5ad',
'ERR7423384_GeneFull_raw.h5ad',
'ERR7423386_GeneFull_raw.h5ad',
'ERR7423420_GeneFull_raw.h5ad',
'ERR7423423_GeneFull_raw.h5ad',
'ERR7423466_GeneFull_raw.h5ad',
'ERR7423467_GeneFull_raw.h5ad',
'ERR7423472_GeneFull_raw.h5ad',
'ERR7423485_GeneFull_raw.h5ad',
'ERR6449746_GeneFull_raw.h5ad',
'ERR6449747_GeneFull_raw.h5ad',
'ERR6449750_GeneFull_raw.h5ad',
'ERR6449751_GeneFull_raw.h5ad',
'ERR6449756_GeneFull_raw.h5ad',
'ERR6449758_GeneFull_raw.h5ad',
'ERR6449761_GeneFull_raw.h5ad',
'ERR6449763_GeneFull_raw.h5ad',
'ERR6449765_GeneFull_raw.h5ad',
'ERR6449767_GeneFull_raw.h5ad',
'ERR6449775_GeneFull_raw.h5ad',
'ERR6449783_GeneFull_raw.h5ad',
'ERR6449789_GeneFull_raw.h5ad',
'ERR6449790_GeneFull_raw.h5ad',
'ERR6449792_GeneFull_raw.h5ad',
'ERR6449793_GeneFull_raw.h5ad',
'ERR6449799_GeneFull_raw.h5ad',
'ERR6449808_GeneFull_raw.h5ad',
'ERR6449810_GeneFull_raw.h5ad',
'ERR6449815_GeneFull_raw.h5ad',
'ERR6449818_GeneFull_raw.h5ad',
'ERR6449822_GeneFull_raw.h5ad',
'ERR6449824_GeneFull_raw.h5ad',
'ERR6449826_GeneFull_raw.h5ad',
'ERR6449828_GeneFull_raw.h5ad',
'ERR6449829_GeneFull_raw.h5ad',
'ERR6449831_GeneFull_raw.h5ad',
'ERR6449835_GeneFull_raw.h5ad',
'ERR6449837_GeneFull_raw.h5ad',
'ERR6449840_GeneFull_raw.h5ad',
'ERR6449845_GeneFull_raw.h5ad',
'ERR6449846_GeneFull_raw.h5ad',
'ERR6449849_GeneFull_raw.h5ad',
'ERR6449852_GeneFull_raw.h5ad',
'ERR6449853_GeneFull_raw.h5ad',
'ERR6449862_GeneFull_raw.h5ad',
'ERR6449866_GeneFull_raw.h5ad',
'ERR6449871_GeneFull_raw.h5ad',
'ERR6449872_GeneFull_raw.h5ad',
'ERR6449874_GeneFull_raw.h5ad',
'ERR6449878_GeneFull_raw.h5ad',
'ERR6449879_GeneFull_raw.h5ad',
'ERR6449884_GeneFull_raw.h5ad',
'ERR6449888_GeneFull_raw.h5ad',
'ERR6449892_GeneFull_raw.h5ad',
'ERR6449894_GeneFull_raw.h5ad',
'ERR6449902_GeneFull_raw.h5ad',
'ERR6449905_GeneFull_raw.h5ad',
'ERR6449912_GeneFull_raw.h5ad',
'ERR6449918_GeneFull_raw.h5ad',
'ERR6449924_GeneFull_raw.h5ad',
'ERR6449926_GeneFull_raw.h5ad',
'ERR6449934_GeneFull_raw.h5ad',
'ERR6449936_GeneFull_raw.h5ad',
'ERR6449938_GeneFull_raw.h5ad',
'ERR6449942_GeneFull_raw.h5ad',
'ERR6449944_GeneFull_raw.h5ad',
'ERR6449945_GeneFull_raw.h5ad',
'ERR6449950_GeneFull_raw.h5ad',
'ERR6449953_GeneFull_raw.h5ad',
'ERR6449954_GeneFull_raw.h5ad',
'ERR6449955_GeneFull_raw.h5ad',
'ERR6449959_GeneFull_raw.h5ad',
'ERR6449961_GeneFull_raw.h5ad',
'ERR6449962_GeneFull_raw.h5ad',
'ERR6449964_GeneFull_raw.h5ad',
'ERR6449965_GeneFull_raw.h5ad',
'ERR6449970_GeneFull_raw.h5ad',
'ERR6449971_GeneFull_raw.h5ad',
'ERR6449976_GeneFull_raw.h5ad',
'ERR6449982_GeneFull_raw.h5ad',
'ERR6449985_GeneFull_raw.h5ad',
'ERR6449989_GeneFull_raw.h5ad',
'ERR6449992_GeneFull_raw.h5ad',
'ERR6449996_GeneFull_raw.h5ad',
'ERR6449997_GeneFull_raw.h5ad',
'ERR6449998_GeneFull_raw.h5ad',
'ERR6449999_GeneFull_raw.h5ad',
'ERR7423235_GeneFull_raw.h5ad',
'ERR7423236_GeneFull_raw.h5ad',
'ERR7423237_GeneFull_raw.h5ad',
'ERR7423241_GeneFull_raw.h5ad',
'ERR7423243_GeneFull_raw.h5ad',
'ERR7423254_GeneFull_raw.h5ad',
'ERR7423258_GeneFull_raw.h5ad',
'ERR7423260_GeneFull_raw.h5ad',
'ERR7423262_GeneFull_raw.h5ad',
'ERR7423265_GeneFull_raw.h5ad',
'ERR7423270_GeneFull_raw.h5ad',
'ERR7423277_GeneFull_raw.h5ad',
'ERR7423279_GeneFull_raw.h5ad',
'ERR7423281_GeneFull_raw.h5ad',
'ERR7423288_GeneFull_raw.h5ad',
'ERR7423289_GeneFull_raw.h5ad',
'ERR7423292_GeneFull_raw.h5ad',
'ERR7423295_GeneFull_raw.h5ad',
'ERR7423296_GeneFull_raw.h5ad',
'ERR7423300_GeneFull_raw.h5ad',
'ERR7423302_GeneFull_raw.h5ad',
'ERR7423303_GeneFull_raw.h5ad',
'ERR7423307_GeneFull_raw.h5ad',
'ERR7423311_GeneFull_raw.h5ad',
'ERR7423312_GeneFull_raw.h5ad',
'ERR7423316_GeneFull_raw.h5ad',
'ERR7423318_GeneFull_raw.h5ad',
'ERR7423319_GeneFull_raw.h5ad',
'ERR7423320_GeneFull_raw.h5ad',
'ERR7423325_GeneFull_raw.h5ad',
'ERR7423326_GeneFull_raw.h5ad',
'ERR7423327_GeneFull_raw.h5ad',
'ERR7423330_GeneFull_raw.h5ad',
'ERR7423332_GeneFull_raw.h5ad',
'ERR7423336_GeneFull_raw.h5ad',
'ERR7423345_GeneFull_raw.h5ad',
'ERR7423347_GeneFull_raw.h5ad',
'ERR7423348_GeneFull_raw.h5ad',
'ERR7423351_GeneFull_raw.h5ad',
'ERR7423355_GeneFull_raw.h5ad',
'ERR7423356_GeneFull_raw.h5ad',
'ERR7423361_GeneFull_raw.h5ad',
'ERR7423370_GeneFull_raw.h5ad',
'ERR7423371_GeneFull_raw.h5ad',
'ERR7423377_GeneFull_raw.h5ad',
'ERR7423381_GeneFull_raw.h5ad',
'ERR7423383_GeneFull_raw.h5ad',
'ERR7423388_GeneFull_raw.h5ad',
'ERR7423389_GeneFull_raw.h5ad',
'ERR7423391_GeneFull_raw.h5ad',
'ERR7423393_GeneFull_raw.h5ad',
'ERR7423394_GeneFull_raw.h5ad',
'ERR7423397_GeneFull_raw.h5ad',
'ERR7423399_GeneFull_raw.h5ad',
'ERR7423401_GeneFull_raw.h5ad',
'ERR7423403_GeneFull_raw.h5ad',
'ERR7423405_GeneFull_raw.h5ad',
'ERR7423408_GeneFull_raw.h5ad',
'ERR7423409_GeneFull_raw.h5ad',
'ERR7423410_GeneFull_raw.h5ad',
'ERR7423411_GeneFull_raw.h5ad',
'ERR7423412_GeneFull_raw.h5ad',
'ERR7423415_GeneFull_raw.h5ad',
'ERR7423418_GeneFull_raw.h5ad',
'ERR7423430_GeneFull_raw.h5ad',
'ERR7423431_GeneFull_raw.h5ad',
'ERR7423432_GeneFull_raw.h5ad',
'ERR7423437_GeneFull_raw.h5ad',
'ERR7423439_GeneFull_raw.h5ad',
'ERR7423440_GeneFull_raw.h5ad',
'ERR7423445_GeneFull_raw.h5ad',
'ERR7423446_GeneFull_raw.h5ad',
'ERR7423447_GeneFull_raw.h5ad',
'ERR7423470_GeneFull_raw.h5ad',
'ERR7423473_GeneFull_raw.h5ad',
'ERR7423477_GeneFull_raw.h5ad',
'ERR7423478_GeneFull_raw.h5ad',
'ERR7423482_GeneFull_raw.h5ad',
'ERR7423483_GeneFull_raw.h5ad',
'ERR6449745_GeneFull_raw.h5ad',
'ERR6449748_GeneFull_raw.h5ad',
'ERR6449749_GeneFull_raw.h5ad',
'ERR6449752_GeneFull_raw.h5ad',
'ERR6449753_GeneFull_raw.h5ad',
'ERR6449754_GeneFull_raw.h5ad',
'ERR6449755_GeneFull_raw.h5ad',
'ERR6449757_GeneFull_raw.h5ad',
'ERR6449759_GeneFull_raw.h5ad',
'ERR6449760_GeneFull_raw.h5ad',
'ERR6449762_GeneFull_raw.h5ad',
'ERR6449764_GeneFull_raw.h5ad',
'ERR6449766_GeneFull_raw.h5ad',
'ERR6449768_GeneFull_raw.h5ad',
'ERR6449769_GeneFull_raw.h5ad',
'ERR6449770_GeneFull_raw.h5ad',
'ERR6449771_GeneFull_raw.h5ad',
'ERR6449772_GeneFull_raw.h5ad',
'ERR6449773_GeneFull_raw.h5ad',
'ERR6449774_GeneFull_raw.h5ad',
'ERR6449776_GeneFull_raw.h5ad',
'ERR6449777_GeneFull_raw.h5ad',
'ERR6449778_GeneFull_raw.h5ad',
'ERR6449779_GeneFull_raw.h5ad',
'ERR6449780_GeneFull_raw.h5ad',
'ERR6449781_GeneFull_raw.h5ad',
'ERR6449782_GeneFull_raw.h5ad',
'ERR6449784_GeneFull_raw.h5ad',
'ERR6449785_GeneFull_raw.h5ad',
'ERR6449786_GeneFull_raw.h5ad',
'ERR6449787_GeneFull_raw.h5ad',
'ERR6449788_GeneFull_raw.h5ad',
'ERR6449791_GeneFull_raw.h5ad',
'ERR6449794_GeneFull_raw.h5ad',
'ERR6449795_GeneFull_raw.h5ad',
'ERR6449796_GeneFull_raw.h5ad',
'ERR6449797_GeneFull_raw.h5ad',
'ERR6449798_GeneFull_raw.h5ad',
'ERR6449800_GeneFull_raw.h5ad',
'ERR6449802_GeneFull_raw.h5ad',
'ERR6449804_GeneFull_raw.h5ad',
'ERR6449805_GeneFull_raw.h5ad',
'ERR6449806_GeneFull_raw.h5ad',
'ERR6449807_GeneFull_raw.h5ad',
'ERR6449809_GeneFull_raw.h5ad',
'ERR6449811_GeneFull_raw.h5ad',
'ERR6449812_GeneFull_raw.h5ad',
'ERR6449813_GeneFull_raw.h5ad',
'ERR6449814_GeneFull_raw.h5ad',
'ERR6449816_GeneFull_raw.h5ad',
'ERR6449817_GeneFull_raw.h5ad',
'ERR6449819_GeneFull_raw.h5ad',
'ERR6449821_GeneFull_raw.h5ad',
'ERR6449823_GeneFull_raw.h5ad',
'ERR6449825_GeneFull_raw.h5ad',
'ERR6449827_GeneFull_raw.h5ad',
'ERR6449830_GeneFull_raw.h5ad',
'ERR6449832_GeneFull_raw.h5ad',
'ERR6449833_GeneFull_raw.h5ad',
'ERR6449834_GeneFull_raw.h5ad',
'ERR6449836_GeneFull_raw.h5ad',
'ERR6449838_GeneFull_raw.h5ad',
'ERR6449839_GeneFull_raw.h5ad',
'ERR6449841_GeneFull_raw.h5ad',
'ERR6449842_GeneFull_raw.h5ad',
'ERR6449843_GeneFull_raw.h5ad',
'ERR6449844_GeneFull_raw.h5ad',
'ERR6449848_GeneFull_raw.h5ad',
'ERR6449850_GeneFull_raw.h5ad',
'ERR6449851_GeneFull_raw.h5ad',
'ERR6449854_GeneFull_raw.h5ad',
'ERR6449855_GeneFull_raw.h5ad',
'ERR6449856_GeneFull_raw.h5ad',
'ERR6449857_GeneFull_raw.h5ad',
'ERR6449858_GeneFull_raw.h5ad',
'ERR6449859_GeneFull_raw.h5ad',
'ERR6449860_GeneFull_raw.h5ad',
'ERR6449861_GeneFull_raw.h5ad',
'ERR6449863_GeneFull_raw.h5ad',
'ERR6449864_GeneFull_raw.h5ad',
'ERR6449865_GeneFull_raw.h5ad',
'ERR6449867_GeneFull_raw.h5ad',
'ERR6449868_GeneFull_raw.h5ad',
'ERR6449869_GeneFull_raw.h5ad',
'ERR6449870_GeneFull_raw.h5ad',
'ERR6449873_GeneFull_raw.h5ad',
'ERR6449875_GeneFull_raw.h5ad',
'ERR6449876_GeneFull_raw.h5ad',
'ERR6449877_GeneFull_raw.h5ad',
'ERR6449880_GeneFull_raw.h5ad',
'ERR6449881_GeneFull_raw.h5ad',
'ERR6449882_GeneFull_raw.h5ad',
'ERR6449883_GeneFull_raw.h5ad',
'ERR6449885_GeneFull_raw.h5ad',
'ERR6449886_GeneFull_raw.h5ad',
'ERR6449887_GeneFull_raw.h5ad',
'ERR6449889_GeneFull_raw.h5ad',
'ERR6449890_GeneFull_raw.h5ad',
'ERR6449891_GeneFull_raw.h5ad',
'ERR6449893_GeneFull_raw.h5ad',
'ERR6449895_GeneFull_raw.h5ad',
'ERR6449896_GeneFull_raw.h5ad',
'ERR6449897_GeneFull_raw.h5ad',
'ERR6449898_GeneFull_raw.h5ad',
'ERR6449899_GeneFull_raw.h5ad',
'ERR6449900_GeneFull_raw.h5ad',
'ERR6449901_GeneFull_raw.h5ad',
'ERR6449903_GeneFull_raw.h5ad',
'ERR6449904_GeneFull_raw.h5ad',
'ERR6449906_GeneFull_raw.h5ad',
'ERR6449907_GeneFull_raw.h5ad',
'ERR6449909_GeneFull_raw.h5ad',
'ERR6449910_GeneFull_raw.h5ad',
'ERR6449911_GeneFull_raw.h5ad',
'ERR6449916_GeneFull_raw.h5ad',
'ERR6449917_GeneFull_raw.h5ad',
'ERR6449919_GeneFull_raw.h5ad',
'ERR6449921_GeneFull_raw.h5ad',
'ERR6449922_GeneFull_raw.h5ad',
'ERR6449923_GeneFull_raw.h5ad',
'ERR6449925_GeneFull_raw.h5ad',
'ERR6449927_GeneFull_raw.h5ad',
'ERR6449928_GeneFull_raw.h5ad',
'ERR6449929_GeneFull_raw.h5ad',
'ERR6449930_GeneFull_raw.h5ad',
'ERR6449931_GeneFull_raw.h5ad',
'ERR6449935_GeneFull_raw.h5ad',
'ERR6449937_GeneFull_raw.h5ad',
'ERR6449939_GeneFull_raw.h5ad',
'ERR6449940_GeneFull_raw.h5ad',
'ERR6449941_GeneFull_raw.h5ad',
'ERR6449943_GeneFull_raw.h5ad',
'ERR6449946_GeneFull_raw.h5ad',
'ERR6449947_GeneFull_raw.h5ad',
'ERR6449948_GeneFull_raw.h5ad',
'ERR6449949_GeneFull_raw.h5ad',
'ERR6449951_GeneFull_raw.h5ad',
'ERR6449952_GeneFull_raw.h5ad',
'ERR6449956_GeneFull_raw.h5ad',
'ERR6449957_GeneFull_raw.h5ad',
'ERR6449958_GeneFull_raw.h5ad',
'ERR6449963_GeneFull_raw.h5ad',
'ERR6449966_GeneFull_raw.h5ad',
'ERR6449967_GeneFull_raw.h5ad',
'ERR6449968_GeneFull_raw.h5ad',
'ERR6449972_GeneFull_raw.h5ad',
'ERR6449973_GeneFull_raw.h5ad',
'ERR6449974_GeneFull_raw.h5ad',
'ERR6449975_GeneFull_raw.h5ad',
'ERR6449977_GeneFull_raw.h5ad',
'ERR6449978_GeneFull_raw.h5ad',
'ERR6449979_GeneFull_raw.h5ad',
'ERR6449980_GeneFull_raw.h5ad',
'ERR6449981_GeneFull_raw.h5ad',
'ERR6449983_GeneFull_raw.h5ad',
'ERR6449984_GeneFull_raw.h5ad',
'ERR6449987_GeneFull_raw.h5ad',
'ERR6449988_GeneFull_raw.h5ad',
'ERR6449991_GeneFull_raw.h5ad',
'ERR6449993_GeneFull_raw.h5ad',
'ERR6449994_GeneFull_raw.h5ad',
'ERR6449995_GeneFull_raw.h5ad',
'ERR6450000_GeneFull_raw.h5ad',
'ERR7423232_GeneFull_raw.h5ad',
'ERR7423233_GeneFull_raw.h5ad',
'ERR7423234_GeneFull_raw.h5ad',
'ERR7423251_GeneFull_raw.h5ad',
'ERR7423285_GeneFull_raw.h5ad',
'ERR7423286_GeneFull_raw.h5ad',
'ERR7423287_GeneFull_raw.h5ad',
'ERR7423290_GeneFull_raw.h5ad',
'ERR7423291_GeneFull_raw.h5ad',
'ERR7423298_GeneFull_raw.h5ad',
'ERR7423299_GeneFull_raw.h5ad',
'ERR7423301_GeneFull_raw.h5ad',
'ERR7423305_GeneFull_raw.h5ad',
'ERR7423306_GeneFull_raw.h5ad',
'ERR7423308_GeneFull_raw.h5ad',
'ERR7423324_GeneFull_raw.h5ad',
'ERR7423328_GeneFull_raw.h5ad',
'ERR7423329_GeneFull_raw.h5ad',
'ERR7423331_GeneFull_raw.h5ad',
'ERR7423333_GeneFull_raw.h5ad',
'ERR7423334_GeneFull_raw.h5ad',
'ERR7423337_GeneFull_raw.h5ad',
'ERR7423338_GeneFull_raw.h5ad',
'ERR7423339_GeneFull_raw.h5ad',
'ERR7423340_GeneFull_raw.h5ad',
'ERR7423341_GeneFull_raw.h5ad',
'ERR7423343_GeneFull_raw.h5ad',
'ERR7423344_GeneFull_raw.h5ad',
'ERR7423346_GeneFull_raw.h5ad',
'ERR7423349_GeneFull_raw.h5ad',
'ERR7423366_GeneFull_raw.h5ad',
'ERR7423367_GeneFull_raw.h5ad',
'ERR7423368_GeneFull_raw.h5ad',
'ERR7423369_GeneFull_raw.h5ad',
'ERR7423373_GeneFull_raw.h5ad',
'ERR7423374_GeneFull_raw.h5ad',
'ERR7423375_GeneFull_raw.h5ad',
'ERR7423376_GeneFull_raw.h5ad',
'ERR7423378_GeneFull_raw.h5ad',
'ERR7423398_GeneFull_raw.h5ad',
'ERR7423400_GeneFull_raw.h5ad',
'ERR7423402_GeneFull_raw.h5ad',
'ERR7423404_GeneFull_raw.h5ad',
'ERR7423413_GeneFull_raw.h5ad',
'ERR7423414_GeneFull_raw.h5ad',
'ERR7423419_GeneFull_raw.h5ad',
'ERR7423421_GeneFull_raw.h5ad',
'ERR7423427_GeneFull_raw.h5ad',
'ERR7423428_GeneFull_raw.h5ad',
'ERR7423429_GeneFull_raw.h5ad',
'ERR7423433_GeneFull_raw.h5ad',
'ERR7423434_GeneFull_raw.h5ad',
'ERR7423435_GeneFull_raw.h5ad',
'ERR7423436_GeneFull_raw.h5ad',
'ERR7423438_GeneFull_raw.h5ad',
'ERR7423441_GeneFull_raw.h5ad',
'ERR7423442_GeneFull_raw.h5ad',
'ERR7423453_GeneFull_raw.h5ad',
'ERR7423454_GeneFull_raw.h5ad',
'ERR7423455_GeneFull_raw.h5ad',
'ERR7423456_GeneFull_raw.h5ad',
'ERR7423457_GeneFull_raw.h5ad',
'ERR7423465_GeneFull_raw.h5ad',
'ERR7423468_GeneFull_raw.h5ad',
'ERR7423469_GeneFull_raw.h5ad',
'ERR7423471_GeneFull_raw.h5ad',
'ERR7423487_GeneFull_raw.h5ad',
'ERR7423238_GeneFull_raw.h5ad',
'ERR7423239_GeneFull_raw.h5ad',
'ERR7423240_GeneFull_raw.h5ad',
'ERR7423242_GeneFull_raw.h5ad',
'ERR7423244_GeneFull_raw.h5ad',
'ERR7423247_GeneFull_raw.h5ad',
'ERR7423248_GeneFull_raw.h5ad',
'ERR7423249_GeneFull_raw.h5ad',
'ERR7423250_GeneFull_raw.h5ad',
'ERR7423253_GeneFull_raw.h5ad',
'ERR7423255_GeneFull_raw.h5ad',
'ERR7423256_GeneFull_raw.h5ad',
'ERR7423257_GeneFull_raw.h5ad',
'ERR7423259_GeneFull_raw.h5ad',
'ERR7423261_GeneFull_raw.h5ad',
'ERR7423263_GeneFull_raw.h5ad',
'ERR7423264_GeneFull_raw.h5ad',
'ERR7423266_GeneFull_raw.h5ad',
'ERR7423267_GeneFull_raw.h5ad',
'ERR7423269_GeneFull_raw.h5ad',
'ERR7423271_GeneFull_raw.h5ad',
'ERR7423272_GeneFull_raw.h5ad',
'ERR7423273_GeneFull_raw.h5ad',
'ERR7423274_GeneFull_raw.h5ad',
'ERR7423275_GeneFull_raw.h5ad',
'ERR7423276_GeneFull_raw.h5ad',
'ERR7423278_GeneFull_raw.h5ad',
'ERR7423280_GeneFull_raw.h5ad',
'ERR7423282_GeneFull_raw.h5ad',
'ERR7423283_GeneFull_raw.h5ad',
'ERR7423309_GeneFull_raw.h5ad',
'ERR7423310_GeneFull_raw.h5ad',
'ERR7423314_GeneFull_raw.h5ad',
'ERR7423315_GeneFull_raw.h5ad',
'ERR7423317_GeneFull_raw.h5ad',
'ERR7423321_GeneFull_raw.h5ad',
'ERR7423323_GeneFull_raw.h5ad',
'ERR7423352_GeneFull_raw.h5ad',
'ERR7423353_GeneFull_raw.h5ad',
'ERR7423354_GeneFull_raw.h5ad',
'ERR7423358_GeneFull_raw.h5ad',
'ERR7423359_GeneFull_raw.h5ad',
'ERR7423360_GeneFull_raw.h5ad',
'ERR7423363_GeneFull_raw.h5ad',
'ERR7423364_GeneFull_raw.h5ad',
'ERR7423365_GeneFull_raw.h5ad',
'ERR7423379_GeneFull_raw.h5ad',
'ERR7423380_GeneFull_raw.h5ad',
'ERR7423382_GeneFull_raw.h5ad',
'ERR7423385_GeneFull_raw.h5ad',
'ERR7423387_GeneFull_raw.h5ad',
'ERR7423390_GeneFull_raw.h5ad',
'ERR7423392_GeneFull_raw.h5ad',
'ERR7423395_GeneFull_raw.h5ad',
'ERR7423396_GeneFull_raw.h5ad',
'ERR7423406_GeneFull_raw.h5ad',
'ERR7423407_GeneFull_raw.h5ad',
'ERR7423416_GeneFull_raw.h5ad',
'ERR7423417_GeneFull_raw.h5ad',
'ERR7423422_GeneFull_raw.h5ad',
'ERR7423424_GeneFull_raw.h5ad',
'ERR7423425_GeneFull_raw.h5ad',
'ERR7423426_GeneFull_raw.h5ad',
'ERR7423443_GeneFull_raw.h5ad',
'ERR7423444_GeneFull_raw.h5ad',
'ERR7423448_GeneFull_raw.h5ad',
'ERR7423449_GeneFull_raw.h5ad',
'ERR7423450_GeneFull_raw.h5ad',
'ERR7423451_GeneFull_raw.h5ad',
'ERR7423452_GeneFull_raw.h5ad',
'ERR7423458_GeneFull_raw.h5ad',
'ERR7423459_GeneFull_raw.h5ad',
'ERR7423460_GeneFull_raw.h5ad',
'ERR7423461_GeneFull_raw.h5ad',
'ERR7423462_GeneFull_raw.h5ad',
'ERR7423463_GeneFull_raw.h5ad',
'ERR7423464_GeneFull_raw.h5ad',
'ERR7423474_GeneFull_raw.h5ad',
'ERR7423475_GeneFull_raw.h5ad',
'ERR7423476_GeneFull_raw.h5ad',
'ERR7423479_GeneFull_raw.h5ad',
'ERR7423480_GeneFull_raw.h5ad',
'ERR7423481_GeneFull_raw.h5ad',
'ERR7423484_GeneFull_raw.h5ad',
'ERR7423486_GeneFull_raw.h5ad',]

In [25]:
adata_combined = None
vars_combined = []

for file in files:
    file_path = os.path.join(path, file)
    
    try:

        adata = sc.read_h5ad(file_path)

        sample_name = os.path.basename(file_path).split('.')[0]
        adata.obs['sample_name'] = sample_name


        sc.pp.filter_cells(adata, min_counts=10)
        sc.pp.filter_genes(adata, min_counts=10)


        adata.var_names = adata.var_names.str.split('.').str[0]
        adata.var_names = [f"{name}_{sample_name}" for name in adata.var_names]

        vars_combined.extend(adata.var_names)


        if adata_combined is None:
            adata_combined = adata
        else:
            adata_combined = sc.concat([adata_combined, adata], join='inner', index_unique='-')

        print(f"Successfully read and concatenated: {file}")

    except Exception as e:
        print(f"Error reading {file}: {e}")

unique_var_names = pd.Series(vars_combined).unique()
adata_combined.var_names = unique_var_names[:adata_combined.n_vars] 
adata_combined

filtered out 689744 cells that have less than 10 counts
filtered out 45901 genes that are detected in less than 10 counts
Successfully read and concatenated: ERR6449801_GeneFull_raw.h5ad
filtered out 647278 cells that have less than 10 counts
filtered out 42075 genes that are detected in less than 10 counts
Successfully read and concatenated: ERR6449803_GeneFull_raw.h5ad
Error reading ERR6449820_GeneFull_raw.h5ad: [Errno 2] Unable to synchronously open file (unable to open file: name = '/mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449820_GeneFull_raw.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
filtered out 6729293 cells that have less than 10 counts
filtered out 41816 genes that are detected in less than 10 counts
Successfully read and concatenated: ERR6449847_GeneFull_raw.h5ad
filtered out 6716573 cells that have less than 10 counts
filtered out 42359 genes that are detected in less than 10 counts
Successfully read and concat

AnnData object with n_obs × n_vars = 5260772 × 0
    obs: 'sample_name', 'n_counts'
    layers: 'spliced'

In [20]:
adata_combined

AnnData object with n_obs × n_vars = 5260772 × 1512502
    obs: 'sample_name', 'n_counts'
    layers: 'spliced'

In [21]:
adata_combined.obs

Unnamed: 0,sample_name,n_counts
AAACCTGAGAATTGTG-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,ERR6449801_GeneFull_raw,23
AAACCTGAGACAATAC-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,ERR6449801_GeneFull_raw,24
AAACCTGAGACCGGAT-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,ERR6449801_GeneFull_raw,15
AAACCTGAGAGAACAG-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,ERR6449801_GeneFull_raw,11
AAACCTGAGAGGGATA-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0,ERR6449801_GeneFull_raw,11
...,...,...
TTTGTTGTCTACGCAA-1,ERR7423287_GeneFull_raw,59
TTTGTTGTCTATCGTT-1,ERR7423287_GeneFull_raw,10
TTTGTTGTCTCGTCGT-1,ERR7423287_GeneFull_raw,11
TTTGTTGTCTGCGGGT-1,ERR7423287_GeneFull_raw,45


In [22]:
adata_combined.var

WASH7P_ERR6449801_GeneFull_raw
ENSG00000293331_ERR6449801_GeneFull_raw
MTND1P23_ERR6449801_GeneFull_raw
MTND2P28_ERR6449801_GeneFull_raw
MTCO1P12_ERR6449801_GeneFull_raw
...
MT-ND6_ERR7423287_GeneFull_raw
MT-TE_ERR7423287_GeneFull_raw
MT-CYB_ERR7423287_GeneFull_raw
MT-TT_ERR7423287_GeneFull_raw
MT-TP_ERR7423287_GeneFull_raw


In [23]:
adata_combined.obs['sample_name'].value_counts()

sample_name
ERR7423286_GeneFull_raw    101676
ERR7423294_GeneFull_raw    100212
ERR7423322_GeneFull_raw     96905
ERR7423285_GeneFull_raw     93472
ERR6449803_GeneFull_raw     90002
                            ...  
ERR6449969_GeneFull_raw     63633
ERR6449981_GeneFull_raw     63229
ERR6449980_GeneFull_raw     63178
ERR7423304_GeneFull_raw     51730
ERR6449801_GeneFull_raw     47536
Name: count, Length: 71, dtype: int64

## Doublet score prediction

In [24]:
scrub = scr.Scrublet(adata_combined.X)

doublet_scores, predicted_doublets = scrub.scrub_doublets()
            
adata_combined.obs['doublet_scores'] = doublet_scores
adata_combined.obs['predicted_doublets'] = predicted_doublets

Preprocessing...
Simulating doublets...


MemoryError: Unable to allocate 7.92 TiB for an array with shape (5260772, 207031) and data type float64

In [None]:
adata_combined.obs

### Checking the count and percentage of Doublets - sample_name level

In [None]:
doub_tab = pd.crosstab(adata_combined.obs['sample_name'],adata_combined.obs['predicted_doublets'])
doub_tab.sum()

In [None]:
true_doublets = adata_combined.obs['predicted_doublets'] == True
true_doublets_count = true_doublets.sum()

true_doublets_percentage = (true_doublets_count / len(adata_combined.obs)) * 100

true_doublets_count ,true_doublets_percentage

### Saving raw data

In [None]:
sample_name_object = adata_combined.copy()
sample_name_object

## Compute QC stats

In [None]:
sample_name_object.shape

### Labelling Mt and Ribo genes

In [None]:
sample_name_object.var

In [None]:
sample_name_object.var.index = sample_name_object.var.index.str.split('_').str[0]
sample_name_object.var

In [None]:
sample_name_object.var['ensembl'] = sample_name_object.var.index
sample_name_object.var 

### Ensembl annotations

In [18]:
annot = sc.queries.biomart_annotations(
        "hsapiens",
        ["ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"],
    ).set_index("ensembl_gene_id")

In [None]:
annot.head()

In [None]:
sample_name_object.var

In [None]:
sample_name_object.var['gene_name'] = sample_name_object.var.index.map(annot['external_gene_name'])
sample_name_object.var.index =sample_name_object.var['gene_name'] 
sample_name_object.var

In [None]:
sample_name_object.var['mt'] = sample_name_object.var_names.str.startswith('MT-') 
sample_name_object.var['ribo'] = sample_name_object.var_names.str.startswith(("RPS","RPL"))
sample_name_object.var

In [None]:
ribo_counts = sample_name_object.var['ribo'].value_counts()

mt_counts = sample_name_object.var['mt'].value_counts()

print("Counts of Ribosomal (ribo) Genes:")
print("False:", ribo_counts.get(False, 0))
print("True:", ribo_counts.get(True, 0))
print("\nCounts of Mitochondrial (mt) Genes:")
print("False:", mt_counts.get(False, 0))
print("True:", mt_counts.get(True, 0))

In [None]:
sample_name_object.var['mt'] = sample_name_object.var['mt'].fillna(False)
sample_name_object.var['ribo'] = sample_name_object.var['ribo'].fillna(False)

### Calculating QC metrics per cell

In [25]:
sc.pp.calculate_qc_metrics(sample_name_object,qc_vars = ['mt','ribo'],inplace = True)

In [None]:
sample_name_object

## Sex covariate analysis

### Chr Y genes calculation

In [27]:
sample_name_object.var['gene_name'] = sample_name_object.var['ensembl'].map(annot['external_gene_name'])
sample_name_object.var['chromosome'] = sample_name_object.var['ensembl'].map(annot['chromosome_name'])

In [None]:
sample_name_object.var

In [None]:
chrY_genes = sample_name_object.var['chromosome'] == "Y"
chrY_genes

In [None]:
sample_name_object.obs['percent_chrY'] = np.sum(
    sample_name_object[:, chrY_genes].X, axis = 1) / np.sum(sample_name_object.X, axis = 1) * 100

In [None]:
sample_name_object

### XIST counts

In [None]:
sample_name_object.var_names

## Calculate cell cycle scores

### Downloading the list of cell cycle genes

In [33]:
!if [ ! -f ../ncbi_sra/data/regev_lab_cell_cycle_genes.txt ]; then curl -o ../ncbi_sra/data/regev_lab_cell_cycle_genes.txt https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt; fi

### Marking cell cycle genes

#### Steps followed

1. Loading genes and captilizing 
2. Printing the length of cell cycle genes list
3. Split genes into 2 lists (#First 43 genes,#Gene 43 to end)
4. Filtering cell cycle genes only if present in processed_gene_names
5. Print the list of cell cycle genes observed in our data

In [None]:
cell_cycle_genes = [x.strip() for x in open('../ncbi_sra/data/regev_lab_cell_cycle_genes.txt')]
#cell_cycle_genes = [gene.capitalize() for gene in cell_cycle_genes]
print(len(cell_cycle_genes))

s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]

cell_cycle_genes = [x for x in cell_cycle_genes if x in sample_name_object.var_names]
print(len(cell_cycle_genes))

In [None]:
cell_cycle_genes

### Creating basic anndata and normalization for cell cycle score calculation

In [None]:
adata_combined_log = anndata.AnnData(X = sample_name_object.X,  var = sample_name_object.var, obs = sample_name_object.obs)
sc.pp.normalize_total(adata_combined_log, target_sum = 1e6, exclude_highly_expressed = True)
sc.pp.log1p(adata_combined_log)

### Cell cycle score calculation

In [37]:
adata_combined_log.var_names = adata_combined_log.var_names.astype(str)
adata_combined_log.var_names_make_unique()

In [None]:
sc.tl.score_genes_cell_cycle(adata_combined_log, s_genes = s_genes, g2m_genes = g2m_genes)

sample_name_object.obs['S_score'] = adata_combined_log.obs['S_score']
sample_name_object.obs['G2M_score'] = adata_combined_log.obs['G2M_score']
sample_name_object.obs['phase'] = adata_combined_log.obs['phase']

sample_name_object

In [None]:
cell_cycle_counts = sample_name_object.obs['phase'].value_counts()

cell_cycle_counts

In [None]:
sb.countplot(data=sample_name_object.obs, x='phase')

## Data visualization

In [None]:
variables = 'n_genes_by_counts', 'total_counts', 'doublet_scores', 'G2M_score', 'S_score'

for var in variables:

    fig, ax = plt.subplots(figsize=(12, 6), ncols=2, gridspec_kw={'width_ratios': [4, 1]})

    sb.violinplot(data=sample_name_object.obs,x = 'sample_name' , y=var, ax=ax[0])
   
    medians = sample_name_object.obs.groupby('sample_name')[var].median()

    for sample_name, median in medians.items():
        ax[0].text(sample_name, median, f'{median:.2f}', ha='center', va='bottom', color='black', fontsize=10)
    
    ax[0].set_title(f'Violin Plot of {var} by sample_name - Before filtering')
    ax[0].set_xlabel('sample_name')
    ax[0].set_ylabel(var)
    ax[0].tick_params(axis='x', rotation=45)

    median_df = pd.DataFrame({'sample_name': medians.index, 'Median': medians.values})

    ax[1].axis('off')
    ax[1].table(cellText=median_df.values, colLabels=median_df.columns, loc='center')
    ax[1].set_title('Median Values')
    
    plt.tight_layout()
    plt.show()


### Visualization of qc metrics

In [None]:
variables = ['pct_counts_mt', 'pct_counts_ribo']

sb.violinplot(data=sample_name_object.obs[variables])
plt.xticks(rotation=45)
plt.title(f'Mt and Ribo percentages - Before filtering')

In [None]:
plt.figure(figsize=(10, 6))
sb.scatterplot(data=sample_name_object.obs, x='total_counts', y='n_genes_by_counts' , alpha = 0.4, s=4)
#plt.xticks(range(0, int(max(sample_name_object.obs['total_counts'])) + 1, 3000),rotation=45, fontsize = 10)
#plt.yticks(range(0, int(max(sample_name_object.obs['n_genes_by_counts'])) + 1, 1000),fontsize = 10)
plt.title(f'Counts vs Genes - Before filtering')
plt.show()

### Filtering based on QC metrics

In [None]:
filtered_object = sample_name_object[sample_name_object.obs['n_genes_by_counts'] > 10]
filtered_object = filtered_object[filtered_object.obs['n_genes_by_counts'] < 400]

filtered_object = filtered_object[filtered_object.obs['total_counts'] > 10]
filtered_object = filtered_object[filtered_object.obs['total_counts'] < 400]

filtered_object = filtered_object[filtered_object.obs['pct_counts_mt'] < 60]
filtered_object = filtered_object[filtered_object.obs['pct_counts_ribo'] < 20]

filtered_object = filtered_object[filtered_object.obs['doublet_scores'] < 0.35]

filtered_object

In [None]:
variables = ['pct_counts_mt', 'pct_counts_ribo']

sb.violinplot(data=filtered_object.obs[variables])
plt.xticks(rotation=45)
plt.title(f'Mt and Ribo percentages - After filtering')

In [None]:
sb.set(style = "whitegrid")
covariate_to_visualize = 'total_counts'

plt.figure(figsize = (10, 6))
sb.histplot(data = filtered_object.obs, x = covariate_to_visualize, stat = 'count', common_norm = False)
plt.xlabel(covariate_to_visualize)
plt.ylabel('Abundance')
plt.title(f'Abundance Plot of {covariate_to_visualize} by sample_name - After filtering')
plt.show()

In [None]:
sb.set(style = "whitegrid")
covariate_to_visualize = 'n_genes_by_counts'

plt.figure(figsize = (10, 6))
sb.histplot(data = filtered_object.obs, x = covariate_to_visualize, stat = 'count', common_norm = False)
plt.xlabel(covariate_to_visualize)
plt.ylabel('Abundance')
plt.title(f'Abundance Plot of {covariate_to_visualize} by sample_name - After filtering')
plt.show()

In [None]:
variables = 'n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo', 'doublet_scores', 'G2M_score', 'S_score' 

for var in variables:

    fig, ax = plt.subplots(figsize=(12, 6), ncols=2, gridspec_kw={'width_ratios': [4, 1]})

    sb.violinplot(data=filtered_object.obs, x='sample_name', y=var, ax=ax[0])
   
    medians = filtered_object.obs.groupby('sample_name')[var].median()

    for sample_name, median in medians.items():
        ax[0].text(sample_name, median, f'{median:.2f}', ha='center', va='bottom', color='black', fontsize=10)
    
    ax[0].set_title(f'Violin Plot of {var} by sample_name - After filtering')
    ax[0].set_xlabel('sample_name')
    ax[0].set_ylabel(var)
    ax[0].tick_params(axis='x', rotation=45)

    median_df = pd.DataFrame({'sample_name': medians.index, 'Median': medians.values})

    ax[1].axis('off')
    ax[1].table(cellText=median_df.values, colLabels=median_df.columns, loc='center')
    ax[1].set_title('Median Values')
    
    plt.tight_layout()
    plt.show()

## Data Export

In [None]:
filtered_object.raw = filtered_object.copy()

filtered_object.layers['raw_counts'] = filtered_object.X.copy()

filtered_object.layers["sqrt_norm"] = np.sqrt(
    sc.pp.normalize_total(filtered_object, inplace = False)["X"]
)

filtered_object

In [None]:
filtered_object.obs['sample_name'].value_counts()

In [None]:
filtered_object.var.dtypes

In [None]:
filtered_object.var['mt'].value_counts()

In [53]:
filtered_object.var['mt'] = filtered_object.var['mt'].astype(str)

In [None]:
print(filtered_object.var.dtypes)

In [None]:
filtered_object.var

In [None]:
filtered_object.var = filtered_object.var.rename(columns={'gene_name': 'gene_symbol'})
filtered_object.var = filtered_object.var.reset_index()
filtered_object.var

In [57]:
filtered_object.raw.var.index.name = 'gene_id'  

In [58]:
filtered_object.write_h5ad(f'../ncbi_sra/data/PRJEB59734_sra_filtered_sk_{timestamp}.h5ad')