# Notebook to run basic `scanpy` QC and doublet detection with `scrublet` for SRA Project - PRJNA1007964

- **Developed by**: Srivalli Kolla

- **Created date** : 14 October, 2024

- **Modification date** : 23 October, 2024

- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

Env : Scanpy(Python 3.12.4)

# Import sample_names

In [1]:
import anndata
import logging
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sb
import scrublet as scr
import os
import time
import matplotlib.pyplot as plt
from statsmodels.robust.scale import mad as median_abs_deviation
from matplotlib import colors
from matplotlib import rcParams

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')
timestamp = time.strftime("%d_%m_%Y")

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
asttokens           NA
attr                23.2.0
cffi                1.16.0
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.2
decorator           5.1.1
defusedxml          0.7.1
distutils           3.12.4
django              5.0.6
executing           2.0.1
h5py                3.11.0
igraph              0.11.5
ipykernel           6.29.5
ipython_genutils    0.2.0
ipywidgets          8.1.3
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.43.0
louvain             0.8.2
matplotlib          3.8.4
mpl_toolkits        NA
natsort             8.4.0
numba               0.60.0
numexpr             2.10.1
numpy               1.26.4
packaging           24.1
pandas              2.2.2
parso               0.8.4
patsy  

# Import sample_names

In [3]:
path = '../ncbi_sra/data'
files = ['SRR25726227.h5ad',
'SRR25726227.h5ad']

In [4]:
adata_combined = None
vars_combined = []

for file in files:
    file_path = os.path.join(path, file)
    
    try:

        adata = sc.read_h5ad(file_path)

        sample_name = os.path.basename(file_path).split('.')[0]
        adata.obs['sample_name'] = sample_name


        sc.pp.filter_cells(adata, min_counts=10)
        sc.pp.filter_genes(adata, min_counts=10)


        adata.var_names = adata.var_names.str.split('.').str[0]
        adata.var_names = [f"{name}_{sample_name}" for name in adata.var_names]

        vars_combined.extend(adata.var_names)


        if adata_combined is None:
            adata_combined = adata
        else:
            adata_combined = sc.concat([adata_combined, adata], join='outer', index_unique='-')

        print(f"Successfully read and concatenated: {file}")

    except Exception as e:
        print(f"Error reading {file}: {e}")

unique_var_names = pd.Series(vars_combined).unique()
adata_combined.var_names = unique_var_names[:adata_combined.n_vars] 
adata_combined

filtered out 6794585 cells that have less than 10 counts
filtered out 62925 genes that are detected in less than 10 counts
Successfully read and concatenated: SRR25726227.h5ad
filtered out 6794585 cells that have less than 10 counts
filtered out 62925 genes that are detected in less than 10 counts
Successfully read and concatenated: SRR25726227.h5ad


AnnData object with n_obs × n_vars = 590 × 161
    obs: 'sample_name', 'n_counts'

In [5]:
adata_combined

AnnData object with n_obs × n_vars = 590 × 161
    obs: 'sample_name', 'n_counts'

In [6]:
adata_combined.obs

Unnamed: 0,sample_name,n_counts
AAACCCACAATGAGAT-0,SRR25726227,13
AAAGGATTCCCTATTA-0,SRR25726227,21
AAAGTGAAGAAATCCA-0,SRR25726227,48
AACAACCTCCTAGCCT-0,SRR25726227,10
AACGGGAAGGCGAAGA-0,SRR25726227,28
...,...,...
TTGTGGAAGACAGTGT-1,SRR25726227,13
TTGTTGTAGCCACAGG-1,SRR25726227,149
TTTGGTTCATATTCGG-1,SRR25726227,27
TTTGGTTGTCTGTTCC-1,SRR25726227,11


In [7]:
adata_combined.var

ENSG00000186094_SRR25726227
ENSG00000173406_SRR25726227
ENSG00000172260_SRR25726227
ENSG00000137968_SRR25726227
ENSG00000117114_SRR25726227
...
ENSG00000147202_SRR25726227
ENSG00000225689_SRR25726227
ENSG00000129682_SRR25726227
ENSG00000288098_SRR25726227
ENSG00000155966_SRR25726227


In [8]:
adata_combined.obs['sample_name'].value_counts()

sample_name
SRR25726227    590
Name: count, dtype: int64

## Doublet score prediction

In [9]:
sc.pp.filter_cells(adata_combined, min_counts=1)
sc.pp.filter_genes(adata_combined, min_counts=1)
adata_combined

filtered out 66 cells that have less than 1 counts


AnnData object with n_obs × n_vars = 524 × 161
    obs: 'sample_name', 'n_counts'
    var: 'n_counts'

In [10]:
scrub = scr.Scrublet(adata_combined.X)

doublet_scores, predicted_doublets = scrub.scrub_doublets(n_prin_comps= 1)
            
adata_combined.obs['doublet_scores'] = doublet_scores
adata_combined.obs['predicted_doublets'] = predicted_doublets

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.09
Detected doublet rate = 19.5%
Estimated detectable doublet fraction = 39.9%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 48.8%
Elapsed time: 0.1 seconds


In [11]:
adata_combined.obs

Unnamed: 0,sample_name,n_counts,doublet_scores,predicted_doublets
AAACCCACAATGAGAT-0,SRR25726227,1,0.001631,False
AAAGGATTCCCTATTA-0,SRR25726227,4,0.001631,False
AAAGTGAAGAAATCCA-0,SRR25726227,6,0.001631,False
AACAACCTCCTAGCCT-0,SRR25726227,5,0.121951,True
AACGGGAAGGCGAAGA-0,SRR25726227,7,0.001631,False
...,...,...,...,...
TTGTGGAAGACAGTGT-1,SRR25726227,2,0.001631,False
TTGTTGTAGCCACAGG-1,SRR25726227,13,0.001631,False
TTTGGTTCATATTCGG-1,SRR25726227,5,0.001631,False
TTTGGTTGTCTGTTCC-1,SRR25726227,1,0.157895,True


### Checking the count and percentage of Doublets - sample_name level

In [12]:
doub_tab = pd.crosstab(adata_combined.obs['sample_name'],adata_combined.obs['predicted_doublets'])
doub_tab.sum()

predicted_doublets
False    422
True     102
dtype: int64

In [13]:
true_doublets = adata_combined.obs['predicted_doublets'] == True
true_doublets_count = true_doublets.sum()

true_doublets_percentage = (true_doublets_count / len(adata_combined.obs)) * 100

true_doublets_count ,true_doublets_percentage

(102, 19.46564885496183)

### Saving raw data

In [14]:
sample_name_object = adata_combined.copy()
sample_name_object

AnnData object with n_obs × n_vars = 524 × 161
    obs: 'sample_name', 'n_counts', 'doublet_scores', 'predicted_doublets'
    var: 'n_counts'

## Compute QC stats

In [15]:
sample_name_object.shape

(524, 161)

### Labelling Mt and Ribo genes

In [16]:
sample_name_object.var

Unnamed: 0,n_counts
ENSG00000186094_SRR25726227,28
ENSG00000173406_SRR25726227,32
ENSG00000172260_SRR25726227,22
ENSG00000137968_SRR25726227,22
ENSG00000117114_SRR25726227,32
...,...
ENSG00000147202_SRR25726227,24
ENSG00000225689_SRR25726227,20
ENSG00000129682_SRR25726227,28
ENSG00000288098_SRR25726227,20


In [17]:
sample_name_object.var.index = sample_name_object.var.index.str.split('_').str[0]
sample_name_object.var

Unnamed: 0,n_counts
ENSG00000186094,28
ENSG00000173406,32
ENSG00000172260,22
ENSG00000137968,22
ENSG00000117114,32
...,...
ENSG00000147202,24
ENSG00000225689,20
ENSG00000129682,28
ENSG00000288098,20


In [18]:
sample_name_object.var['ensembl'] = sample_name_object.var.index
sample_name_object.var 

Unnamed: 0,n_counts,ensembl
ENSG00000186094,28,ENSG00000186094
ENSG00000173406,32,ENSG00000173406
ENSG00000172260,22,ENSG00000172260
ENSG00000137968,22,ENSG00000137968
ENSG00000117114,32,ENSG00000117114
...,...,...
ENSG00000147202,24,ENSG00000147202
ENSG00000225689,20,ENSG00000225689
ENSG00000129682,28,ENSG00000129682
ENSG00000288098,20,ENSG00000288098


### Ensembl annotations

In [19]:
annot = sc.queries.biomart_annotations(
        "hsapiens",
        ["ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"],
    ).set_index("ensembl_gene_id")

In [20]:
annot.head()

Unnamed: 0_level_0,external_gene_name,start_position,end_position,chromosome_name
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000210049,MT-TF,577,647,MT
ENSG00000211459,MT-RNR1,648,1601,MT
ENSG00000210077,MT-TV,1602,1670,MT
ENSG00000210082,MT-RNR2,1671,3229,MT
ENSG00000209082,MT-TL1,3230,3304,MT


In [21]:
sample_name_object.var

Unnamed: 0,n_counts,ensembl
ENSG00000186094,28,ENSG00000186094
ENSG00000173406,32,ENSG00000173406
ENSG00000172260,22,ENSG00000172260
ENSG00000137968,22,ENSG00000137968
ENSG00000117114,32,ENSG00000117114
...,...,...
ENSG00000147202,24,ENSG00000147202
ENSG00000225689,20,ENSG00000225689
ENSG00000129682,28,ENSG00000129682
ENSG00000288098,20,ENSG00000288098


In [22]:
sample_name_object.var['gene_name'] = sample_name_object.var.index.map(annot['external_gene_name'])
sample_name_object.var.index =sample_name_object.var['gene_name'] 
sample_name_object.var

Unnamed: 0_level_0,n_counts,ensembl,gene_name
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGBL4,28,ENSG00000186094,AGBL4
DAB1,32,ENSG00000173406,DAB1
NEGR1,22,ENSG00000172260,NEGR1
SLC44A5,22,ENSG00000137968,SLC44A5
ADGRL2,32,ENSG00000117114,ADGRL2
...,...,...,...
DIAPH2,24,ENSG00000147202,DIAPH2
,20,ENSG00000225689,
FGF13,28,ENSG00000129682,FGF13
,20,ENSG00000288098,


In [23]:
sample_name_object.var['mt'] = sample_name_object.var_names.str.startswith('MT-') 
sample_name_object.var['ribo'] = sample_name_object.var_names.str.startswith(("RPS","RPL"))
sample_name_object.var

Unnamed: 0_level_0,n_counts,ensembl,gene_name,mt,ribo
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AGBL4,28,ENSG00000186094,AGBL4,False,False
DAB1,32,ENSG00000173406,DAB1,False,False
NEGR1,22,ENSG00000172260,NEGR1,False,False
SLC44A5,22,ENSG00000137968,SLC44A5,False,False
ADGRL2,32,ENSG00000117114,ADGRL2,False,False
...,...,...,...,...,...
DIAPH2,24,ENSG00000147202,DIAPH2,False,False
,20,ENSG00000225689,,,
FGF13,28,ENSG00000129682,FGF13,False,False
,20,ENSG00000288098,,,


In [24]:
ribo_counts = sample_name_object.var['ribo'].value_counts()

mt_counts = sample_name_object.var['mt'].value_counts()

print("Counts of Ribosomal (ribo) Genes:")
print("False:", ribo_counts.get(False, 0))
print("True:", ribo_counts.get(True, 0))
print("\nCounts of Mitochondrial (mt) Genes:")
print("False:", mt_counts.get(False, 0))
print("True:", mt_counts.get(True, 0))

Counts of Ribosomal (ribo) Genes:
False: 141
True: 1

Counts of Mitochondrial (mt) Genes:
False: 142
True: 0


In [25]:
sample_name_object.var['mt'] = sample_name_object.var['mt'].fillna(False)
sample_name_object.var['ribo'] = sample_name_object.var['ribo'].fillna(False)

  sample_name_object.var['mt'] = sample_name_object.var['mt'].fillna(False)
  sample_name_object.var['ribo'] = sample_name_object.var['ribo'].fillna(False)


## Sex covariate analysis

### Chr Y genes calculation

In [26]:
sample_name_object.var['gene_name'] = sample_name_object.var['ensembl'].map(annot['external_gene_name'])
sample_name_object.var['chromosome'] = sample_name_object.var['ensembl'].map(annot['chromosome_name'])

In [27]:
sample_name_object.var

Unnamed: 0_level_0,n_counts,ensembl,gene_name,mt,ribo,chromosome
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AGBL4,28,ENSG00000186094,AGBL4,False,False,1
DAB1,32,ENSG00000173406,DAB1,False,False,1
NEGR1,22,ENSG00000172260,NEGR1,False,False,1
SLC44A5,22,ENSG00000137968,SLC44A5,False,False,1
ADGRL2,32,ENSG00000117114,ADGRL2,False,False,1
...,...,...,...,...,...,...
DIAPH2,24,ENSG00000147202,DIAPH2,False,False,X
,20,ENSG00000225689,,False,False,X
FGF13,28,ENSG00000129682,FGF13,False,False,X
,20,ENSG00000288098,,False,False,X


In [28]:
chrY_genes = sample_name_object.var['chromosome'] == "Y"
chrY_genes

gene_name
AGBL4      False
DAB1       False
NEGR1      False
SLC44A5    False
ADGRL2     False
           ...  
DIAPH2     False
NaN        False
FGF13      False
NaN        False
AFF2       False
Name: chromosome, Length: 161, dtype: bool

In [29]:
sample_name_object.obs['percent_chrY'] = np.sum(
    sample_name_object[:, chrY_genes].X, axis = 1) / np.sum(sample_name_object.X, axis = 1) * 100

In [30]:
sample_name_object

AnnData object with n_obs × n_vars = 524 × 161
    obs: 'sample_name', 'n_counts', 'doublet_scores', 'predicted_doublets', 'percent_chrY'
    var: 'n_counts', 'ensembl', 'gene_name', 'mt', 'ribo', 'chromosome'

### XIST counts

In [31]:
sample_name_object.var_names

Index([    'AGBL4',      'DAB1',     'NEGR1',   'SLC44A5',    'ADGRL2',
        'PKN2-AS1',  'RABGAP1L', 'LINC01036',     'USH2A',     'ESRRG',
       ...
          'FRMPD4',         nan,  'IL1RAPL1',       'DMD',     'OPHN1',
          'DIAPH2',         nan,     'FGF13',         nan,      'AFF2'],
      dtype='object', name='gene_name', length=161)

In [32]:
valid_var_names = sample_name_object.var_names[~sample_name_object.var_names.isna()]

xist_genes = valid_var_names[valid_var_names.str.match('XIST')]
xist_genes

Index([], dtype='object', name='gene_name')

## Calculate cell cycle scores

### Downloading the list of cell cycle genes

In [33]:
!if [ ! -f ../ncbi_sra/data/regev_lab_cell_cycle_genes.txt ]; then curl -o ../ncbi_sra/data/regev_lab_cell_cycle_genes.txt https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt; fi

### Marking cell cycle genes

#### Steps followed

1. Loading genes and captilizing 
2. Printing the length of cell cycle genes list
3. Split genes into 2 lists (#First 43 genes,#Gene 43 to end)
4. Filtering cell cycle genes only if present in processed_gene_names
5. Print the list of cell cycle genes observed in our data

In [34]:
cell_cycle_genes = [x.strip() for x in open('../ncbi_sra/data/regev_lab_cell_cycle_genes.txt')]
#cell_cycle_genes = [gene.capitalize() for gene in cell_cycle_genes]
print(len(cell_cycle_genes))

s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]

cell_cycle_genes = [x for x in cell_cycle_genes if x in sample_name_object.var_names]
print(len(cell_cycle_genes))

97
0


In [35]:
cell_cycle_genes

[]

## Data Export

In [36]:
filtered_object = sample_name_object

In [37]:
filtered_object.raw = filtered_object.copy()

filtered_object.layers['raw_counts'] = filtered_object.X.copy()

filtered_object.layers["sqrt_norm"] = np.sqrt(
    sc.pp.normalize_total(filtered_object, inplace = False)["X"]
)

filtered_object

normalizing counts per cell
    finished (0:00:00)


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 524 × 161
    obs: 'sample_name', 'n_counts', 'doublet_scores', 'predicted_doublets', 'percent_chrY'
    var: 'n_counts', 'ensembl', 'gene_name', 'mt', 'ribo', 'chromosome'
    layers: 'raw_counts', 'sqrt_norm'

In [38]:
filtered_object.obs['sample_name'].value_counts()

sample_name
SRR25726227    524
Name: count, dtype: int64

In [39]:
filtered_object.var.dtypes

n_counts       int64
ensembl       object
gene_name     object
mt              bool
ribo            bool
chromosome    object
dtype: object

In [40]:
filtered_object.var['mt'].value_counts()

mt
False    161
Name: count, dtype: int64

In [41]:
filtered_object.var['mt'] = filtered_object.var['mt'].astype(str)

In [42]:
print(filtered_object.var.dtypes)

n_counts       int64
ensembl       object
gene_name     object
mt            object
ribo            bool
chromosome    object
dtype: object


In [43]:
filtered_object.var = filtered_object.var.rename(columns={'gene_name': 'gene_symbol'})
filtered_object.var = filtered_object.var.reset_index()
filtered_object.var

Unnamed: 0,gene_name,n_counts,ensembl,gene_symbol,mt,ribo,chromosome
0,AGBL4,28,ENSG00000186094,AGBL4,False,False,1
1,DAB1,32,ENSG00000173406,DAB1,False,False,1
2,NEGR1,22,ENSG00000172260,NEGR1,False,False,1
3,SLC44A5,22,ENSG00000137968,SLC44A5,False,False,1
4,ADGRL2,32,ENSG00000117114,ADGRL2,False,False,1
...,...,...,...,...,...,...,...
156,DIAPH2,24,ENSG00000147202,DIAPH2,False,False,X
157,,20,ENSG00000225689,,False,False,X
158,FGF13,28,ENSG00000129682,FGF13,False,False,X
159,,20,ENSG00000288098,,False,False,X


In [44]:
filtered_object.raw.var.index.name = 'gene_id'  

In [45]:
filtered_object.write_h5ad(f'../ncbi_sra/data/PRJNA1007964_sra_filtered_sk_{timestamp}.h5ad')