### Analysis of the output files of the Optimus pipeline (v6.0.0)
#### The aim of this part of the analysis is to perform QC filtering on the output files of the Optimus pipeline

#### Load necessary libraries and useful functions

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [84]:
from typing import Union
import fsspec
import anndata
import pandas as pd
import scanpy as sc
import seaborn as sns
import numpy as np
from cellbender.remove_background.downstream import anndata_from_h5
from upsetplot import from_contents
from upsetplot import UpSet
from scipy.stats import median_abs_deviation
import dalmatian as dm
import collections

In [4]:
def read_file_from_url(url: str) -> "Union[anndata.AnnData, pd.DataFrame]":
    """Read a file from a URL and return the appropriate data structure.

    The reader is selected from substrings of ``url`` *before* the file is
    opened, so an unsupported URL fails immediately instead of first
    triggering a (possibly remote) open.

    Parameters
    ----------
    url : str
        The URL of the file to read. It is opened with ``fsspec.open``.

    Returns
    -------
    output : Union[anndata.AnnData, pd.DataFrame]
        An AnnData object if ".h5ad" is in ``url``; otherwise a Pandas
        DataFrame if ".csv" is in ``url``. URLs containing "Summary.csv" are
        read with ``header=None`` so that their first row is kept as data.

    Raises
    ------
    ValueError
        If ``url`` contains neither ".h5ad" nor ".csv".
    """

    # Order matters: ".h5ad" takes precedence, and "Summary.csv" must be
    # tested before the generic ".csv" because it also contains ".csv".
    if ".h5ad" in url:
        reader = anndata.read_h5ad
        reader_kwargs = {}
    elif "Summary.csv" in url:
        reader = pd.read_csv
        reader_kwargs = {"header": None}  # Read the CSV file with the first row as data
    elif ".csv" in url:
        reader = pd.read_csv
        reader_kwargs = {}
    else:
        raise ValueError("Unsupported file format. Supported formats are .h5ad and .csv.")

    # The handle is only valid inside the context manager, so read eagerly here.
    with fsspec.open(url) as f:
        output = reader(f, **reader_kwargs)

    return output

In [6]:
def human_mt_genes_ident(in_adata: "anndata.AnnData") -> None:
    """Flag human mitochondrial genes in the boolean column ``in_adata.var["mt"]``.

    The function also reports whether the object contains mouse mitochondrial
    genes. Human mitochondrial gene names start with "MT-", mouse ones with
    "mt-". The "mt" column is always written, including when mouse genes are
    found, so downstream code can rely on the column being present.

    Parameters
    ----------
    in_adata : AnnData
        The AnnData object to check. Its ``var`` table is modified in place.

    Returns
    -------
    output : None
    """

    mouse_mt_count = in_adata.var_names.str.startswith("mt-").sum()

    # Report only; the presence of mouse genes does not stop the annotation.
    if mouse_mt_count > 0:
        print("Mouse mt number", mouse_mt_count)
    else:
        print("No mouse mt genes")

    print("Adding human mt genes to adata.var")
    in_adata.var["mt"] = in_adata.var_names.str.startswith("MT-")

#### STARsolo summary

In [12]:
### Load STARsolo summary file
# Hard-coded bucket path to the Summary.csv of one STARsoloFastq shard (shard-1).
# read_file_from_url reads "Summary.csv" files with header=None, so the metric
# names and their values end up in columns 0 and 1.
star_summary_url = "gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/submissions/96ae8c73-2b91-4a30-b188-2a25a341a459/Optimus/be8bc5cc-dfaf-4e7f-905c-e92735750bb5/call-STARsoloFastq/shard-1/cacheCopy/Summary.csv"
star_summary = read_file_from_url(star_summary_url)
display(star_summary)

Unnamed: 0,0,1
0,Number of Reads,319930655
1,Reads With Valid Barcodes,0.973214
2,Sequencing Saturation,0.667937
3,Q30 Bases in CB+UMI,0.931203
4,Q30 Bases in RNA read,0.901736
5,Reads Mapped to Genome: Unique+Multiple,0.961733
6,Reads Mapped to Genome: Unique,0.889793
7,Reads Mapped to Gene: Unique+Multiple Gene,NoMulti
8,Reads Mapped to Gene: Unique Gene,0.47964
9,Estimated Number of Cells,5356


#### Data Loading

In this analysis, we load the data from the output files of the job (submission ID 94e3d5584-ac31-456a-b046-4c8554108b7d).

In [7]:
# Query the Terra workspace through dalmatian for its sample-set table.
TERRA_WS = "whitelabgx/scRNAseq"
wm = dm.WorkspaceManager(TERRA_WS)
output_df = wm.get_sample_sets()

# Show the table first, then the names of its columns.
display(output_df, output_df.columns)

Unnamed: 0_level_0,aligner_metrics,bam,cell_calls,cell_metrics,cellbender_cell_barcodes_csv,cellbender_h5_array,cellbender_html_report_array,cellbender_metrics_csv_array,cellbender_summary_pdf,gene_metrics,genomic_reference_version,h5ad_output_file,matrix,matrix_col_index,matrix_row_index,samples
sample_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
optimus_workflow_2023-10-27,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,[gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/...,[gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/...,[gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,"[pbmc_10k_v3_S1_L001, pbmc_10k_v3_S1_L002]"


Index(['aligner_metrics', 'bam', 'cell_calls', 'cell_metrics',
       'cellbender_cell_barcodes_csv', 'cellbender_h5_array',
       'cellbender_html_report_array', 'cellbender_metrics_csv_array',
       'cellbender_summary_pdf', 'gene_metrics', 'genomic_reference_version',
       'h5ad_output_file', 'matrix', 'matrix_col_index', 'matrix_row_index',
       'samples'],
      dtype='object')

h5ad file of optimus (without cellbender correction)

###### You can load the h5ad file directly from the bucket, or download it from the bucket and load it locally.

In [None]:
### Load h5ad file
# Disabled alternative: the code below sits inside a bare string literal, so it
# is not run. It would take the h5ad URL from output_df and read it straight
# from the bucket with read_file_from_url.
"""
h5ad_url = output_df.loc[output_df.index[0], 'h5ad_output_file']
#h5ad_url = output_df.loc['optimus_workflow_2023-10-27', 'h5ad_output_file']
adata = read_file_from_url(h5ad_url)
"""

In [8]:
### Load the data locally, more quickly
# Relative path to an h5ad file on disk — presumably a downloaded copy of the
# Optimus h5ad_output_file; verify before reuse.
input_h5ad = "../scAtlas/tmp/10k_pbmc_v3.h5ad"
adata = sc.read_h5ad(input_h5ad)

  utils.warn_names_duplicates("var")


In [9]:
# Display the AnnData summary (dimensions and annotation fields).
adata

AnnData object with n_obs × n_vars = 1136912 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic', 'reads_mapped_exonic_as', 'reads_mapped_intronic', 'reads_mapped_intronic_as', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'duplicate_reads', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_intergenic', 'reads_unmapped', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_molecule', 'reads_per_fragment', 'fragments

###### check barcode and gene names

In [10]:
# Gene names: the index of adata.var.
adata.var_names

Index(['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A',
       'AL627309.6', 'OR4G11P', 'OR4F5', 'AL627309.1',
       ...
       'pRNA', 'RNA5-8S5', 'pRNA', 'RNA5-8SN2', 'AC007325.3', 'AC007325.1',
       'AC007325.4', 'AC007325.2', 'U6', 'U1'],
      dtype='object', length=58347)

In [11]:
# Barcodes: the index of adata.obs.
adata.obs_names

Index(['AAACCCAAGAAACACT', 'AAACCCAAGAAACCAT', 'AAACCCAAGAAACTAC',
       'AAACCCAAGAAACTCA', 'AAACCCAAGAAACTGT', 'AAACCCAAGAAAGCGA',
       'AAACCCAAGAAAGTCT', 'AAACCCAAGAAATCCA', 'AAACCCAAGAACGCGT',
       'AAACCCAAGAAGCCAC',
       ...
       'TTTGTTGTCTTCTAAC', 'TTTGTTGTCTTCTCTT', 'TTTGTTGTCTTCTTCC',
       'TTTGTTGTCTTGATTC', 'TTTGTTGTCTTGCGCT', 'TTTGTTGTCTTGGTCC',
       'TTTGTTGTCTTGTACC', 'TTTGTTGTCTTTACAC', 'TTTGTTGTCTTTGCGC',
       'TTTGTTGTCTTTGCTA'],
      dtype='object', length=1136912)

Check whether there are unannotated genes (genes without a gene name)

In [11]:
# Rows of adata.var whose "Gene" entry is the empty string.
unnamed_gene_mask = adata.var["Gene"] == ""
no_name_gene = adata.var.loc[unnamed_gene_mask]
print(f"{len(no_name_gene)} genes without gene names")

0 genes without gene names


Find duplicate variable (gene) names

In [12]:
# Collect every gene name that occurs more than once in the var index,
# listed in order of first appearance.
var_names = adata.var_names
repeated_mask = var_names.duplicated(keep=False)
duplicates = var_names[repeated_mask].unique().tolist()

print("Duplicate variable names:", duplicates)
print("Number of duplicate variable:", len(duplicates))

Duplicate variable names: ['U6', 'TP73-AS1', 'Y_RNA', 'SNORA77', 'SCARNA16', 'SNORA70', 'SCARNA11', 'SCARNA17', 'SCARNA18', 'snoU13', 'SNORA44', 'SNORA16A', 'SCARNA24', 'Metazoa_SRP', 'uc_338', 'SNORA62', 'SNORA63', 'SNORD46', 'SNORD38B', 'SNORA26', 'SNORA58', 'DLEU2_6', 'DLEU2_5', 'DLEU2_4', 'DLEU2_3', 'DLEU2_2', 'DLEU2_1', 'SNORA31', 'SNORA2', 'SNORD81', 'SNORA51', 'SNORA25', 'SNORA42', 'U3', 'SNORA40', '7SK', 'U1', 'U2', '5S_rRNA', 'U6atac', 'U4', 'SNORD59', 'SCARNA4', 'SNORD64', 'ACA64', 'RGS5', 'SCARNA20', 'U7', 'SNORA67', 'SNORA72', 'SNORD60', 'SNORD116', 'U8', 'LINC01115', 'SNORD18', 'SCARNA21', 'SNORA36', 'SNORD75', 'TMEM247', 'STPG4', 'SNORA75', 'SNORA12', 'SNORD78', 'ACA59', 'SNORA74', 'snoU109', 'SNORA19', 'ACTR3BP2', 'DAOA-AS1_2', 'SCARNA15', 'SNORA48', 'SNORD56', 'PDE11A', 'SNORA43', 'SNORA17', 'PCGEM1', 'SNORA4', 'SNORD70', 'SNORD11', 'SNORA1', 'Vault', 'SNORD51', 'SCARNA6', 'SNORD39', 'LINC01238', 'GHRLOS', 'SNORD5', 'SNORA64', 'SNORD77', 'PRSS50', 'CYB561D2', 'SNORD19B'

Make variable names unique

###### Not all variable names are unique: some variables (= genes) appear more than once, which can lead to errors or unintended behavior in downstream analysis tasks. We therefore call the suggested function var_names_make_unique(), which makes the variable names unique by appending a number string to each duplicate index element: ‘1’, ‘2’, etc.

In [13]:
# Modifies adata in place: duplicated gene names get a numeric suffix.
adata.var_names_make_unique()



Check whether there are zero-count droplets

In [13]:
# Count the barcodes whose n_reads is exactly zero.
zero_read_mask = adata.obs["n_reads"] == 0
print(f"{zero_read_mask.sum()} zero count cell")

0 zero count cell


In [None]:
# Disabled snippet: the code below sits inside a bare string literal, so it is not run.
# NOTE(review): it filters on n_reads != 0, i.e. it keeps the cells that the
# zero-count check above excludes — confirm what "the same as above" is meant to say.
"""
# the same as above
no_0count_cell = adata.obs.loc[adata.obs.n_reads != 0, :]
no_0count_cell
"""