### Comparison of the Optimus pipeline results from wlg (V6.0.0) and broad institute (V5.8.4)
#### Aim of this part of analysis is identify the strength of the differences between the two version of Optimus pipelines and to understand the reasons for the differences

#### 1. Load necessary libraries and useful functions

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from basicAtlas import terra
import dalmatian as dm
import anndata
import collections
import scanpy as sc
import seaborn as sns
import numpy as np
from scipy.stats import median_abs_deviation

In [3]:
import fsspec
import anndata
import pandas as pd
from typing import Union

def read_file_from_url(url: str) -> Union[anndata.AnnData, pd.DataFrame]:
    """Read a file from a URL and return the appropriate data structure.

    Parameters
    ----------
    url : str
        The URL of the file to read.

    Returns
    -------
    output : Union[anndata.AnnData, pd.DataFrame]
        The data read from the file, which can be either an AnnData object (if ".h5ad" in url)
        or a Pandas DataFrame (if ".csv" in url).
    """

    with fsspec.open(url) as f:
        if ".h5ad" in url:
            output = anndata.read_h5ad(f)
        elif "Summary.csv" in url:
            output = pd.read_csv(f, header=None) #Read the CSV file with the first row as data
        elif ".csv" in url:
            output = pd.read_csv(f)
        else:
            raise ValueError("Unsupported file format. Supported formats are .h5ad and .csv.")
    
    return output

#### 2. Data Loading and checking

In [10]:
# current directory
current_directory = os.getcwd()
print(current_directory)

/Users/xiliu/Documents/analysis/terraPipelines/notebook


In [8]:
# Upload the primary annotation to WLG bucket
#!gsutil cp ../scAtlas/tmp/gencode.v27.primary_assembly.annotation.gtf "gs://whitelabgx-references/hg38/gencode.v27.primary_assembly.annotation.gtf"

Copying file://../scAtlas/tmp/gencode.v27.primary_assembly.annotation.gtf [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][  1.1 GiB/  1.1 GiB]   31.9 MiB/s                                   
Operation completed over 1 objects/1.1 GiB.                                      


In [14]:
# Variables definition
GCP_BUCKET="gs://whitelabgx-references"
GCP_FOLDER_NAME="resources/pbmc_10k_10X_v3"
TERRA_WS = 'whitelabgx/scRNAseq'
PROJECT="optimus_V6.0.0_wlg"

In [81]:
# Uploaded samples in google bucket associated with terra workspace
terra.uploadFromFolder_10x(GCP_BUCKET + '/' + GCP_FOLDER_NAME + '/',
                           TERRA_WS,
                           samplesetname=PROJECT,
                           fformat="fastqR1R2",
                           sep='_00|_00',
                           loc =0,
                           test=True) # test=True to test the function after upload successfully of the files to terra

please be sure you gave access to your terra email account access to this bucket


Unnamed: 0_level_0,r1_fastq,r2_fastq,i1_fastq,Source,participant
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pbmc_10k_v3_S1_L001,gs://whitelabgx-references/resources/pbmc_10k_...,gs://whitelabgx-references/resources/pbmc_10k_...,gs://whitelabgx-references/resources/pbmc_10k_...,U,pbmc_10k_v3_S1
pbmc_10k_v3_S1_L002,gs://whitelabgx-references/resources/pbmc_10k_...,gs://whitelabgx-references/resources/pbmc_10k_...,gs://whitelabgx-references/resources/pbmc_10k_...,U,pbmc_10k_v3_S1


#### 3. Comparison of the STARsolo summary

In [4]:
# WLG's summary file
wlg_star_url = "gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/submissions/07e6d89b-3db8-436e-bc50-fef6b40cdd04/Optimus/934d7faa-21e2-452d-a624-ea8896142b81/call-STARsoloFastq/shard-1/cacheCopy/Summary.csv"
wlg_star_summary = read_file_from_url(wlg_star_url)

In [27]:
# HCA's summary file
hca_star_url ="gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/submissions/ca3a5a8d-3a78-4a10-8ea6-4f4f8eb26c5d/Optimus/600fabcf-7af4-45e4-9ada-c759ed1d6d5f/call-STARsoloFastq/shard-1/Summary.csv"
hca_star_summary = read_file_from_url(hca_star_url)

In [30]:
display(wlg_star_summary)
display(hca_star_summary)

Unnamed: 0,0,1
0,Number of Reads,319930655
1,Reads With Valid Barcodes,0.973214
2,Sequencing Saturation,0.667937
3,Q30 Bases in CB+UMI,0.931203
4,Q30 Bases in RNA read,0.901736
5,Reads Mapped to Genome: Unique+Multiple,0.961733
6,Reads Mapped to Genome: Unique,0.889793
7,Reads Mapped to Gene: Unique+Multiple Gene,NoMulti
8,Reads Mapped to Gene: Unique Gene,0.47964
9,Estimated Number of Cells,5356


Unnamed: 0,0,1
0,Number of Reads,319930655
1,Reads With Valid Barcodes,0.973214
2,Sequencing Saturation,0.667937
3,Q30 Bases in CB+UMI,0.931203
4,Q30 Bases in RNA read,0.901736
5,Reads Mapped to Genome: Unique+Multiple,0.961733
6,Reads Mapped to Genome: Unique,0.889793
7,Reads Mapped to Gene: Unique+Multiple Gene,NoMulti
8,Reads Mapped to Gene: Unique Gene,0.47964
9,Estimated Number of Cells,5356


In [31]:
# Compare the two summary files
differences = hca_star_summary.ne(wlg_star_summary)
differences

Unnamed: 0,0,1
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


#### 4. Load and compare the results (h5ad) from the two pipelines

##### 4.1. Fetch results from broad institute pipeline: HCA_Optimus_Pipeline: pbmc human 10k v3 S1

In [7]:
# Fetch the workspace output information
wm = dm.WorkspaceManager('featured-workspaces-hca/HCA_Optimus_Pipeline')
output_df = wm.get_sample_sets()
display(output_df)
#display(output_df.columns)

hca_h5ad_url = output_df.loc['pbmc_human_v3', 'h5ad_output_file']
hca_adata = read_file_from_url(hca_h5ad_url)

Unnamed: 0_level_0,picard_metrics,cell_metrics,pipeline_version,samples,matrix,gene_metrics,genomic_reference_version,aligner_metrics,matrix_row_index,bam,matrix_col_index,h5ad_output_file,cell_calls
sample_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
neurons2k_mouse,,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,5.8.4,"[neurons2k_lane1, neurons2k_lane2, neurons2k_l...",gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...
pbmc4k_human,,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,5.8.4,"[pbmc4k_lane1, pbmc4k_lane2]",gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,
pbmc_human_v3,,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,5.8.4,"[pbmc_human_v3_lane1, pbmc_human_v3_lane2]",gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...,gs://fc-28eda7b7-0059-4aa1-876f-1ba9a95a719c/s...


Index(['picard_metrics', 'cell_metrics', 'pipeline_version', 'samples',
       'matrix', 'gene_metrics', 'genomic_reference_version',
       'aligner_metrics', 'matrix_row_index', 'bam', 'matrix_col_index',
       'h5ad_output_file', 'cell_calls'],
      dtype='object')

  utils.warn_names_duplicates("var")


In [8]:
hca_adata
#display(hca_adata.obs)
#display(hca_adata.var.tail(10))

AnnData object with n_obs × n_vars = 45397 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_fragment', 'fragments_per_molecule', 'cell_barcode_fraction_bases_above_30_mean', 'cell_barcode_fraction_bases_above_30_variance', 'n_mitochondrial_genes', 'n_mitochondrial_molecules', 'pct_mitochondrial_molecul

In [11]:
# Find duplicate variable (gene) names
hca_var_names = hca_adata.var_names
hca_duplicates = [item for item, count in collections.Counter(hca_var_names).items() if count > 1]

print("Duplicate variable names:", hca_duplicates)
print("Number of duplicate variable:", len(hca_duplicates))

Duplicate variable names: ['U6', 'TP73-AS1', 'Y_RNA', 'SNORA77', 'SCARNA16', 'SNORA70', 'SCARNA11', 'SCARNA17', 'SCARNA18', 'snoU13', 'SNORA44', 'SNORA16A', 'SCARNA24', 'Metazoa_SRP', 'uc_338', 'SNORA62', 'SNORA63', 'SNORD46', 'SNORD38B', 'SNORA26', 'SNORA58', 'DLEU2_6', 'DLEU2_5', 'DLEU2_4', 'DLEU2_3', 'DLEU2_2', 'DLEU2_1', 'SNORA31', 'SNORA2', 'SNORD81', 'SNORA51', 'SNORA25', 'SNORA42', 'U3', 'SNORA40', '7SK', 'U1', 'U2', '5S_rRNA', 'U6atac', 'U4', 'SNORD59', 'SCARNA4', 'SNORD64', 'ACA64', 'RGS5', 'SCARNA20', 'U7', 'SNORA67', 'SNORA72', 'SNORD60', 'SNORD116', 'U8', 'LINC01115', 'SNORD18', 'SCARNA21', 'SNORA36', 'SNORD75', 'TMEM247', 'STPG4', 'SNORA75', 'SNORA12', 'SNORD78', 'ACA59', 'SNORA74', 'snoU109', 'SNORA19', 'ACTR3BP2', 'DAOA-AS1_2', 'SCARNA15', 'SNORA48', 'SNORD56', 'PDE11A', 'SNORA43', 'SNORA17', 'PCGEM1', 'SNORA4', 'SNORD70', 'SNORD11', 'SNORA1', 'Vault', 'SNORD51', 'SCARNA6', 'SNORD39', 'LINC01238', 'GHRLOS', 'SNORD5', 'SNORA64', 'SNORD77', 'PRSS50', 'CYB561D2', 'SNORD19B'

In [None]:
# Subset the AnnData var to only include the duplicated variables
for target_var_name in hca_duplicates:
    print("Variable name:", target_var_name)
    print("Duplication quantity:", hca_adata.var[hca_adata.var_names == target_var_name].shape[0])
    print("Variable:", hca_adata.var[hca_adata.var_names == target_var_name])

In [12]:
# Variable names are not unique. To make them unique
hca_adata.var_names_make_unique()



##### 4.2. Fetch results from WLG pipeline

In [17]:
wm = dm.WorkspaceManager(TERRA_WS)
output_df = wm.get_sample_sets()
display(output_df)
#display(output_df.columns)

#wlg_h5ad_url = output_df.loc['optimus_V6.0.0_wlg', 'h5ad_output_file']
#wlg_adata = read_file_from_url(wlg_h5ad_url)

Unnamed: 0_level_0,aligner_metrics,bam,cell_barcodes_csv,cell_calls,cell_metrics,gene_metrics,genomic_reference_version,h5ad_output_file,html_report_array,matrix,matrix_col_index,matrix_row_index,metrics_csv_array,samples,summary_pdf
sample_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
optimus_V6.0.0_wlg,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,[gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...,[gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/...,"[pbmc_10k_v3_S1_L001, pbmc_10k_v3_S1_L002]",gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/s...


##### For this part of analysis, we use te code of chunk below to read the data, 
##### because the results of this job used the same parameters as the broad institute pipeline

In [18]:
wlg_h5ad_url = "gs://fc-447aee29-8362-4c0b-b8d0-b3b10eb9e2a6/submissions/07e6d89b-3db8-436e-bc50-fef6b40cdd04/Optimus/934d7faa-21e2-452d-a624-ea8896142b81/call-OptimusH5adGeneration/10k_pbmc_v3.h5ad"
wlg_adata = read_file_from_url(wlg_h5ad_url)

  utils.warn_names_duplicates("var")


In [19]:
wlg_adata
#display(wlg_adata.obs)
#display(wlg_adata.var)

AnnData object with n_obs × n_vars = 1136912 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic', 'reads_mapped_exonic_as', 'reads_mapped_intronic', 'reads_mapped_intronic_as', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'duplicate_reads', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_intergenic', 'reads_unmapped', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_molecule', 'reads_per_fragment', 'fragments

##### 4.3. comparison of sparse matrix from the two pipelines

In [53]:
# sparse matrix is named 'sparse_matrix'
wlg_sparse_matrix = wlg_adata.X

# Convert the sparse matrix to a dense NumPy array
wlg_dense_matrix = wlg_sparse_matrix.toarray()

# use regular NumPy indexing to see the head
#head = wlg_dense_matrix[:5, :]  # Assuming you want to see the first 5 rows

In [None]:
hca_sparse_matrix = wlg_adata.X
hca_dense_matrix = wlg_sparse_matrix.toarray()

In [56]:
wlg_dense_matrix.shape
hca_dense_matrix.shape

NameError: name 'hca_dense_matrix' is not defined

## move the following code to preprocessing nb

In [30]:
#display(wlg_adata.obs[wlg_adata.obs['duplicate_reads']== 0])
display(wlg_adata.obs['duplicate_reads'].value_counts())
#display(wlg_adata.var.head())

0    1136912
Name: duplicate_reads, dtype: int64

AAACCCAAGAAACACT    0.000000
AAACCCAAGAAACCAT    0.000000
AAACCCAAGAAACTAC    0.000000
AAACCCAAGAAACTCA    0.054545
AAACCCAAGAAACTGT    0.000000
                      ...   
TTTGTTGTCTTGGTCC    0.000000
TTTGTTGTCTTGTACC    0.000000
TTTGTTGTCTTTACAC    0.000000
TTTGTTGTCTTTGCGC    0.150000
TTTGTTGTCTTTGCTA    0.000000
Length: 1136912, dtype: float64

In [None]:
# mitochondrial genes: MT- for human data, mt- for mouse data
wlg_adata.var["mt"] = wlg_adata.var_names.str.startswith("MT-")

count_true = (adata.var_names.str.startswith("mt-")).sum()
print("mouse mt number", count_true)

sc.pp.calculate_qc_metrics(
    wlg_adata, qc_vars=["mt"], inplace=True, percent_top=[20], log1p=True
)
wlg_adata


In [48]:
pct_mt_theis = wlg_adata.obs["pct_counts_mt"]
pct_mt_optimus = wlg_adata.obs["pct_mitochondrial_molecules"]

In [44]:
if pct_mt_optimus.equals(pct_mt_theis):
    print("The two Series are the same.")
else:
    print("The two Series are different.")

The two Series are the same.


In [45]:
wlg_adata.obs[["pct_mitochondrial_molecules", "pct_counts_mt"]]

Unnamed: 0,pct_mitochondrial_molecules,pct_counts_mt
AAACCCAAGAAACACT,0.000000,0.000000
AAACCCAAGAAACCAT,0.000000,0.000000
AAACCCAAGAAACTAC,0.000000,0.000000
AAACCCAAGAAACTCA,3.333333,6.250000
AAACCCAAGAAACTGT,0.000000,0.000000
...,...,...
TTTGTTGTCTTGGTCC,0.000000,0.000000
TTTGTTGTCTTGTACC,0.000000,0.000000
TTTGTTGTCTTTACAC,0.000000,0.000000
TTTGTTGTCTTTGCGC,5.769231,15.000001


In [None]:
for idx, (val1, val2) in enumerate(zip(pct_mt_optimus, pct_mt_theis)):
    if val1 != val2:
        print(f"Difference at index {idx}:")
        print(f"Difference at index {wlg_adata.obs['CellID'].iloc[idx]}:")
        
        print(f"pct_mt_optimus: {val1}")
        print(f"pct_mt_theis: {val2}")

obs_select = ['n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic','duplicate_reads',
            ]
    'n_mitochondrial_genes', 'n_mitochondrial_molecules', 'pct_mitochondrial_molecules',
    
    ''emptydrops_Limited', 'emptydrops_IsCell', 'reads_mapped_uniquely', 'reads_mapped_multiple',  'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_intergenic', 'reads_unmapped', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_molecule', 'reads_per_fragment', 'fragments_per_molecule', 'cell_barcode_fraction_bases_above_30_mean', 'cell_barcode_fraction_bases_above_30_variance',  'emptydrops_LogProb', 'emptydrops_PValue', 'emptydrops_FDR', 


    var: 'gene_names', 'ensembl_ids', 'Gene', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic', 'reads_mapped_exonic_as', 'reads_mapped_intronic', 'reads_mapped_intronic_as', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'duplicate_reads', 'spliced_reads', 'antisense_reads', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'n_molecules', 'n_fragments', 'reads_per_molecule', 'reads_per_fragment', 'fragments_per_molecule', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'number_cells_detected_multiple', 'number_cells_expressing'
    

In [5]:
# Find duplicate variable (gene) names
wlg_var_names = wlg_adata.var_names

wlg_duplicates = [item for item, count in collections.Counter(wlg_var_names).items() if count > 1]

print("Duplicate variable names:", wlg_duplicates)
print("Number of duplicate variable:", len(wlg_duplicates))

Duplicate variable names: ['U6', 'TP73-AS1', 'Y_RNA', 'SNORA77', 'SCARNA16', 'SNORA70', 'SCARNA11', 'SCARNA17', 'SCARNA18', 'snoU13', 'SNORA44', 'SNORA16A', 'SCARNA24', 'Metazoa_SRP', 'uc_338', 'SNORA62', 'SNORA63', 'SNORD46', 'SNORD38B', 'SNORA26', 'SNORA58', 'DLEU2_6', 'DLEU2_5', 'DLEU2_4', 'DLEU2_3', 'DLEU2_2', 'DLEU2_1', 'SNORA31', 'SNORA2', 'SNORD81', 'SNORA51', 'SNORA25', 'SNORA42', 'U3', 'SNORA40', '7SK', 'U1', 'U2', '5S_rRNA', 'U6atac', 'U4', 'SNORD59', 'SCARNA4', 'SNORD64', 'ACA64', 'RGS5', 'SCARNA20', 'U7', 'SNORA67', 'SNORA72', 'SNORD60', 'SNORD116', 'U8', 'LINC01115', 'SNORD18', 'SCARNA21', 'SNORA36', 'SNORD75', 'TMEM247', 'STPG4', 'SNORA75', 'SNORA12', 'SNORD78', 'ACA59', 'SNORA74', 'snoU109', 'SNORA19', 'ACTR3BP2', 'DAOA-AS1_2', 'SCARNA15', 'SNORA48', 'SNORD56', 'PDE11A', 'SNORA43', 'SNORA17', 'PCGEM1', 'SNORA4', 'SNORD70', 'SNORD11', 'SNORA1', 'Vault', 'SNORD51', 'SCARNA6', 'SNORD39', 'LINC01238', 'GHRLOS', 'SNORD5', 'SNORA64', 'SNORD77', 'PRSS50', 'CYB561D2', 'SNORD19B'

##### 3.3. Differences between the two pipelines

In [14]:
# Find elements in hca_duplicates that are not in wlg_duplicates
difference1 = [item for item in hca_duplicates if item not in wlg_duplicates]

# Find elements in wlg_duplicates that are not in hca_duplicates
difference2 = [item for item in wlg_duplicates if item not in hca_duplicates]

# Print the differences
print("Elements in hca_duplicates that are not in wlg_duplicates:", difference1)
print("Elements in wlg_duplicates that are not in hca_duplicates:", difference2)

Elements in hca_duplicates that are not in wlg_duplicates: []
Elements in wlg_duplicates that are not in hca_duplicates: ['U6', 'TP73-AS1', 'Y_RNA', 'SNORA77', 'SCARNA16', 'SNORA70', 'SCARNA11', 'SCARNA17', 'SCARNA18', 'snoU13', 'SNORA44', 'SNORA16A', 'SCARNA24', 'Metazoa_SRP', 'uc_338', 'SNORA62', 'SNORA63', 'SNORD46', 'SNORD38B', 'SNORA26', 'SNORA58', 'DLEU2_6', 'DLEU2_5', 'DLEU2_4', 'DLEU2_3', 'DLEU2_2', 'DLEU2_1', 'SNORA31', 'SNORA2', 'SNORD81', 'SNORA51', 'SNORA25', 'SNORA42', 'U3', 'SNORA40', '7SK', 'U1', 'U2', '5S_rRNA', 'U6atac', 'U4', 'SNORD59', 'SCARNA4', 'SNORD64', 'ACA64', 'RGS5', 'SCARNA20', 'U7', 'SNORA67', 'SNORA72', 'SNORD60', 'SNORD116', 'U8', 'LINC01115', 'SNORD18', 'SCARNA21', 'SNORA36', 'SNORD75', 'TMEM247', 'STPG4', 'SNORA75', 'SNORA12', 'SNORD78', 'ACA59', 'SNORA74', 'snoU109', 'SNORA19', 'ACTR3BP2', 'DAOA-AS1_2', 'SCARNA15', 'SNORA48', 'SNORD56', 'PDE11A', 'SNORA43', 'SNORA17', 'PCGEM1', 'SNORA4', 'SNORD70', 'SNORD11', 'SNORA1', 'Vault', 'SNORD51', 'SCARNA6', 'SN

In [15]:
index1 = wlg_adata.var.columns
index2 = hca_adata.var.columns

# Find all differences between index1 and index2
all_differences = index1.symmetric_difference(index2)

# Print all differences
print("All differences between index1 and index2:", all_differences)

All differences between index1 and index2: Index(['duplicate_reads', 'reads_mapped_exonic', 'reads_mapped_exonic_as',
       'reads_mapped_intronic', 'reads_mapped_intronic_as'],
      dtype='object')


In [16]:
index1 = wlg_adata.obs.columns
index2 = hca_adata.obs.columns

all_differences = index1.symmetric_difference(index2)
print("All differences between index1 and index2:", all_differences)

All differences between index1 and index2: Index(['duplicate_reads', 'reads_mapped_exonic', 'reads_mapped_exonic_as',
       'reads_mapped_intergenic', 'reads_mapped_intronic',
       'reads_mapped_intronic_as', 'reads_per_molecule', 'reads_unmapped'],
      dtype='object')


In [17]:
wlg_adata.var_names_make_unique()



#### 4. Filtering low quality reads

##### 4.1. numpy matrix comparison: HCA vs WLG

In [16]:
# sparse matrix is named 'sparse_matrix'
hca_sparse_matrix = hca_adata.X

# Convert the sparse matrix to a dense NumPy array
hca_dense_matrix = hca_sparse_matrix.toarray()

# use regular NumPy indexing to see the head
head = hca_dense_matrix[:5, :]  # Assuming you want to see the first 5 rows
print(head)

: 

In [9]:
wlg_sparse_matrix = wlg_adata.X
wlg_dense_matrix = wlg_sparse_matrix.toarray()

In [15]:
display(len(hca_dense_matrix))
display(len(wlg_dense_matrix))

NameError: name 'hca_dense_matrix' is not defined

In [None]:
# Create two example NumPy arrays (matrices)
array1 = wlg_dense_matrix
array2 = hca_dense_matrix

In [None]:
# Method 1: Check if the arrays are equal element-wise
are_equal = np.array_equal(array1, array2)
print("Are the arrays equal element-wise?", are_equal)

# Method 2: Check if the arrays are equal using ==
are_equal = (array1 == array2).all()
print("Are the arrays equal using '=='?", are_equal)

# Method 3: Check element-wise inequality
elementwise_inequality = array1 != array2
print("Element-wise inequality:\n", elementwise_inequality)

# Method 4: Check if any element is greater than or less than another
any_greater = np.any(array1 > array2)
any_less = np.any(array1 < array2)
print("Any element in array1 greater than array2?", any_greater)
print("Any element in array1 less than array2?", any_less)s

##### 4.2. HCA pipeline

In [8]:
# mitochondrial genes: MT- for human data, mt- for mouse data
hca_adata.var["mt"] = hca_adata.var_names.str.startswith("MT-")
# check if have mouse mt genes
#todo: see alix function
count_true = (hca_adata.var_names.str.startswith("mt-")).sum()
print("mouse mt number", count_true)

# ribosomal genes
hca_adata.var["ribo"] = hca_adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
hca_adata.var["hb"] = hca_adata.var_names.str.contains(("^HB[^(P)]"))

mouse mt number 0


In [9]:
# add qc metric to obs
sc.pp.calculate_qc_metrics(
    hca_adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
hca_adata

AnnData object with n_obs × n_vars = 1136912 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_fragment', 'fragments_per_molecule', 'cell_barcode_fraction_bases_above_30_mean', 'cell_barcode_fraction_bases_above_30_variance', 'n_mitochondrial_genes', 'n_mitochondrial_molecules', 'pct_mitochondrial_molec

In [None]:
p1 = sns.displot(hca_adata.obs["total_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
p2 = sc.pl.violin(hca_adata, "pct_counts_mt")
p3 = sc.pl.scatter(hca_adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [11]:
# have too much 0 for library size
hca_barcodes_to_keep, _ = sc.pp.filter_cells(hca_adata, min_counts=500, inplace = False)
# by default inplace = True, so adata is modified directly
hca_adata = hca_adata[hca_barcodes_to_keep, :]
hca_adata

View of AnnData object with n_obs × n_vars = 12283 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_fragment', 'fragments_per_molecule', 'cell_barcode_fraction_bases_above_30_mean', 'cell_barcode_fraction_bases_above_30_variance', 'n_mitochondrial_genes', 'n_mitochondrial_molecules', 'pct_mitochondrial

In [12]:
# mitochondrial genes: MT- for human data, mt- for mouse data
hca_adata.var["mt"] = hca_adata.var_names.str.startswith("MT-")
# check if have mouse mt genes
#todo: see alix function
count_true = (hca_adata.var_names.str.startswith("mt-")).sum()
print("mouse mt number", count_true)

# ribosomal genes
hca_adata.var["ribo"] = hca_adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
hca_adata.var["hb"] = hca_adata.var_names.str.contains(("^HB[^(P)]"))

  hca_adata.var["mt"] = hca_adata.var_names.str.startswith("MT-")


mouse mt number 0


In [None]:
p1 = sns.displot(hca_adata.obs["total_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
p2 = sc.pl.violin(hca_adata, "pct_counts_mt")
p3 = sc.pl.scatter(hca_adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

##### 4.3. WLG pipeline

In [22]:
# mitochondrial genes: MT- for human data, mt- for mouse data
wlg_adata.var["mt"] = wlg_adata.var_names.str.startswith("MT-")
# check if have mouse mt genes
#todo: see alix function
count_true = (wlg_adata.var_names.str.startswith("mt-")).sum()
print("mouse mt number", count_true)

# ribosomal genes
wlg_adata.var["ribo"] = wlg_adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
wlg_adata.var["hb"] = wlg_adata.var_names.str.contains(("^HB[^(P)]"))

mouse mt number 0


In [23]:
# add qc metric to obs
sc.pp.calculate_qc_metrics(
    wlg_adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
wlg_adata

AnnData object with n_obs × n_vars = 1136912 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic', 'reads_mapped_exonic_as', 'reads_mapped_intronic', 'reads_mapped_intronic_as', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'duplicate_reads', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_intergenic', 'reads_unmapped', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_molecule', 'reads_per_fragment', 'fragments

In [None]:
p1 = sns.displot(wlg_adata.obs["total_counts"], bins=100, kde=False)
#sc.pl.violin(wlg_adata, 'total_counts')
p2 = sc.pl.violin(wlg_adata, "pct_counts_mt")
p3 = sc.pl.scatter(wlg_adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [26]:
wlg_barcodes_to_keep, _ = sc.pp.filter_cells(wlg_adata, min_counts=500, inplace = False)
wlg_adata = wlg_adata[wlg_barcodes_to_keep, :]
wlg_adata

View of AnnData object with n_obs × n_vars = 12283 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic', 'reads_mapped_exonic_as', 'reads_mapped_intronic', 'reads_mapped_intronic_as', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'duplicate_reads', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_intergenic', 'reads_unmapped', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_molecule', 'reads_per_fragment', 'fra

In [None]:
p1 = sns.displot(wlg_adata.obs["total_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
p2 = sc.pl.violin(wlg_adata, "pct_counts_mt")
p3 = sc.pl.scatter(wlg_adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt")