### Analyses of output files from the pipeline optimus v6.0.0+
#### Aim of this part of analysis is predict the cell cycle

#### Load necessary libraries and useful functions

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import Union
import pandas as pd
import scanpy as sc
import numpy as np
import collections

In [3]:
import warnings
warnings.filterwarnings("ignore") 

#### Data Loading

In [9]:
input_h5ad = "../scAtlas/tmp/10k_pbmc_v3_out_filtered_cellbenderDefault_lowQcell_scDblFinder.h5ad"
adata = sc.read_h5ad(input_h5ad_raw)

Assuming we are loading a "filtered" file that contains only cells.


In [10]:
display(adata)

AnnData object with n_obs × n_vars = 1136912 × 58347
    obs: 'cell_names', 'CellID', 'emptydrops_Limited', 'emptydrops_IsCell', 'n_reads', 'noise_reads', 'perfect_molecule_barcodes', 'reads_mapped_exonic', 'reads_mapped_exonic_as', 'reads_mapped_intronic', 'reads_mapped_intronic_as', 'reads_mapped_uniquely', 'reads_mapped_multiple', 'duplicate_reads', 'spliced_reads', 'antisense_reads', 'n_molecules', 'n_fragments', 'fragments_with_single_read_evidence', 'molecules_with_single_read_evidence', 'perfect_cell_barcodes', 'reads_mapped_intergenic', 'reads_unmapped', 'reads_mapped_too_many_loci', 'n_genes', 'genes_detected_multiple_observations', 'emptydrops_Total', 'molecule_barcode_fraction_bases_above_30_mean', 'molecule_barcode_fraction_bases_above_30_variance', 'genomic_reads_fraction_bases_quality_above_30_mean', 'genomic_reads_fraction_bases_quality_above_30_variance', 'genomic_read_quality_mean', 'genomic_read_quality_variance', 'reads_per_molecule', 'reads_per_fragment', 'fragments

AnnData object with n_obs × n_vars = 12561 × 58347
    obs: 'background_fraction', 'cell_probability', 'cell_size', 'droplet_efficiency'
    var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed'
    uns: 'barcode_indices_for_latents', 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'swapping_fraction_dist_params', 'barcodes_analyzed', 'barcodes_analyzed_inds', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_elbo', 'learning_curve_train_epoch', 'target_false_positive_rate'
    obsm: 'gene_expression_encoding'

###### check barcode and gene names

In [11]:
display(adata.var_names)
display(adata.obs_names)

Index(['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A',
       'AL627309.6', 'OR4G11P', 'OR4F5', 'AL627309.1',
       ...
       'pRNA', 'RNA5-8S5', 'pRNA', 'RNA5-8SN2', 'AC007325.3', 'AC007325.1',
       'AC007325.4', 'AC007325.2', 'U6', 'U1'],
      dtype='object', length=58347)

Index(['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A',
       'AL627309.6', 'OR4G11P', 'OR4F5', 'AL627309.1',
       ...
       'pRNA', 'RNA5-8S5', 'pRNA', 'RNA5-8SN2', 'AC007325.3', 'AC007325.1',
       'AC007325.4', 'AC007325.2', 'U6', 'U1'],
      dtype='object', name='gene_name', length=58347)

Index(['AAACCCAAGAAACACT', 'AAACCCAAGAAACCAT', 'AAACCCAAGAAACTAC',
       'AAACCCAAGAAACTCA', 'AAACCCAAGAAACTGT', 'AAACCCAAGAAAGCGA',
       'AAACCCAAGAAAGTCT', 'AAACCCAAGAAATCCA', 'AAACCCAAGAACGCGT',
       'AAACCCAAGAAGCCAC',
       ...
       'TTTGTTGTCTTCTAAC', 'TTTGTTGTCTTCTCTT', 'TTTGTTGTCTTCTTCC',
       'TTTGTTGTCTTGATTC', 'TTTGTTGTCTTGCGCT', 'TTTGTTGTCTTGGTCC',
       'TTTGTTGTCTTGTACC', 'TTTGTTGTCTTTACAC', 'TTTGTTGTCTTTGCGC',
       'TTTGTTGTCTTTGCTA'],
      dtype='object', length=1136912)

Index(['TGCTTGCTCTAAGCCA', 'AATCACGTCCCGATCT', 'CTTGAGATCCATGCAA',
       'TTCCTTCAGTCTCCTC', 'TCATTACCAGCTCATA', 'TAATCTCAGCACCCAC',
       'TGAATGCGTCGCACGT', 'CTGAATGTCCTAGCTC', 'CAGAGCCTCTTCGATT',
       'TACATTCTCCGTAGTA',
       ...
       'ATTCTACGTGTCCAAT', 'TATCAGGAGAACCGCA', 'GAAGGGTGTACTAGCT',
       'GATGACTGTACAGTAA', 'AGAGAGCGTCCAGTTA', 'AGTACCAAGCGGGTAT',
       'CGTTCTGTCAAGCCGC', 'CGTAATGCAGCGTACC', 'CTCAGTCGTTATCTTC',
       'TATCAGGTCACTTGTC'],
      dtype='object', name='barcode', length=12561)

Find duplicate variable (gene) names

In [13]:
var_names = adata.var_names
duplicates = [item for item, count in collections.Counter(var_names).items() if count > 1]
print("Duplicate variable names:", duplicates)
print("Number of duplicate variable:", len(duplicates))

Duplicate variable names: ['U6', 'TP73-AS1', 'Y_RNA', 'SNORA77', 'SCARNA16', 'SNORA70', 'SCARNA11', 'SCARNA17', 'SCARNA18', 'snoU13', 'SNORA44', 'SNORA16A', 'SCARNA24', 'Metazoa_SRP', 'uc_338', 'SNORA62', 'SNORA63', 'SNORD46', 'SNORD38B', 'SNORA26', 'SNORA58', 'DLEU2_6', 'DLEU2_5', 'DLEU2_4', 'DLEU2_3', 'DLEU2_2', 'DLEU2_1', 'SNORA31', 'SNORA2', 'SNORD81', 'SNORA51', 'SNORA25', 'SNORA42', 'U3', 'SNORA40', '7SK', 'U1', 'U2', '5S_rRNA', 'U6atac', 'U4', 'SNORD59', 'SCARNA4', 'SNORD64', 'ACA64', 'RGS5', 'SCARNA20', 'U7', 'SNORA67', 'SNORA72', 'SNORD60', 'SNORD116', 'U8', 'LINC01115', 'SNORD18', 'SCARNA21', 'SNORA36', 'SNORD75', 'TMEM247', 'STPG4', 'SNORA75', 'SNORA12', 'SNORD78', 'ACA59', 'SNORA74', 'snoU109', 'SNORA19', 'ACTR3BP2', 'DAOA-AS1_2', 'SCARNA15', 'SNORA48', 'SNORD56', 'PDE11A', 'SNORA43', 'SNORA17', 'PCGEM1', 'SNORA4', 'SNORD70', 'SNORD11', 'SNORA1', 'Vault', 'SNORD51', 'SCARNA6', 'SNORD39', 'LINC01238', 'GHRLOS', 'SNORD5', 'SNORA64', 'SNORD77', 'PRSS50', 'CYB561D2', 'SNORD19B'

Make variable names unique

###### Not all variable names are unique: This indicates that some variables (=genes) appear more than once which can lead to errors or unintended behavior for downstream analysis tasks. We execute the proposed function var_names_make_unique() which makes the variable name unique by appending a number string to each duplicate index element: ‘1’, ‘2’, etc.

In [14]:
adata.var_names_make_unique()

#### Cell cycle prediction

In [None]:
cell_cycle_genes = pd.read_csv(
    "https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Homo_sapiens.csv"
)  # This is the same source as the automated Seurat function
cell_cycle_genes

sc.tl.score_genes_cell_cycle(
    adata_cellbender,
    s_genes=cell_cycle_genes[cell_cycle_genes.phase == "S"].geneID.unique(),
    g2m_genes=cell_cycle_genes[cell_cycle_genes.phase == "G2/M"].geneID.unique(),
)
#TODO better for filtered cell or all cell

In [None]:
adata.layers["log_transformed"] = np.log1p(adata.X)
adata.to_df(layer="log_transformed")
