### Analyses of output files from the pipeline optimus v6.0.0+
#### Aim of this part of analysis is predict the cell cycle

#### Load necessary libraries and useful functions

In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import scanpy as sc
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings("ignore") 

#### Data Loading

In [25]:
input_h5ad = "../scAtlas/tmp/10k_pbmc_v3_out_filtered_cellbenderDefault_lowQcell_scDblFinder.h5ad"
adata = sc.read_h5ad(input_h5ad)

In [21]:
display(adata)

AnnData object with n_obs × n_vars = 11018 × 58347
    obs: 'background_fraction', 'cell_probability', 'cell_size', 'droplet_efficiency', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'outlier', 'scDblFinder_score', 'scDblFinder_class'
    var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'barcode_indices_for_latents', 'barcodes_analyzed', 'barcodes_analyzed_inds', 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_e

###### check barcode and gene names

In [5]:
display(adata.var_names)
display(adata.obs_names)

Index(['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A',
       'AL627309.6', 'OR4G11P', 'OR4F5', 'AL627309.1',
       ...
       'pRNA-11', 'RNA5-8S5', 'pRNA-12', 'RNA5-8SN2', 'AC007325.3',
       'AC007325.1', 'AC007325.4', 'AC007325.2', 'U6-36', 'U1-15'],
      dtype='object', name='gene_name', length=58347)

Index(['CAGAGCCTCTTCGATT', 'TACATTCTCCGTAGTA', 'CACTAAGGTCACCACG',
       'ATCACGAAGGGACACT', 'TGTTACTCATAACTCG', 'ATAGACCGTCAGGCAA',
       'AATCGACGTTTCGTAG', 'ATCTTCATCCCGAATA', 'TGTTACTTCTACAGGT',
       'AGACCCGAGAATACAC',
       ...
       'CTTTCGGAGATCGCTT', 'CCCTCTCCACAGCTTA', 'CCACAAAAGCGTCTCG',
       'TTCTGTACAGCAGTTT', 'CACCGTTCAATCCTAG', 'CCTAAGACAGCGAACA',
       'CTCCTTTGTACAGTTC', 'CCTACGTAGCACCTGC', 'GAGGGATAGTAAACGT',
       'TATTGGGGTCATACCA'],
      dtype='object', name='barcode', length=11018)

In [26]:
adata_singlet = adata.copy()
adata_singlet = adata[adata.obs['scDblFinder_class'] == 'singlet']

In [27]:
adata_singlet

View of AnnData object with n_obs × n_vars = 10214 × 58347
    obs: 'background_fraction', 'cell_probability', 'cell_size', 'droplet_efficiency', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'outlier', 'scDblFinder_score', 'scDblFinder_class'
    var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'barcode_indices_for_latents', 'barcodes_analyzed', 'barcodes_analyzed_inds', 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve

#### Cell cycle prediction

In [10]:
cell_cycle_genes = pd.read_csv(
    "https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Homo_sapiens.csv"
)  # This is the same source as the automated Seurat function
cell_cycle_genes

Unnamed: 0,phase,geneID,modified
0,G2/M,ENSG00000010292,9/13/17
1,G2/M,ENSG00000011426,9/13/17
2,G2/M,ENSG00000013810,9/13/17
3,G2/M,ENSG00000072571,9/13/17
4,G2/M,ENSG00000075218,9/13/17
...,...,...,...
92,S,ENSG00000175305,9/13/17
93,S,ENSG00000176890,9/13/17
94,S,ENSG00000197299,9/13/17
95,S,ENSG00000198056,9/13/17


In [12]:
hgnc_ensg_df = pd.DataFrame(adata.var.gene_id.str.split('.').str[0]).reset_index()

cell_cycle_genes_with_hgnc = cell_cycle_genes.merge(hgnc_ensg_df, left_on='geneID', right_on='gene_id', how='left')
cell_cycle_genes_with_hgnc = cell_cycle_genes_with_hgnc.drop(columns=['gene_id'])
cell_cycle_genes_with_hgnc

Unnamed: 0,phase,geneID,modified,gene_name
0,G2/M,ENSG00000010292,9/13/17,NCAPD2
1,G2/M,ENSG00000011426,9/13/17,ANLN
2,G2/M,ENSG00000013810,9/13/17,TACC3
3,G2/M,ENSG00000072571,9/13/17,HMMR
4,G2/M,ENSG00000075218,9/13/17,GTSE1
...,...,...,...,...
92,S,ENSG00000175305,9/13/17,CCNE2
93,S,ENSG00000176890,9/13/17,TYMS
94,S,ENSG00000197299,9/13/17,BLM
95,S,ENSG00000198056,9/13/17,PRIM1


In [28]:
sc.tl.score_genes_cell_cycle(
    adata_singlet,
    s_genes=cell_cycle_genes_with_hgnc[cell_cycle_genes_with_hgnc.phase == "S"].gene_name.unique(),
    g2m_genes=cell_cycle_genes_with_hgnc[cell_cycle_genes_with_hgnc.phase == "G2/M"].gene_name.unique(),
)

  adata.obs[score_name] = pd.Series(
  next(self.gen)


In [29]:
adata_singlet.obs

Unnamed: 0_level_0,background_fraction,cell_probability,cell_size,droplet_efficiency,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,outlier,scDblFinder_score,scDblFinder_class,S_score,G2M_score,phase
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
CAGAGCCTCTTCGATT,0.001418,0.999955,17112.318359,2.482191,4812,8.479076,40838.0,10.617393,49.248249,4445.0,8.399760,10.884470,False,0.001850,singlet,-0.570551,-0.771318,G1
TACATTCTCCGTAGTA,0.001654,0.999955,17020.669922,2.468767,5152,8.547334,40432.0,10.607402,39.490997,3020.0,8.013343,7.469331,False,0.060444,singlet,-0.819135,-1.629027,G1
ATCACGAAGGGACACT,0.001414,0.999955,16725.431641,2.459050,4174,8.336870,39553.0,10.585422,65.992466,1273.0,7.149917,3.218466,False,0.001599,singlet,-0.376130,-0.529199,G1
ATCTTCATCCCGAATA,0.001732,0.999955,16453.822266,2.411249,6309,8.749891,38049.0,10.546656,18.662777,3729.0,8.224164,9.800520,False,0.037430,singlet,-1.309073,-1.486649,G1
TGTTACTTCTACAGGT,0.001568,0.999955,16203.316406,2.377210,5381,8.590815,36933.0,10.516888,38.142041,2562.0,7.848934,6.936886,False,0.129985,singlet,-0.990421,-3.163910,G1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCTAAGACAGCGAACA,0.293777,0.999984,2954.608154,0.608236,522,6.259581,976.0,6.884487,42.315574,118.0,4.779123,12.090164,False,0.000025,singlet,-0.002591,-0.020930,G1
CTCCTTTGTACAGTTC,0.264426,0.999971,2967.668945,0.599723,515,6.246107,1007.0,6.915723,43.594836,71.0,4.276666,7.050645,False,0.000035,singlet,-0.036269,0.039276,G2M
CCTACGTAGCACCTGC,0.276518,0.999998,2711.298096,0.675461,614,6.421622,989.0,6.897705,29.120324,92.0,4.532599,9.302326,False,0.000019,singlet,-0.002651,-0.018691,G1
GAGGGATAGTAAACGT,0.275862,0.999977,2838.524658,0.632272,532,6.278521,987.0,6.895683,37.284701,89.0,4.499810,9.017224,False,0.000055,singlet,-0.023316,-0.025581,G1


In [32]:
adata_singlet.obs['phase'].value_counts()

G1     9898
S       239
G2M      77
Name: phase, dtype: int64

#### Write output to h5ad

In [49]:
PATH_TO_OUTPUT = "../scAtlas/tmp/10k_pbmc_v3_out_filtered_cellbenderDefault_lowQcell_scDblFinderSinglet_CellCycle.h5ad"
adata_singlet.write_h5ad(PATH_TO_OUTPUT)

#### Draft

In [None]:
def get_ensembl_mappings():                                   
                                              
    server = biomart.BiomartServer('http://ensembl.org/biomart')         
    mart = server.datasets['hsapiens_gene_ensembl']
    
                                             
    attributes = ['hgnc_symbol', 'ensembl_gene_id']  # List the types of data we want  
                                     
    response = mart.search({'attributes': attributes})                          
    data = response.raw.data.decode('ascii')                                    
                                                                                
    attributes_dic= {}
    for line in data.splitlines():                                              
        hgnc_symbol, ensembl_gene = line.split('\t')
        attributes_dic[hgnc_symbol] = ensembl_gene
    
    return attributes_dic

#hgnc_ensembl = get_ensembl_mappings()