# What is this notebook about?

Here we load the data and perform a simple exploratory data analysis (EDA). 

Data from the Nature paper (published in 2021; labelled "Lundberg2020" in the plot titles below): 
https://www.nature.com/articles/s41586-021-03232-9
"Spatiotemporal dissection of the cell cycle with single-cell proteogenomics"
Emma Lundberg et al.

Remarks on data:
Loom file: 1152 cells × 58884 genes expression matrix.  Stored in sparse matrix format. 

**PROBLEM WITH THE LOOM FILE** - we cannot correctly merge it with the CSV file that holds the cell cycle phase information - 
the file SraRunTable.txt (the order of rows in the two files is different and there is no key to merge on - this conclusion was reached from visualization and comparison with the CSV file containing the count matrix).

It is better to use the CSV count-matrix file - it does not have this problem. We will use it in the next versions of the notebook.








# Import and install modules

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory, then print them one per line
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import time
t0start = time.time()  # timestamp taken when this cell runs (not read again in the visible part of the notebook)

import pandas as pd
import numpy as np
import os
import sys

import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 70  # global figure resolution for every plot below
plt.style.use('dark_background')  # global matplotlib style for every plot below

import seaborn as sns

from sklearn.decomposition import PCA

In [None]:
!pip install scanpy
import scanpy as sc
import anndata

!pip install loompy # scanpy needs it to load loom files 


# Load data

In [None]:
# Label prepended to plot titles; the preprocessing cell also checks it for the 'LoomFile' substring to choose its QC thresholds
str_data_inf = ' Lundberg2020 u2os LoomFile' # Cell type u2os

In [None]:
# Location of the loom file inside the Kaggle dataset
fn = '/kaggle/input/single-cell-rnaseq-data-related-to-cell-cycle/u2os.loom'

import time
t0 = time.time()
adata = sc.read_loom(fn)
# Row sums of X (one total per cell), flattened to 1-D and stored as an obs column
per_cell_totals = adata.X.sum(axis = 1)
adata.obs['n.umi'] = np.asarray(per_cell_totals).ravel()
# Untouched copy; the preprocessing cell starts again from it
adata_orig = adata.copy()
elapsed = time.time() - t0
print(np.round(elapsed,1), 'seconds')
adata

# Look at the data

In [None]:
# Display the per-gene annotation table
adata.var

In [None]:
# Display the per-cell annotation table (includes the 'n.umi' column added at load time)
adata.obs

In [None]:
# Display the main expression matrix object
adata.X

In [None]:
# Dense view of the top-left 15 x 15 corner of X
adata.X[:15,:15].toarray()

In [None]:
# Display the additional matrices (layers) stored alongside X
adata.layers

In [None]:
# Display the 'matrix' layer object
adata.layers['matrix']

In [None]:
# Dense view of the top-left 15 x 15 corner of the 'matrix' layer
adata.layers['matrix'][:15,:15].toarray()

In [None]:
# Display the 'ambiguous' layer object
adata.layers['ambiguous']

In [None]:
# Dense view of the top-left 15 x 15 corner of the 'ambiguous' layer
adata.layers['ambiguous'][:15,:15].toarray()

In [None]:
# Display the 'spanning' layer object
adata.layers['spanning']

In [None]:
# Dense view of the top-left 15 x 15 corner of the 'spanning' layer
adata.layers['spanning'][:15,:15].toarray()

In [None]:
# Display the 'spliced' layer object
adata.layers['spliced']

In [None]:
# Dense view of the top-left 15 x 15 corner of the 'spliced' layer
adata.layers['spliced'][:15,:15].toarray()

In [None]:
# Display the 'unspliced' layer object
adata.layers['unspliced']

In [None]:
# Dense view of the top-left 15 x 15 corner of the 'unspliced' layer
adata.layers['unspliced'][:15,:15].toarray()

# EDA

## Expression per cell

In [None]:
# Total counts of every cell (row sums of X), stored as an obs column
adata.obs['n.umi'] = np.asarray(adata.X.sum(axis = 1)).ravel()
umi_per_cell = adata.obs['n.umi']

# Left panel: sorted per-cell totals; right panel: their histogram
fig, (ax_sorted, ax_hist) = plt.subplots(1, 2, figsize = (20,6))

ax_sorted.plot(np.sort(umi_per_cell))
ax_sorted.set_title('Expression per cell')
ax_sorted.set_xlabel('cells sorted')
ax_sorted.set_ylabel('Counts')

ax_hist.hist(np.sort(umi_per_cell), bins = 100)
ax_hist.set_title('Expression per cell')

plt.show()

umi_per_cell.describe()

## Expression per gene

In [None]:
# Total counts of every gene (column sums of X); summary of the raw totals first
gene_totals = np.asarray(adata.X.sum(axis = 0)).ravel()
display(pd.Series(gene_totals).describe())

# Everything below works on log10(1 + total)
v = np.log10(1+gene_totals)

# Left panel: sorted log totals; right panel: their histogram
fig, (ax_sorted, ax_hist) = plt.subplots(1, 2, figsize = (20,6))

ax_sorted.plot(np.sort(v))
ax_sorted.set_title('LOG10 Expression per gene')
ax_sorted.set_xlabel('genes sorted')
ax_sorted.set_ylabel('Log10 (1+Counts) ')

ax_hist.hist(np.sort(v), bins = 30)
ax_hist.set_title('LOG10 Expression per gene')

plt.show()

# Summary of the log-transformed totals
pd.Series(v).describe()

In [None]:
# Store the per-gene total counts in the gene table and show the 50 largest
v = np.asarray(adata.X.sum(axis = 0)).ravel()
adata.var['counts'] = v
genes_by_total = adata.var.sort_values(by = 'counts', ascending = False)
genes_by_total.head(50)

# Visualizations 

## PCA (without preprocessing)

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.sparse` is loaded on every SciPy release (this also binds the name `scipy`).
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time
t0 = time.time()
# Sparse X: TruncatedSVD works on the sparse matrix directly. Note that it does not
# centre the data, so the "PCA" label on the plot is only approximate in that case.
# Dense X: ordinary PCA.
if scipy.sparse.issparse(adata.X):
    reducer = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
else:
    reducer = PCA(n_components=2)

# r: one row per cell, one column per component
r = reducer.fit_transform(adata.X)

fig = plt.figure(figsize = (25,12))
plt.title(str_data_inf + ' PCA  n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
ax = sns.scatterplot(x=r[:,0],y=r[:,1])
plt.xlabel('PCA1' , fontsize = 20 )
plt.ylabel('PCA2' , fontsize = 20 )
plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 


## UMAP (without preprocessing)

In [None]:
import scipy
import umap 
t0 = time.time()
# UMAP is stochastic: fix the seed so that re-running the notebook reproduces the same
# embedding (same seed value as the TruncatedSVD cells).
reducer = umap.UMAP(random_state = 42)
# r: one row per cell, two embedding coordinates
r = reducer.fit_transform(adata.X)

fig = plt.figure(figsize = (25,12))
plt.title(str_data_inf + ' UMAP n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
ax = sns.scatterplot(x=r[:,0],y=r[:,1])
plt.xlabel('UMAP1' , fontsize = 20 )
plt.ylabel('UMAP2' , fontsize = 20 )
plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 


In [None]:
import scipy
import umap 
t0 = time.time()
# Same plot as the default UMAP, but with a much larger neighbourhood and looser packing.
# The seed is fixed so that re-running the notebook reproduces the same embedding.
reducer = umap.UMAP(n_neighbors = 250,min_dist = 0.9, random_state = 42)

# r: one row per cell, two embedding coordinates
r = reducer.fit_transform(adata.X)

fig = plt.figure(figsize = (25,12))
plt.title(str_data_inf + ' UMAP(250,0.9) n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
ax = sns.scatterplot(x=r[:,0],y=r[:,1])
plt.xlabel('UMAP1' , fontsize = 20 )
plt.ylabel('UMAP2' , fontsize = 20 )
plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 


## PCA colored by genes

In [None]:
# First two components of X, drawn once per colouring mode (one panel each).
plot_mode = 'PCA'# 'Phase_plot':
n_x_subplots = 3  # panels per figure row
genes_processing_mode = 'discretize_3bins_'# 'median_binarize_'

# Boolean selector of the cells to plot (all cells here)
mask = np.ones( adata.X.shape[0]).astype(bool)

# Explicit submodule import: a bare `import scipy` does not guarantee scipy.sparse is loaded
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time
t0 = time.time()
# Sparse X -> TruncatedSVD (no centring, fixed seed); dense X -> PCA
if scipy.sparse.issparse(adata.X):
    reducer = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
else:
    reducer = PCA(n_components=2)

r = reducer.fit_transform(adata.X)

# Supported colouring modes:
#   * name of an obs column                     -> values used as they are
#   * name of a gene                            -> expression, processed per genes_processing_mode
#   * 'median_binarize_<obs column>'            -> column > its median
#   * 'threshold_binarize_<value>_<obs column>' -> column > value
# e.g. 'cell_cycle_phase', 'PCA1ALL', 'threshold_binarize_5_PCA1ALL', 'pct_counts_mt',
#      'PCNA', 'TOP2A', 'CCNE1', 'CCNE2', 'CDK2', 'CCNB1', 'CCNB2', 'CCNB3', 'CCNA2'
c = 0
for color_by_mode in ['n.umi', 'E2F1', 'FOXM1']:
    if c % n_x_subplots == 0:
        # Current row is full (or this is the first panel): open a new figure
        fig = plt.figure(figsize = (20,5) ); c = 0
        plt.suptitle(str_data_inf + ' PCA  n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
    c += 1; fig.add_subplot(1,n_x_subplots ,c)
    plt.title(str(color_by_mode), fontsize = 20)

    # Resolve the mode into per-cell values; color_by_mode becomes None when it cannot be resolved
    color_by_field_name = color_by_mode
    if color_by_mode in adata.obs:
        color_by = (adata[mask].obs[color_by_field_name])
    elif color_by_mode in adata.var.index:
        I_gene = np.where(adata.var.index == color_by_mode)[0]
        v = adata[mask].X[:,I_gene]
        if scipy.sparse.issparse(v):
            v = v.toarray()
        # Always flatten, so that a dense X gives a 1-D vector as well
        v = np.asarray(v).ravel()
        if genes_processing_mode == 'median_binarize_':
            median_loc = np.median( v  )
            color_by = v > median_loc
        elif genes_processing_mode == 'discretize_3bins_':
            # 0 / 1 / 2 = below the 33rd percentile / in between / above the 66th percentile
            t1 = np.percentile(v,33)
            t2 = np.percentile(v,66)
            color_by = (v > t1).astype(int) + (v > t2).astype(int)
        else:
            color_by = v
    elif color_by_mode.startswith('median_binarize_'):
        color_by_field_name = color_by_mode[len('median_binarize_'):]
        if color_by_field_name in adata.obs:
            color_by = (adata[mask].obs[color_by_field_name]) > np.median( (adata.obs[color_by_field_name]) )
        else: color_by_mode = None
    elif color_by_mode.startswith('threshold_binarize_'):
        # maxsplit=3 keeps field names that themselves contain underscores intact
        _, _, threshold_text, color_by_field_name = color_by_mode.split('_', 3)
        threshold_binarize = float( threshold_text )
        if color_by_field_name in adata.obs:
            color_by = (adata[mask].obs[color_by_field_name]) > threshold_binarize
        else: color_by_mode = None
    else:
        color_by_mode = None

    # Coordinates are restricted with the same mask as the colours so their lengths always agree
    v1 = r[mask,0]
    v2 = r[mask,1]
    if color_by_mode is None:
        ax = sns.scatterplot(x=v1, y = v2)
    else:
        # Fixed three-colour palette only for a three-valued 'cell_cycle_phase'; viridis otherwise
        if color_by_field_name == 'cell_cycle_phase' and len(np.unique(color_by)) == 3:
            color_palette = ['red', 'green','blue' ]
        else:
            color_palette = "viridis"
        ax = sns.scatterplot(x=v1, y = v2,  hue= color_by, palette = color_palette )
        plt.setp(ax.get_legend().get_texts(), fontsize='20') # for legend text
        plt.setp(ax.get_legend().get_title(), fontsize='20') # for legend title


plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 


## Different axes PCA colored by genes 

In [None]:
# Several pairs of components of X, each drawn once per colouring mode (one figure row per pair).
plot_mode = 'PCA'# 'Phase_plot':
n_x_subplots = 3  # panels per figure row
genes_processing_mode = 'discretize_3bins_'# 'median_binarize_'

# Boolean selector of the cells to plot (all cells here)
mask = np.ones( adata.X.shape[0]).astype(bool)

# Explicit submodule import: a bare `import scipy` does not guarantee scipy.sparse is loaded
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time
t0 = time.time()
# Sparse X -> TruncatedSVD (no centring, fixed seed); dense X -> PCA
if scipy.sparse.issparse(adata.X):
    reducer = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
else:
    reducer = PCA(n_components=10)

r = reducer.fit_transform(adata.X)

# Supported colouring modes:
#   * name of an obs column                     -> values used as they are
#   * name of a gene                            -> expression, processed per genes_processing_mode
#   * 'median_binarize_<obs column>'            -> column > its median
#   * 'threshold_binarize_<value>_<obs column>' -> column > value
c = 0
for i,j in [(0,1),(2,3),(3,4),(5,6)]:  # zero-based component indices used as x / y
    for color_by_mode in ['n.umi', 'E2F1', 'FOXM1']:
        if c % n_x_subplots == 0:
            # Current row is full (or this is the first panel): open a new figure
            fig = plt.figure(figsize = (20,5) ); c = 0
            plt.suptitle(str_data_inf + ' PCA  n_cells: ' + str(adata.X.shape[0]) +\
                ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
        c += 1; fig.add_subplot(1,n_x_subplots ,c)
        plt.title(str(color_by_mode) + ' PCA '+str(i)+','+str(j), fontsize = 20)

        # Resolve the mode into per-cell values; color_by_mode becomes None when it cannot be resolved
        color_by_field_name = color_by_mode
        if color_by_mode in adata.obs:
            color_by = (adata[mask].obs[color_by_field_name])
        elif color_by_mode in adata.var.index:
            I_gene = np.where(adata.var.index == color_by_mode)[0]
            v = adata[mask].X[:,I_gene]
            if scipy.sparse.issparse(v):
                v = v.toarray()
            # Always flatten, so that a dense X gives a 1-D vector as well
            v = np.asarray(v).ravel()
            if genes_processing_mode == 'median_binarize_':
                median_loc = np.median( v  )
                color_by = v > median_loc
            elif genes_processing_mode == 'discretize_3bins_':
                # 0 / 1 / 2 = below the 33rd percentile / in between / above the 66th percentile
                t1 = np.percentile(v,33)
                t2 = np.percentile(v,66)
                color_by = (v > t1).astype(int) + (v > t2).astype(int)
            else:
                color_by = v
        elif color_by_mode.startswith('median_binarize_'):
            color_by_field_name = color_by_mode[len('median_binarize_'):]
            if color_by_field_name in adata.obs:
                color_by = (adata[mask].obs[color_by_field_name]) > np.median( (adata.obs[color_by_field_name]) )
            else: color_by_mode = None
        elif color_by_mode.startswith('threshold_binarize_'):
            # maxsplit=3 keeps field names that themselves contain underscores intact
            _, _, threshold_text, color_by_field_name = color_by_mode.split('_', 3)
            threshold_binarize = float( threshold_text )
            if color_by_field_name in adata.obs:
                color_by = (adata[mask].obs[color_by_field_name]) > threshold_binarize
            else: color_by_mode = None
        else:
            color_by_mode = None

        # Coordinates are restricted with the same mask as the colours so their lengths always agree
        v1 = r[mask,i]
        v2 = r[mask,j]
        if color_by_mode is None:
            ax = sns.scatterplot(x=v1, y = v2)
        else:
            # Fixed three-colour palette only for a three-valued 'cell_cycle_phase'; viridis otherwise
            if color_by_field_name == 'cell_cycle_phase' and len(np.unique(color_by)) == 3:
                color_palette = ['red', 'green','blue' ]
            else:
                color_palette = "viridis"
            ax = sns.scatterplot(x=v1, y = v2,  hue= color_by, palette = color_palette )
            plt.setp(ax.get_legend().get_texts(), fontsize='20') # for legend text
            plt.setp(ax.get_legend().get_title(), fontsize='20') # for legend title


plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 


## Plots in gene axes

In [None]:

# Scatter plot of the cells in the plane spanned by the expression of two genes.
# Explicit import so the cell does not depend on scipy having been imported by an earlier cell
import scipy.sparse

# Boolean selector of the cells to plot (all cells here)
mask = np.ones( adata.X.shape[0]).astype(bool)

gene1 = 'E2F1' 
gene2 = 'FOXM1'

# Colour the points by an obs column when it exists; otherwise draw them uncoloured
color_by_mode = 'n.umi' # other options: 'cell_cycle_phase', 'PCA1ALL', 'threshold_binarize_5_PCA1ALL'
if color_by_mode in adata.obs:
    color_by_field_name = color_by_mode
    color_by = (adata[mask].obs[color_by_field_name])
else:
    color_by_mode = None

# Expression vector of each gene over the selected cells (X is sliced only once)
X_selected = adata[mask].X
gene_vectors = []
for gene in (gene1, gene2):
    I_gene = np.where(adata.var.index == gene)[0]
    v = X_selected[:,I_gene]
    if scipy.sparse.issparse(v):
        v = v.toarray()
    # Always flatten, so that a dense X gives a 1-D vector as well
    gene_vectors.append(np.asarray(v).ravel())
v1, v2 = gene_vectors

fig = plt.figure(figsize = (20,10) ); c = 0
plt.title(str_data_inf + ' Genes axes   n_cells: ' + str(adata.X.shape[0]) +\
    ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )

if color_by_mode is None:
    ax = sns.scatterplot(x=v1,y=v2)
else:
    ax = sns.scatterplot(x=v1,y=v2,   hue= color_by)
    plt.setp(ax.get_legend().get_texts(), fontsize='15') # for legend text
    plt.setp(ax.get_legend().get_title(), fontsize='15') # for legend title

plt.xlabel(gene1,  fontsize = 20)
plt.ylabel(gene2,  fontsize = 20)
plt.show()   
    


In [None]:
# Grid of gene-vs-gene scatter plots (n_x_subplots panels per figure row), coloured by an obs column.
# Explicit import so the cell does not depend on scipy having been imported by an earlier cell
import scipy.sparse

# Boolean selector of the cells to plot (all cells here)
mask = np.ones( adata.X.shape[0]).astype(bool)

n_x_subplots = 4  # panels per figure row

gene_pairs = [ ('E2F1','FOXM1'), ('PCNA','TOP2A'), ('CCNB1', 'CCNB2'), ('CCNE1', 'CCNB1'),
               ('E2F1','PCNA'),('FOXM1','PCNA'), ('E2F1','TOP2A'),('FOXM1','TOP2A'), ]

# The colouring is the same for every panel, so it is resolved once, outside the loop
color_by_mode = 'n.umi' # other options: 'cell_cycle_phase', 'PCA1ALL', 'threshold_binarize_5_PCA1ALL'
if color_by_mode in adata.obs:
    color_by_field_name = color_by_mode
    color_by = (adata[mask].obs[color_by_field_name])
else:
    color_by_mode = None

# Expression matrix of the selected cells, sliced once and reused for every gene
X_selected = adata[mask].X

c = 0 
for gene1,gene2 in gene_pairs:
    if c % n_x_subplots == 0:
        # Current row is full (or this is the first panel): open a new figure
        fig = plt.figure(figsize = (20,6) ); c = 0
        plt.suptitle(str_data_inf + ' Genes axes   n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' color by '+ str(color_by_mode) , fontsize = 20   )

    c += 1; fig.add_subplot(1,n_x_subplots ,c)
    plt.title(gene1 + ' ' + gene2)

    # Expression vector of each gene of the pair
    gene_vectors = []
    for gene in (gene1, gene2):
        I_gene = np.where(adata.var.index == gene)[0]
        v = X_selected[:,I_gene]
        if scipy.sparse.issparse(v):
            v = v.toarray()
        # Always flatten, so that a dense X gives a 1-D vector as well
        gene_vectors.append(np.asarray(v).ravel())
    v1, v2 = gene_vectors

    if color_by_mode is None:
        ax = sns.scatterplot(x=v1,y=v2)
    else:
        # The panels are small, so no legend is drawn; the colouring is named in the figure title
        ax = sns.scatterplot(x=v1,y=v2,   hue= color_by, legend = False)

    plt.xlabel(gene1,  fontsize = 20)
    plt.ylabel(gene2,  fontsize = 20)
plt.show()  

# Preprocessing - normalizations / log / filtering 

In [None]:
# Make the gene names of the untouched copy unique before preprocessing (one call is enough)
adata_orig.var_names_make_unique()

In [None]:
# First standard preprocessing:
#   step 1 - QC metrics, then drop cells by total counts and mitochondrial percentage
#   step 2 - normalize, log-transform, keep the highly variable genes (plus a mandatory list)
# Starts from adata_orig and rebinds `adata` to the filtered, transformed object.

list_genes2include_mandotory = [ 'E2F1','FOXM1'] # can specify genes which we will keep even if they are not top variable
# Params: 
n_top_genes_to_keep = 10000
threshold_pct_counts_mt = 32
min_count = 150_000
max_count = 420_000# 12000
if 'LoomFile' not in str_data_inf:
    # Alternative thresholds, used when the data label does not mention the loom file
    threshold_pct_counts_mt = 25
    min_count = 300_000
    max_count = 700_000# 12000


print(adata_orig.X.sum())

# ################################################################################################
# Preprocessing first step: filter CELLs by  counts and level of MT-percent
# thresholds are set visually looking on violin plots 
# Examples:
# threshold_pct_counts_mt = 20 - 40  min_count = 500 - 1000; max_count = 10000 - 12000; 
# These thresholds depends on cell line dataset


adata = adata_orig.copy()
print(adata)

# Calculate and plot statistics of counts per cell, percent of mitochondrial genes (high percent is bad for cell)
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'n.umi', 'total_counts', 'pct_counts_mt'],jitter=1.9, multi_panel=True)
median_count = np.median(adata.obs['total_counts'])
print('Median total counts =',median_count)
print('min_count=',min_count,'max_count=',max_count)

print('Look at total_counts va MT-percent, expect some linear dependence - but does not happen: ')
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')

# Positions of the cells inside the count window / below the MT threshold
inds1 = np.where((adata.obs['total_counts']>min_count) & (adata.obs['total_counts']<max_count))
inds2 = np.where(adata.obs['pct_counts_mt']<threshold_pct_counts_mt)
print(len(inds1[0]),'samples pass the count filter')
print(len(inds2[0]),' samples pass the mt filter')
ind_samples = np.intersect1d(inds1[0],inds2[0])
print('Samples selected',len(ind_samples))
adata.uns['ind_samples'] = ind_samples

# Here we cut cells. Filtering out those with counts too low or too big.
# .copy() turns the subset into a real object: the steps below modify it in place,
# which must not be done on a view.
adata = adata[ind_samples,:].copy()

# ################################################################################################
# Preprocessing second step: 
# 1) normalization to some value, i.e. median of the total counts
# 2) taking logs
# 3) keeping only higly variable genes

sc.pp.normalize_total(adata, target_sum=np.median(adata.obs["total_counts"]))
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata,n_top_genes=n_top_genes_to_keep,n_bins=20)
ind_genes = np.where(adata.var['highly_variable'])[0]
ind_genes2 = np.where(adata.var.index.isin(list_genes2include_mandotory ) )[0]
# Sorted union: the kept genes stay in their original order instead of the arbitrary order of a set
ind_genes = np.union1d(ind_genes, ind_genes2)
adata = adata[:,ind_genes].copy()

print('Violin plots after filtering cells and genes')
sc.pl.violin(adata, ['n_genes_by_counts','n.umi', 'total_counts', 'pct_counts_mt'],jitter=1.9, multi_panel=True)


# Visualizations after preprocessing

## Various PCA

In [None]:
# Three groups of plots on the preprocessed data:
#   1) plain scatter of the first two components,
#   2) the same scatter coloured by 'n.umi' and by selected genes,
#   3) further pairs of components with the same colourings.
# Explicit submodule import: a bare `import scipy` does not guarantee scipy.sparse is loaded
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time

plot_mode = 'PCA'# 'Phase_plot':
n_x_subplots = 3  # panels per figure row
genes_processing_mode = 'discretize_3bins_'# 'median_binarize_'

# Boolean selector of the cells to plot (all cells here)
mask = np.ones( adata.X.shape[0]).astype(bool)

# Colouring modes drawn in groups 2 and 3; see _resolve_color_by for the accepted forms
color_by_modes = ['n.umi', 'E2F1', 'FOXM1']


def _fit_components(X, n_components):
    """Return the projection of X onto its leading components (rows = cells).

    Sparse X goes through TruncatedSVD (no centring, fixed seed), dense X through PCA.
    """
    if scipy.sparse.issparse(X):
        reducer = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
    else:
        reducer = PCA(n_components=n_components)
    return reducer.fit_transform(X)


def _resolve_color_by(adata, mask, color_by_mode, genes_processing_mode):
    """Translate a colouring mode into per-cell values.

    Accepted modes:
      * name of an obs column                     -> values used as they are
      * name of a gene                            -> expression, processed per genes_processing_mode
      * 'median_binarize_<obs column>'            -> column > its median
      * 'threshold_binarize_<value>_<obs column>' -> column > value

    Returns (color_by, field_name); color_by is None when the mode cannot be resolved.
    """
    if color_by_mode in adata.obs:
        return adata[mask].obs[color_by_mode], color_by_mode
    if color_by_mode in adata.var.index:
        I_gene = np.where(adata.var.index == color_by_mode)[0]
        v = adata[mask].X[:,I_gene]
        if scipy.sparse.issparse(v):
            v = v.toarray()
        # Always flatten, so that a dense X gives a 1-D vector as well
        v = np.asarray(v).ravel()
        if genes_processing_mode == 'median_binarize_':
            return v > np.median(v), color_by_mode
        if genes_processing_mode == 'discretize_3bins_':
            # 0 / 1 / 2 = below the 33rd percentile / in between / above the 66th percentile
            t1 = np.percentile(v,33)
            t2 = np.percentile(v,66)
            return (v > t1).astype(int) + (v > t2).astype(int), color_by_mode
        return v, color_by_mode
    if color_by_mode.startswith('median_binarize_'):
        field_name = color_by_mode[len('median_binarize_'):]
        if field_name in adata.obs:
            return adata[mask].obs[field_name] > np.median(adata.obs[field_name]), field_name
        return None, field_name
    if color_by_mode.startswith('threshold_binarize_'):
        # maxsplit=3 keeps field names that themselves contain underscores intact
        _, _, threshold_text, field_name = color_by_mode.split('_', 3)
        threshold_binarize = float(threshold_text)
        if field_name in adata.obs:
            return adata[mask].obs[field_name] > threshold_binarize, field_name
        return None, field_name
    return None, color_by_mode


def _plot_colored_panels(adata, mask, r, axis_pairs, color_by_modes, axes_in_title):
    """Draw one panel per (component pair, colouring mode), n_x_subplots panels per figure.

    r holds the component scores (rows = cells); axis_pairs are zero-based column pairs of r.
    When axes_in_title is true the pair is appended to each panel title.
    """
    c = 0
    for i, j in axis_pairs:
        for color_by_mode in color_by_modes:
            if c % n_x_subplots == 0:
                # Current row is full (or this is the first panel): open a new figure
                fig = plt.figure(figsize = (20,5) ); c = 0
                plt.suptitle(str_data_inf + ' PCA  n_cells: ' + str(adata.X.shape[0]) +\
                    ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
            c += 1; fig.add_subplot(1,n_x_subplots ,c)
            panel_title = str(color_by_mode)
            if axes_in_title:
                panel_title += ' PCA '+str(i)+','+str(j)
            plt.title(panel_title, fontsize = 20)

            color_by, field_name = _resolve_color_by(adata, mask, color_by_mode, genes_processing_mode)

            # Coordinates are restricted with the same mask as the colours so their lengths always agree
            v1 = r[mask,i]
            v2 = r[mask,j]
            if color_by is None:
                sns.scatterplot(x=v1, y = v2)
                continue
            # Fixed three-colour palette only for a three-valued 'cell_cycle_phase'; viridis otherwise
            if field_name == 'cell_cycle_phase' and len(np.unique(color_by)) == 3:
                color_palette = ['red', 'green','blue' ]
            else:
                color_palette = "viridis"
            ax = sns.scatterplot(x=v1, y = v2,  hue= color_by, palette = color_palette )
            plt.setp(ax.get_legend().get_texts(), fontsize='20') # for legend text
            plt.setp(ax.get_legend().get_title(), fontsize='20') # for legend title
    plt.show()


# ################################################################################################################
# Plain PCA
# ################################################################################################################
t0 = time.time()
r = _fit_components(adata.X, 2)

fig = plt.figure(figsize = (25,12))
plt.title(str_data_inf + ' PCA  n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
ax = sns.scatterplot(x=r[:,0],y=r[:,1])
plt.xlabel('PCA1' , fontsize = 20 )
plt.ylabel('PCA2' , fontsize = 20 )
plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 

# ################################################################################################################
# PCA colored by genes n.umi etc
# ################################################################################################################
t0 = time.time()
r = _fit_components(adata.X, 2)
_plot_colored_panels(adata, mask, r, [(0,1)], color_by_modes, axes_in_title = False)
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 

# ################################################################################################################
# PCA different axes 
# ################################################################################################################
t0 = time.time()
r = _fit_components(adata.X, 10)
_plot_colored_panels(adata, mask, r, [(2,3),(3,4),(5,6)], color_by_modes, axes_in_title = True)
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 

## UMAP (with several param choices)

In [None]:
# UMAP of the preprocessed data for several parameter choices, one full-size figure each.
import scipy
import umap 

# (label inserted into the plot title, UMAP keyword arguments); the first entry uses the UMAP defaults
umap_variants = [
    ('', {}),
    ('(250,0.9)', {'n_neighbors': 250, 'min_dist': 0.9}),
    ('(20,0.2)', {'n_neighbors': 20, 'min_dist': 0.2}),
]

for variant_no, (params_label, umap_kwargs) in enumerate(umap_variants):
    if variant_no > 0:
        print('change umap params')

    t0 = time.time()
    # UMAP is stochastic: fix the seed so that re-running the notebook reproduces the same embedding
    reducer = umap.UMAP(random_state = 42, **umap_kwargs)

    # r: one row per cell, two embedding coordinates
    r = reducer.fit_transform(adata.X)

    fig = plt.figure(figsize = (25,12))
    # The title carries the parameters actually used for this embedding
    plt.title(str_data_inf + ' UMAP' + params_label + ' n_cells: ' + str(adata.X.shape[0]) +\
                ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
    ax = sns.scatterplot(x=r[:,0],y=r[:,1])
    plt.xlabel('UMAP1' , fontsize = 20 )
    plt.ylabel('UMAP2' , fontsize = 20 )
    plt.show()
    print( np.round( time.time() - t0,1) , ' seconds passed ' ) 

In [None]:
import scipy
import scipy.sparse
import umap  # imported here so the cell does not depend on an earlier cell having loaded it
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import time

# Colour a default-parameter UMAP embedding by several quantities, one panel each.
# Uses names defined in other cells: adata, mask, n_x_subplots,
# genes_processing_mode, str_data_inf.
t0 = time.time()
reducer = umap.UMAP()    
r = reducer.fit_transform(adata.X)

c = 0        # position of the current panel inside the current figure row
i,j = 0,1    # embedding columns drawn on the x and y axes
# Each entry is resolved below as: an adata.obs column, a gene in adata.var.index,
# 'median_binarize_<obs column>' or 'threshold_binarize_<value>_<obs column>'.
# Other options used before: 'cell_cycle_phase', 'PCA1ALL', 'threshold_binarize_5_PCA1ALL',
# 'PCNA', 'TOP2A', 'CCNE1', 'CCNE2', 'CDK2', 'CCNB1', 'CCNB2', 'CCNB3', 'CCNA2', 'pct_counts_mt'.
for color_by_mode in ['n.umi', 'E2F1', 'FOXM1']:
    if c % n_x_subplots == 0:
        # First panel, or the previous row is full: open a new figure.
        fig = plt.figure(figsize = (20,5) )
        # The panels show the UMAP embedding computed above (the label used to say 'PCA').
        plt.suptitle(str_data_inf + ' UMAP  n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' '  , fontsize = 20   )
        c = 0
    c += 1
    fig.add_subplot(1,n_x_subplots ,c)
    plt.title(str(color_by_mode) + ' UMAP ', fontsize = 20)

    # Resolve the colouring vector. color_by_mode is reset to None whenever
    # nothing usable is found, which selects the uncoloured plot below.
    color_by_field_name = color_by_mode
    if color_by_mode in adata.obs:
        color_by = (adata[mask].obs[color_by_field_name])
    elif color_by_mode in adata.var.index:
        I_gene = np.where(adata.var.index == color_by_mode)[0]
        v = adata[mask].X[:,I_gene]
        if scipy.sparse.issparse(v):
            v = v.toarray()
        # Flatten in the dense case too: a column slice is (n, 1) and cannot be used as hue.
        v = np.asarray(v).ravel()
        if genes_processing_mode == 'median_binarize_':
            median_loc = np.median( v  )
            color_by = v > median_loc
        elif genes_processing_mode == 'discretize_3bins_':
            # Three levels 0/1/2 split at the 33rd and 66th percentiles.
            t1 = np.percentile(v,33)    
            t2 = np.percentile(v,66)
            color_by = (v > t1).astype(int) + (v > t2).astype(int)
        else:
            color_by = v
    elif 'median_binarize_' in color_by_mode:
        # Strip the 16-character 'median_binarize_' prefix to get the obs column.
        color_by_field_name = color_by_mode[16:]
        if color_by_field_name in adata.obs:
            color_by = (adata[mask].obs[color_by_field_name]) > np.median( (adata.obs[color_by_field_name]) )
        else:
            color_by_mode = None
    elif 'threshold_binarize_' in color_by_mode:
        # Format: threshold_binarize_<threshold>_<obs column>
        color_by_field_name = color_by_mode.split('_')[3]
        threshold_binarize = float( color_by_mode.split('_')[2] )
        if color_by_field_name in adata.obs:
            color_by = (adata[mask].obs[color_by_field_name]) > threshold_binarize
        else:
            color_by_mode = None
    else:
        color_by_mode = None

    v1 = r[:,i]
    v2 = r[:,j]
    if color_by_mode is None:
        ax = sns.scatterplot(x=v1, y = v2)
    else:
        # Fixed three-colour palette only for a three-valued cell_cycle_phase.
        if color_by_field_name == 'cell_cycle_phase' and len(np.unique(color_by)) == 3:
            color_palette = ['red', 'green','blue' ]
        else:
            color_palette = "viridis"
        ax = sns.scatterplot(x=v1, y = v2,  hue= color_by, palette = color_palette )
        legend = ax.get_legend()
        if legend is not None:  # guard: get_legend() returns None when no legend was drawn
            plt.setp(legend.get_texts(), fontsize='20') # for legend text
            plt.setp(legend.get_title(), fontsize='20') # for legend title

plt.show()
print( np.round( time.time() - t0,1) , ' seconds passed ' ) 

## Scatter plots in gene axes (one gene vs. another)

In [None]:
import scipy.sparse

# Scatter plots of one gene against another, several panels per figure row.
# Uses names defined in other cells: adata, str_data_inf.
mask = np.ones( adata.X.shape[0]).astype(bool) # all-True mask: every row of adata is kept
n_x_subplots = 4

gene_pairs = [ ('E2F1','FOXM1'), ('PCNA','TOP2A'), ('CCNB1', 'CCNB2'), ('CCNE1', 'CCNB1'),
               ('E2F1','PCNA'),('FOXM1','PCNA'), ('E2F1','TOP2A'),('FOXM1','TOP2A'), ]


def _gene_values(adata_sel, gene):
    """Return the column(s) of adata_sel.X matching `gene` as a flat 1-D numpy array."""
    I_gene = np.where(adata_sel.var.index == gene)[0]
    v = adata_sel.X[:,I_gene]
    if scipy.sparse.issparse(v):
        v = v.toarray()
    # Flatten in the dense case too: a column slice is (n, 1).
    return np.asarray(v).ravel()


# The colouring does not depend on the gene pair, so resolve it once.
# Other options used before: 'cell_cycle_phase', 'PCA1ALL', 'threshold_binarize_5_PCA1ALL'.
color_by_mode = 'n.umi'
if color_by_mode in adata.obs:
    color_by_field_name = color_by_mode
    color_by = (adata[mask].obs[color_by_field_name])
else:
    color_by_mode = None  # fall back to an uncoloured scatter plot

c = 0  # position of the current panel inside the current figure row
for gene1,gene2 in gene_pairs:
    # A gene absent from adata.var.index would give an empty vector and a
    # length-mismatch error in the plot call; skip such pairs explicitly.
    missing = [g for g in (gene1, gene2) if g not in adata.var.index]
    if missing:
        print('skip', gene1, gene2, '- not found in adata.var.index:', missing)
        continue

    if c % n_x_subplots == 0:
        # First panel, or the previous row is full: open a new figure.
        fig = plt.figure(figsize = (20,6) )
        plt.suptitle(str_data_inf + ' Genes axes   n_cells: ' + str(adata.X.shape[0]) +\
            ' n_genes: ' + str(adata.X.shape[1])  + ' color by '+ str(color_by_mode) , fontsize = 20   )
        c = 0

    c += 1
    ax = fig.add_subplot(1,n_x_subplots ,c)
    ax.set_title(gene1 + ' ' + gene2)

    v1 = _gene_values(adata[mask], gene1)
    v2 = _gene_values(adata[mask], gene2)

    if color_by_mode is None:
        sns.scatterplot(x=v1, y=v2, ax=ax)
    else:
        # legend=False: the per-panel legend was always hidden; the colouring
        # variable is named once in the figure suptitle instead.
        sns.scatterplot(x=v1, y=v2, hue=color_by, legend=False, ax=ax)

    ax.set_xlabel(gene1,  fontsize = 20)
    ax.set_ylabel(gene2,  fontsize = 20)
plt.show()  

In [None]:
# Total wall-clock time since t0start (set in the imports cell at the top of the notebook).
# The clock is read once so the three figures describe the same instant.
elapsed = time.time() - t0start
print(np.round(elapsed,1),  np.round( elapsed/60,1),  np.round( elapsed/3600,1), 
      'seconds, minutes, hours  passed')
