Requires high memory (at least 150 GB).

# Setup

In [None]:
import scanpy as sc
import infercnvpy as cnv
import matplotlib as plt
import random
import pandas as pd
import numpy as np
import scipy as sp
import os
import sys
from datetime import datetime

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#hpc figures
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # print multiple outputs

In [None]:
# Figure defaults: 4x4 figure size, dpi for display and dpi_save for saved files
sc.set_figure_params(figsize=(4, 4), dpi=100, dpi_save=300)

In [None]:
nCores = 8 # number of cores supplied to this job
sc.settings.n_jobs = nCores # scanpy's n_jobs setting follows the supplied core count

In [None]:
def set_seed(seed):
    """Seed the global Python and NumPy random generators.

    The seed is also written, as a string, to the ``PYTHONHASHSEED``
    environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)


set_seed(123)

In [None]:
# `!cd` runs in a throw-away subshell and leaves the kernel's working directory
# untouched, so the relative output paths used below would resolve elsewhere.
# Change directory from Python instead.
os.chdir("/scratch/gent/vo/000/gvo00027/projects/Single_Cell_Neuroblastoma/NBAtlas/03e_post_scVI_inferCNV_NBAtlas")
os.getcwd()

In [None]:
output_h5objects = "h5objects/"
output_tables = "Tables/"
output_figures = "Figures/"

In [None]:
os.makedirs(output_tables, exist_ok=True)
os.makedirs(output_tables, exist_ok=True)
os.makedirs(output_figures, exist_ok=True)

In [None]:
sc.settings.figdir = output_figures # scanpy's figure directory is set to the Figures/ output folder

In [None]:
sc.settings.verbosity = 4 # NOTE(review): presumably scanpy's most verbose (debug) logging level - confirm

In [None]:
# mem check
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Map every public, non-module, non-IPython global name to its shallow size
# in bytes (sys.getsizeof), ordered from largest to smallest.
mem = dict(
    sorted(
        (
            (name, sys.getsizeof(globals().get(name)))
            for name in dir()
            if not (
                name.startswith("_")
                or name in sys.modules
                or name in ipython_vars
            )
        ),
        key=lambda entry: entry[1],
        reverse=True,
    )
)
mem

# Total atlas

In [None]:
# Load the AnnData object (file name indicates the inferCNV run on the total atlas)
adata = sc.read("/scratch/gent/vo/000/gvo00027/projects/Single_Cell_Neuroblastoma/NBAtlas/03e_post_scVI_inferCNV_NBAtlas/h5objects/03e_post_scVI_inferCNV_NBAtlas_inferCNV_onTotal_VZZ.h5ad")

In [None]:
adata

In [None]:
# Metadata table; the first CSV column becomes the index
metadata = pd.read_csv("/scratch/gent/vo/000/gvo00027/projects/Single_Cell_Neuroblastoma/NBAtlas/03c_post_scVI_R_plots/Tables/03c_post_scVI_R_plots_covSample_MetaDataForInferCNV.csv", 
                       index_col=0)
metadata

In [None]:
# NOTE(review): `.values` assigns by position, not by index - this assumes the metadata
# rows are in exactly the same order as adata.obs; confirm
adata.obs['annot_NBN_iCNV'] = metadata['annot_NBN_iCNV'].values

In [None]:
cnv.pl.chromosome_heatmap(adata, groupby="annot_NBN_iCNV", cmap = 'seismic', save = "03e_post_scVI_inferCNV_totalRun_GroupByAnnotiCNV.svg")

In [None]:
cnv.tl.pca(adata)
cnv.pp.neighbors(adata)
cnv.tl.leiden(adata)

In [None]:
cnv.tl.umap(adata)
cnv.tl.cnv_score(adata)

In [None]:
# Save the object after the PCA / neighbours / Leiden / UMAP / CNV-score steps above
adata.write(output_h5objects + "03e_post_scVI_inferCNV_NBAtlas_iCNVresults_VZZ_totalRun_NBN_NBAtlas.h5ad")

In [None]:
# Reload the file written in the previous cell (same path)
adata = sc.read(output_h5objects + "03e_post_scVI_inferCNV_NBAtlas_iCNVresults_VZZ_totalRun_NBN_NBAtlas.h5ad")

In [None]:
cnv.pl.umap(adata, 
            color="cnv_score",
            sort_order = True, #default
            #color_map = 'magma',
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_totalRun_UMAP_CNVScore_NBAtlas.png"
           )

In [None]:
cnv.pl.umap(adata, 
            color="annot_NBN_iCNV",
            save = "03e_post_scVI_inferCNV_totalRun_UMAP_annotiCNV_NBAtlas.png"
           )

In [None]:
# export metadata
to_export = adata.obs[["cnv_leiden", "cnv_score"]]
#to_export = to_export.concatenate(adata.obsm['cnv_umap'])
umap_df = pd.DataFrame(adata.obsm['X_cnv_umap'], columns=['CNV_UMAP1', 'CNV_UMAP2'], index=adata.obs_names)
to_export = pd.concat([to_export, umap_df], axis=1)
to_export

In [None]:
to_export.to_csv(output_tables + "03e_post_scVI_inferCNV_inferCNVpy_totalRun_NBN_NBAtlas.csv")

# Chr score

In [None]:
# Load the PerStudyMerged object; this replaces the total-run `adata` used above
adata = sc.read(output_h5objects + "03e_post_scVI_inferCNV_NBAtlas_iCNVresults_VZZ_PerStudyMerged_NBN_NBAtlas.h5ad")

In [None]:
adata.obsm["X_cnv"].shape #[0]: cells, [1]: windows

In [None]:
# Per-chromosome positions; used below as the start column of each chromosome in X_cnv
adata.uns['cnv']['chr_pos']

In [None]:
chr_pos = list(dict.values(adata.uns['cnv']['chr_pos']))
chr_order = list(dict.keys(adata.uns['cnv']['chr_pos']))
chr_order.index('chr7')

In [None]:
chr_intervals = list(zip(chr_pos , chr_pos[1:] + [adata.obsm["X_cnv"].shape[1]])) #make list of chromosome intervals for infercnv windows
chr_intervals

In [None]:
# select chr7
# Look the chromosome up by name instead of a hard-coded list index: a fixed index
# silently selects a different chromosome whenever the key order or the set of
# chromosomes in chr_pos changes.
chr7_start = adata.uns['cnv']['chr_pos']['chr7']
# chr7 ends where the next chromosome (by position) starts, or at the last X_cnv column
chr7_end = min(
    (pos for pos in adata.uns['cnv']['chr_pos'].values() if pos > chr7_start),
    default=adata.obsm["X_cnv"].shape[1],
)
chr7_pos = (chr7_start, chr7_end)
chr7_pos[0]
chr7_pos[1]

In [None]:
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata

In [None]:
tmp_adata.shape

In [None]:
tmp_adata = tmp_adata[:, chr7_pos[0]:chr7_pos[1]]

In [None]:
#score per cell
chr7_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1) #np.mean # take X because tmp_adata
chr7_score_per_cell

In [None]:
adata.obs["chr7_score_per_cell"] = chr7_score_per_cell

In [None]:
cnv.pl.umap(adata, 
            color="chr7_score_per_cell",
            sort_order = True, #default
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr7ScorePerCell_NBAtlas.png"
           )

In [None]:
cnv.pl.umap(adata, 
            color="annot_NBN_iCNV")

## Chromosome arm score

In [None]:
# import from R
# Source of the positions: http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
centromeres = pd.read_csv('/data/gent/vo/000/gvo00027/SingleCell10X/cellrangerref/Gencode_v43_VZZ/Gencode_v43_cytoband_centromere_positions.csv', sep=",", header=None) # header=None: columns are addressed by integer label (0, 1, ...) below
centromeres # c1: chromosome, c2: start, c3: end, c4: cytoband, c5: chr with arm

In [None]:
# number of genes per chromosome
adata.var['chromosome'].value_counts()

In [None]:
# add chromosome arm to "chr_arm"
# Vectorised: one centromere lookup for all genes instead of an iterrows() loop that
# re-scans the centromere table and performs a .loc write for every single gene.
# 0: chr, 1: centromere pos -> the first listed position per chromosome is used
centromere_pos = centromeres.dropna(subset=[0]).drop_duplicates(subset=[0]).set_index(0)[1]

gene_chr = adata.var['chromosome'].astype(object)
gene_centromere = gene_chr.map(centromere_pos)  # NaN where the chromosome is not in the table
has_centromere = gene_chr.isin(centromere_pos.index)

# gene start before the centromere position -> p arm, otherwise q arm
arm_suffix = np.where(adata.var['start'].to_numpy() < gene_centromere.to_numpy(), 'p', 'q')
# genes on chromosomes without a centromere entry get NaN
adata.var['chr_arm'] = gene_chr.astype(str).str.cat(arm_suffix).where(has_centromere, np.nan)

adata.var["chr_arm"]

In [None]:
# NOTE(review): the filter matches gene symbols against `gene_ids` - presumably that
# column holds symbols in this object; confirm
adata.var.loc[adata.var['gene_ids'].isin(["MYCN","SOX11","RRM2","TP53","IGF2BP1"]), 'chr_arm'] #check some known genes

In [None]:
# number of genes per chromosome arm
adata.var["chr_arm"].value_counts()

In [None]:
chr_arm_pos = adata.var["chr_arm"].value_counts()

In [None]:
from natsort import index_natsorted
chr_arm_pos = chr_arm_pos.iloc[index_natsorted(chr_arm_pos.index)]
chr_arm_pos = chr_arm_pos.cumsum() # take cumsum to get positions
chr_arm_pos #these are endpoints
# chr arms not to use (<100): chr18p, chr21p, chr22p

In [None]:
adata.var["chromosome_old"] = adata.var["chromosome"]  # keep a copy of the chromosome column

chr_arm_pos_endpoint = chr_arm_pos

# Every arm starts where the previous arm ends; the first arm starts at 0.
# shift(..., fill_value=0) fills the first slot directly. This replaces a positional
# `series[0] = 0` write on a label-indexed Series (deprecated in recent pandas) and
# avoids the intermediate float/NaN column.
chr_arm_pos_startpoint = chr_arm_pos_endpoint.shift(1, fill_value=0).astype(int)
chr_arm_pos_startpoint

### 17q

In [None]:
selected_chr_arm = 'chr17q'

chr_arm_pos_startpoint[selected_chr_arm] # startpoint at endpoint of 17p
chr_arm_pos_endpoint[selected_chr_arm] #endpoint

In [None]:
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata

In [None]:
tmp_adata.shape

In [None]:
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

In [None]:
# tmp_adata already contains only the selected arm's columns; applying the absolute
# start/end offsets to it a second time would address the wrong (typically empty)
# column range, so simply display the subset.
tmp_adata

In [None]:
#score per cell
chr17q_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1)
chr17q_score_per_cell

In [None]:
adata.obs["chr17q_score_per_cell"] = chr17q_score_per_cell

In [None]:
cnv.pl.umap(adata, 
            color="chr17q_score_per_cell",
            sort_order = True, #default
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_PerStudyMerged_NE_UMAP_Chr17qScorePerCell_NBAtlas.png"
           )

### 1p

In [None]:
#1p
selected_chr_arm = 'chr1p'

In [None]:
#don't modify
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

chr_arm_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1) #np.mean # take X because tmp_adata

In [None]:
#1p
adata.obs["chr1p_score_per_cell"] = chr_arm_score_per_cell

In [None]:
cnv.pl.umap(adata, 
            color="chr1p_score_per_cell",
            sort_order = True, #default,
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr1pScorePerCell_NBAtlas.png"
           )

### 2p gain

In [None]:
selected_chr_arm = 'chr2p'

In [None]:
#don't modify
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

chr_arm_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1) #np.mean # take X because tmp_adata

In [None]:
#2p
adata.obs["chr2p_score_per_cell"] = chr_arm_score_per_cell
adata.obs["chr2p_score_per_cell_mean"] = chr_arm_score_per_cell_mean

In [None]:
cnv.pl.umap(adata, 
            color="chr2p_score_per_cell",
            sort_order = True, #default,
            vmax = 'p99',
            vmin = 0,
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr2pScorePerCell_NBAtlas.png"
           )

### 3p loss

In [None]:
selected_chr_arm = 'chr3p'

In [None]:
#don't modify
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

chr_arm_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1) #np.mean # take X because tmp_adata

In [None]:
#3p
adata.obs["chr3p_score_per_cell"] = chr_arm_score_per_cell
adata.obs["chr3p_score_per_cell_mean"] = chr_arm_score_per_cell_mean

In [None]:
cnv.pl.umap(adata, 
            color="chr3p_score_per_cell",
            sort_order = True, #default,
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr3pScorePerCell_NBAtlas.png"
           )

### 4p loss

In [None]:
selected_chr_arm = 'chr4p'

In [None]:
#don't modify
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

chr_arm_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1) #np.mean # take X because tmp_adata

In [None]:
#4p
adata.obs["chr4p_score_per_cell"] = chr_arm_score_per_cell
adata.obs["chr4p_score_per_cell_mean"] = chr_arm_score_per_cell_mean

In [None]:
cnv.pl.umap(adata, 
            color="chr4p_score_per_cell",
            sort_order = True, #default,
            vmax = 'p99',
            #vmin = 0,
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr4pScorePerCell_NBAtlas.png"
           )

### 11q loss

In [None]:
selected_chr_arm = 'chr11q'

In [None]:
#don't modify
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

chr_arm_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1)

In [None]:
#11q
adata.obs["chr11q_score_per_cell"] = chr_arm_score_per_cell
adata.obs["chr11q_score_per_cell_mean"] = chr_arm_score_per_cell_mean

In [None]:
cnv.pl.umap(adata, 
            color="chr11q_score_per_cell",
            sort_order = True, #default,
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr11qScorePerCell_NBAtlas.png"
           )

### 14q loss

In [None]:
selected_chr_arm = 'chr14q'

In [None]:
#don't modify
use_rep = 'cnv'
tmp_adata = sc.AnnData(X=adata.obsm[f"X_{use_rep}"], obs=adata.obs, uns=adata.uns) # extract cnv only obsm -> becomes adata.X
tmp_adata = tmp_adata[:, chr_arm_pos_startpoint[selected_chr_arm]:chr_arm_pos_endpoint[selected_chr_arm]]

chr_arm_score_per_cell = np.mean(np.abs(tmp_adata.X), axis = 1) #np.mean # take X because tmp_adata

In [None]:
#14q
adata.obs["chr14q_score_per_cell"] = chr_arm_score_per_cell
adata.obs["chr14q_score_per_cell_mean"] = chr_arm_score_per_cell_mean

In [None]:
cnv.pl.umap(adata, 
            color="chr14q_score_per_cell",
            sort_order = True, #default,
            vmax = 'p99',
            save = "03e_post_scVI_inferCNV_PerStudyMerged_UMAP_Chr14qScorePerCell_NBAtlas.png"
           )

### Save

In [None]:
# Save the object, now carrying the per-chromosome / per-arm score columns added to obs above
adata.write(output_h5objects + "03e_post_scVI_inferCNV_NBAtlas_iCNVresults_VZZ_PerStudyMerged_NBN_CNVscores_NBAtlas.h5ad")

### Export

In [None]:
# export metadata
to_export = adata.obs[["cnv_leiden","cnv_score","cnv_score_per_cell","chr7_score_per_cell","chr17q_score_per_cell","chr1p_score_per_cell","chr2p_score_per_cell","chr3p_score_per_cell","chr4p_score_per_cell","chr11q_score_per_cell","chr14q_score_per_cell"]]
#to_export = to_export.concatenate(adata.obsm['cnv_umap'])
umap_df = pd.DataFrame(adata.obsm['X_cnv_umap'], columns=['CNV_UMAP1', 'CNV_UMAP2'], index=adata.obs_names)
to_export = pd.concat([to_export, umap_df], axis=1)

to_export

In [None]:
to_export.to_csv(output_tables + "03e_post_scVI_inferCNV_inferCNVpy_PerStudyMergedResults_ChrScores_NBN_NBAtlas.csv")