In [None]:
import pickle
import random
from pathlib import Path

import infercnvpy as cnv
import matplotlib as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy as sp
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cut_tree

sc.set_figure_params(figsize=(4,4))
sc.set_figure_params(dpi=200)
sc.settings.n_jobs = 3 #nCores/CPUs for scanpy

# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#hpc figures
np.set_printoptions(precision = 4, suppress = True)
%matplotlib inline
plt.pyplot.figure(figsize=(10,3))
plt.pyplot.style.use('seaborn-whitegrid')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # print multiple outputs per code cell (not just last)

### Step 1: load data

In [None]:
# Read the NBAtlas count matrix (10x Genomics HDF5 file) into an AnnData object.
h5_path = "/scratch/gent/vo/000/gvo00027/projects/Single_Cell_Neuroblastoma/NBAtlas/01_Import_Preprocessing_NBAtlas/h5objects/nb_matrix_NBAtlas.h5"
adata = sc.read_10x_h5(filename=h5_path)
# Show the object summary as cell output.
adata

In [None]:
# Per-cell metadata table; the first CSV column is used as the row index.
meta_path = "/scratch/gent/vo/000/gvo00027/projects/Single_Cell_Neuroblastoma/NBAtlas/03c_post_scVI_R_plots/Tables/03c_post_scVI_R_plots_covSample_MetaDataForInferCNV.csv"
metaData = pd.read_csv(filepath_or_buffer=meta_path, index_col=0)

In [None]:
# Replace the cell annotations of the AnnData object with the metadata table read from CSV.
# NOTE(review): nothing in this cell reorders `metaData` to match the cells of the
# count matrix — confirm that the CSV rows are in the same order as the cells in
# the h5 file, otherwise annotations end up on the wrong cells.
adata.obs = metaData
# Make sure the cell identifiers are strings.
adata.obs.index = adata.obs.index.astype(str)

adata

In [None]:
#save raw counts reparately

adata.layers["counts"] = adata.X

In [None]:
#normalize

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
#find highly variable genes

sc.pp.highly_variable_genes(
    adata,
    n_top_genes=5000,
    subset=False,
    layer="counts",
    flavor="seurat",
    batch_key="Study"
)

In [None]:
# Principal component analysis, then the variance-ratio plot for the first 50 components

sc.tl.pca(data=adata)
sc.pl.pca_variance_ratio(adata=adata, n_pcs=50)

In [None]:
# Dimensionality reduction: neighbour graph (20 neighbours, 20 PCs) followed by a UMAP embedding

sc.pp.neighbors(adata, n_neighbors=20, n_pcs=20)
sc.tl.umap(adata, min_dist=0.3)


In [None]:
# Non-integrated UMAP, drawn twice side by side:
# left panel coloured by study, right panel coloured by cell annotation.

fig, (ax1, ax2) = plt.pyplot.subplots(
    nrows=1,
    ncols=2,
    figsize=(15, 5),
    gridspec_kw={"wspace": 1},
)

sc.pl.umap(adata, color=["Study"], frameon=True, show=False, ax=ax1)

sc.pl.umap(adata, color=["annot_NBN_iCNV"], ncols=2, show=False, ax=ax2)

### Step 2: annotate object with genomic locations

In [None]:
# Annotate the genes of adata with their genomic location, taken from the
# GENCODE v43 GTF; GTF records are matched to genes through their "gene_name".

cnv.io.genomic_position_from_gtf(
    adata=adata,
    gtf_file="/kyukon/scratch/gent/vo/000/gvo00027/projects/CBIGR/22VZZ_singlecellNB/DongJansky_Meta/data/annot_files_and_more/gencode.v43.annotation.gtf.gz",
    gtf_gene_id="gene_name",
)

### Step 3: run CNV inference

In [None]:
# Checkpoint of the preprocessed, position-annotated object.
# If the file already exists it is loaded as-is (same behaviour as before);
# otherwise the current adata is written first, so a fresh "Restart & Run All"
# no longer depends on a manually un-commented write.
prep_h5ad = Path("../../data/NBAtlas_h5ad/NBAtlas_prep_infercnv.h5ad")
if not prep_h5ad.exists():
    prep_h5ad.parent.mkdir(parents=True, exist_ok=True)
    adata.write(prep_h5ad)
adata = sc.read(prep_h5ad)
adata

In [None]:
# Number of cells per annotated cell type

adata.obs.loc[:, "annot_NBN_iCNV"].value_counts()

In [None]:
# Cell-type labels offered to cnv.tl.infercnv as reference categories

ref_cells = [
    "T/NK cell",
    "Myeloid",
    "B cell",
    "Endothelial",
    "Stromal other",
    "pDC",
    "Plasma",
]

In [None]:
# List the distinct chromosome labels, to decide which ones to leave out of the analysis

set(adata.var["chromosome"])

In [None]:
# Chromosome labels passed to infercnv as `exclude_chromosomes`:
# missing labels (NaN) and "chrM".
chr_to_exclude = [np.nan, "chrM"]

In [None]:
# Distinct study labels, processed one by one below.
# Series.unique() works whether "Study" is categorical or a plain string column;
# the previous `.values.unique()` only exists when the column is categorical.
groups = adata.obs["Study"].unique()

In [None]:
# Run inferCNV separately for every study and store each result in its own h5ad file.
cnv_results_dir = "/kyukon/scratch/gent/vo/000/gvo00027/projects/CBIGR/22VZZ_singlecellNB/metaanalysis_atlas/data/NBAtlas_h5ad/CNV_results/"

for study in groups: #or the set with all of the studies in case you want that

    # Work on an independent copy: infercnv and pca store their results on the
    # object, which a view of adata can only do through an implicit copy.
    data = adata[adata.obs["Study"] == study, :].copy()

    # Keep only the reference cell types that actually occur in this study.
    annotations_present = set(data.obs["annot_NBN_iCNV"].unique())
    ref_cells_touse = [cell_type for cell_type in ref_cells if cell_type in annotations_present]
    if not ref_cells_touse:
        # Without any reference cells there is no baseline to compare against.
        raise ValueError(
            f"Study {study!r} contains none of the reference cell types {ref_cells}"
        )

    print(study, data.shape)
    cnv.tl.infercnv(
        data,
        reference_key="annot_NBN_iCNV",
        reference_cat=ref_cells_touse,
        exclude_chromosomes=chr_to_exclude,
        window_size=250,
        n_jobs=1,
        step=1,
    )
    cnv.tl.pca(data)

    write_dest = cnv_results_dir + study + "_CNVresults.h5ad"
    data.write(write_dest)

In [None]:
# Combine the per-study CNV profiles and their respective PCAs into one object.
# All result files are read first and concatenated in a single call, instead of
# re-concatenating the growing object once per study.
cnv_results_dir = "/kyukon/scratch/gent/vo/000/gvo00027/projects/CBIGR/22VZZ_singlecellNB/metaanalysis_atlas/data/NBAtlas_h5ad/CNV_results/"

per_study_results = [
    sc.read(cnv_results_dir + study + "_CNVresults.h5ad") for study in groups
]
cnv_data = sc.concat(adatas=per_study_results)
del per_study_results  # the individual objects are no longer needed

cnv_data

In [None]:
# Transfer the CNV results onto the integrated adata.
# cnv_data was assembled study by study, so its rows are not necessarily in the
# same order as the cells of adata. Look up, for every cell of adata, its row in
# cnv_data, and copy the results in that order.
cnv_row_for_cell = cnv_data.obs_names.get_indexer(adata.obs_names)
if (cnv_row_for_cell < 0).any():
    raise ValueError("Some cells of adata are missing from the combined CNV results")

adata.obsm["X_cnv"] = cnv_data.obsm["X_cnv"][cnv_row_for_cell]
adata.obsm["X_cnv_pca"] = cnv_data.obsm["X_cnv_pca"][cnv_row_for_cell]
# This line used to read from `cnv_clust`, a name that is never defined in this
# notebook (NameError on a fresh kernel).
# NOTE(review): cnv_data.obs is assumed to be the intended source — confirm.
adata.obs["annot_NBN_iCNV"] = pd.Categorical(
    cnv_data.obs["annot_NBN_iCNV"].iloc[cnv_row_for_cell]
)
adata

In [None]:
# Also insert the chromosome positions (based on the genes) under adata.uns["cnv"].
# NOTE(review): these offsets are hard-coded; presumably they were copied from the
# uns["cnv"] of a per-study result — confirm they match the current X_cnv.

chr_pos = {
    "chr1": 0,
    "chr10": 11356,
    "chr11": 12328,
    "chr12": 13608,
    "chr13": 14825,
    "chr14": 15389,
    "chr15": 16223,
    "chr16": 16953,
    "chr17": 17893,
    "chr18": 19207,
    "chr19": 19560,
    "chr2": 2361,
    "chr20": 21053,
    "chr21": 21699,
    "chr22": 21972,
    "chr3": 3928,
    "chr4": 5329,
    "chr5": 6259,
    "chr6": 7355,
    "chr7": 8587,
    "chr8": 9703,
    "chr9": 10488,
    "chrX": 22547,
    "chrY": 23421,
}
adata.uns["cnv"] = {"chr_pos": chr_pos}

In [None]:
# Finally, save the integrated atlas together with the inferCNVpy results
write_dest = '/kyukon/scratch/gent/vo/000/gvo00027/projects/CBIGR/22VZZ_singlecellNB/metaanalysis_atlas/data/NBAtlas_h5ad/CNV_results/NBAtlas_Int_CNVresults_VZZ.h5ad'
adata.write(filename=write_dest)

In [None]:
# CNV heatmap with cells grouped by "Sample"; saved to file instead of shown, no dendrogram.
cnv.pl.chromosome_heatmap(
    adata,
    groupby="Sample",
    dendrogram=False,
    show=False,
    save="_CNVperstudy_All.png",
)