## Import packages

In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import torch
sys.path.append('./')  # uncomment for local import
import tangram as tg
import copy
import anndata

In [2]:
# Notebook setup: load the autoreload extension and switch it to mode 2,
# render matplotlib figures inline, then display the imported Tangram version.
%load_ext autoreload
%autoreload 2
%matplotlib inline

tg.__version__

'1.0.3'

## Import data

### Filtered ISH

In [3]:
# Row labels for the ISH matrix (presumably gene symbols — confirm against the producing step).
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
gene_annot = np.load("/beegfs/home/pmatyskova/project/ishfstep_geneset_hm185cor_minres_filled0.npy",
                     allow_pickle=True)
# Filtered ISH matrix wrapped in a DataFrame whose rows are labelled by `gene_annot`.
ish = pd.DataFrame(
    np.load("/beegfs/home/pmatyskova/project/ishfstep_e_hm185cor_minres_filled0.npy"),
    index=gene_annot,
)

### Spatial data

In [4]:
# Flip the table so the former row labels (`gene_annot`) become the columns.
# NOTE: `ish` is overwritten with its own transpose, so running this cell a
# second time flips the orientation back.
ish = ish.T
# Wrap the table in an AnnData object: row labels go to `obs`, column labels to `var`.
ish_sp = sc.AnnData(
    ish,
    obs=ish.index.to_frame(),
    var=ish.columns.to_frame(),
)

### Single cell data

In [6]:
# Load the single-cell reference (hypoMap) from its .h5ad file into `hm_sc`.
hm_sc = sc.read_h5ad("/beegfs/scratch/bruening_scratch/lsteuernagel/data/hypoMap_publication/hypoMap.h5ad")

AnnData object with n_obs × n_vars = 384925 × 57362
    obs: 'Cell_ID', 'Dataset', 'SRA_ID', 'Sample_ID', 'GEO_ID', 'Run10x', 'Technology', 'Strain', 'Diet', 'Pooled', 'Age', 'Author_Region', 'inferred_sex', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'Author_Exclude', 'Author_Class', 'Author_CellType', 'percent_exclude_features', 'S.Score', 'G2M.Score', 'Phase', 'Batch_ID', 'Author_Condition', 'Sex', 'Author_Batch', 'Author_Class_Curated', 'C2', 'C7', 'C25', 'C66', 'C185', 'C286', 'C465', 'C2_named', 'C7_named', 'C25_named', 'C66_named', 'C185_named', 'C286_named', 'C465_named', 'Region_predicted', 'Region_summarized'
    var: 'features'
    uns: 'neighbors'
    obsm: 'X_scvi', 'X_umap_scvi'
    obsp: 'distances'

### Gene markers

#### DEG

In [None]:
# Rank genes per C185 cluster with a Wilcoxon test; the result is read back from
# hm_sc.uns["rank_genes_groups"] below.
sc.tl.rank_genes_groups(
    hm_sc,
    groupby="C185_named",
    use_raw=False,
    method='wilcoxon',
)
# Keep the first 100 ranked gene names of every cluster (other cut-offs tried: 50, 200).
markers_df = pd.DataFrame(hm_sc.uns["rank_genes_groups"]["names"]).head(100)
# Pool all clusters and de-duplicate into one marker list.
markers = list(np.unique(markers_df.melt()["value"].values))
len(markers)

#### MR approaches

In [7]:
# Candidate marker list produced by the MR step, loaded as a one-column-per-array-column frame.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
df_genes = pd.DataFrame(
    np.load("/beegfs/home/pmatyskova/project/mrmrstep_geneset_miss_hm185cor_minres_mrx31500.npy",
            allow_pickle=True)
)

# Genes that must not be used as markers (first column of a header-less CSV).
genes_to_exclude = pd.read_csv(
    '/beegfs/scratch/bruening_scratch/lsteuernagel/projects/analysis_projects/volumetric_analysis/genes_to_exclude.csv',
    header=None,
)
# Drop every row whose first column appears in the exclusion list.
df_genes = df_genes.loc[~df_genes.iloc[:, 0].isin(genes_to_exclude.iloc[:, 0])]

# Flatten the remaining entries and keep at most the first 1300 of them.
markers = list(df_genes.values.reshape(-1)[:1300])
len(markers)

1300

#### The rest of the gene marker pipeline

In [17]:
# Pre-process both AnnData objects for mapping, restricted to the selected markers.
# Per the log output, this stores `training_genes` / `overlap_genes` in `uns` of both
# objects and density priors in `obs` of the spatial object.
tg.pp_adatas(hm_sc, ish_sp, genes=markers)

INFO:root:1145 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:3574 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.


In [18]:
# Sanity check: after pre-processing, both objects must carry the same training-gene list.
assert hm_sc.uns['training_genes'] == ish_sp.uns['training_genes']

## Train the model

In [19]:
# Fit the mapping at cluster level, grouping single cells by their 'C185_named' label.
# NOTE(review): no random seed is passed here, so repeated runs may not reproduce the
# same mapping — confirm whether this Tangram version accepts a seed argument.
ad_map = tg.map_cells_to_space(adata_sc=hm_sc, adata_sp=ish_sp, mode='clusters', 
                               cluster_label = 'C185_named')

INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 1145 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.669, KL reg: 0.028
Score: 0.755, KL reg: 0.001
Score: 0.757, KL reg: 0.001
Score: 0.757, KL reg: 0.001
Score: 0.757, KL reg: 0.001
Score: 0.757, KL reg: 0.001
Score: 0.757, KL reg: 0.001
Score: 0.757, KL reg: 0.001
Score: 0.758, KL reg: 0.001
Score: 0.758, KL reg: 0.001


INFO:root:Saving results..


## Save the results

In [20]:
# Persist the mapping matrix, transposed relative to ad_map.X (np.save appends ".npy"
# because the file name carries no extension).
np.save("/beegfs/home/pmatyskova/project/d_tangram_hm185cor_minres_markers_filled0", ad_map.X.T)
# Persist the row (obs) and column (var) tables of the mapping as CSV text;
# the target file names are written without an extension.
ad_map.obs.to_csv(
    "/beegfs/home/pmatyskova/project/dobs_tangram_hm185cor_minres_markers_filled0"
)
ad_map.var.to_csv(
    "/beegfs/home/pmatyskova/project/dvar_tangram_hm185cor_minres_markers_filled0"
)

## Quantitative model evaluation

### Fit back

In [13]:
# Copy of the single-cell data restricted to its stored training genes.
hm_sct = hm_sc[:, hm_sc.uns['training_genes']].copy()

# Same restriction for the spatial data; `training_genes` ends up holding the
# spatial object's list, as before.
training_genes = ish_sp.uns['training_genes']
ish_spt = ish_sp[:, training_genes].copy()

In [21]:
# Project single-cell expression through the fitted mapping, grouping cells by cluster label.
ad_ge = tg.project_genes(
                  ad_map, 
                  hm_sc, # alternatives: hm_sc_neurons, or hm_sct for mode='constrained'
                  cluster_label='C185_named') # C286 or C185

In [22]:
# Compare the projected expression with the measured spatial data, gene by gene.
df_all_genes = tg.compare_spatial_geneexp(ad_ge, ish_sp, hm_sc) # alternatives: hm_sc_neurons or hm_sct

In [23]:
# Persist the per-gene comparison table as CSV.
df_all_genes.to_csv("/beegfs/home/pmatyskova/project/corr_tangram_hm185cor_minres_markers_filled0.csv") 

In [171]:
# NOTE(review): `full` is not defined in any cell visible in this notebook and the recorded
# execution count of this cell (171) is out of sequence — it will fail on a fresh kernel.
# Confirm where `full` comes from or remove the cell. The target name also ends in
# "_csv" rather than ".csv".
full.to_csv("/beegfs/home/pmatyskova/project/feval_tangram_hm185cor_minres_full_csv") 

### Ground truth evaluation

In [24]:
def annot_function(d_matrix, annot, ref, cluster_label='C185_named'):
    """Attach per-voxel annotations to the mapped cluster-by-voxel matrix.

    Written for the output of ``mode='clusters'``; for ``mode='constrained'``
    the original author passed an already tabular ``d`` instead.

    Parameters
    ----------
    d_matrix : AnnData-like
        Object exposing ``.X`` and ``.obs``. ``.X`` is transposed so that its
        columns become rows, and the resulting columns are labelled with
        ``d_matrix.obs[cluster_label]``.
    annot : pandas.DataFrame
        Annotation table with a ``'merge'`` column that holds the row position
        of the transposed matrix each annotation belongs to.
    ref : pandas.DataFrame
        Ground-truth table; only clusters listed in ``ref['cluster']`` are kept.
    cluster_label : str, optional
        Name of the ``obs`` column holding the cluster names (e.g. the C185 or
        C286 level). Defaults to ``'C185_named'``.

    Returns
    -------
    pandas.DataFrame
        One row per matched position: the kept cluster columns followed by the
        columns of ``annot`` (without the ``'merge'`` helper column).
    """
    # From AnnData to pandas: rows of the transposed matrix, one column per cluster.
    d = pd.DataFrame(np.transpose(d_matrix.X))
    d.columns = d_matrix.obs[cluster_label]
    # Only keep clusters for which a reference exists; copy so that adding the
    # helper column below does not write into a slice of the full table.
    d = d.loc[:, d.columns.isin(ref.loc[:, 'cluster'])].copy()
    d['merge'] = d.index  # row position, used as the join key
    # Join explicitly on the helper key instead of on every shared column name.
    d_annot = pd.merge(d, annot, on='merge')
    d_annot = d_annot.drop('merge', axis=1)

    return d_annot

In [25]:
def eval_function(d_matrix, annot, ref):
    """Score the mapping against the ground-truth region of every cluster.

    For each row of ``ref`` the mapped values of that row's cluster are summed
    over all annotated rows whose ``'name'`` contains the ground-truth region
    name. Substring matching is deliberate: it also picks up child regions,
    e.g. "Medial preoptic nucleus, central part" for the reference
    "Medial preoptic nucleus".

    Parameters
    ----------
    d_matrix : AnnData-like
        Mapping result, forwarded unchanged to ``annot_function``.
    annot : pandas.DataFrame
        Annotation table, forwarded to ``annot_function``; the annotated result
        must contain a ``'name'`` column.
    ref : pandas.DataFrame
        Ground truth with ``'cluster'`` and ``'Region_ground_truth'`` columns.
        It is not modified, and its index may be arbitrary.

    Returns
    -------
    pandas.DataFrame
        Copy of ``ref`` with an added ``'model_eval'`` column.

    Raises
    ------
    KeyError
        If a cluster listed in ``ref`` has no column in the annotated table.
    """
    d_ann = annot_function(d_matrix, annot, ref)
    region_names = d_ann['name']

    model_eval = []
    # Walk the reference rows by position so the result does not depend on
    # ``ref`` carrying a 0..n-1 index.
    for cluster, region in zip(ref['cluster'], ref['Region_ground_truth']):
        # Literal substring match over all annotated rows at once; rows without
        # a region name never match.
        in_region = region_names.str.contains(region, regex=False, na=False)
        model_eval.append(d_ann.loc[in_region, cluster].sum())

    copy_ref = ref.copy()
    copy_ref['model_eval'] = model_eval
    return copy_ref

In [26]:
# Annotation table: add the row position as a 'merge' key, then keep only the
# columns at positions 0, 1, 2, 7 and 14.
ann_hypnoSFO = (
    pd.read_csv('/beegfs/home/pmatyskova/project/ish_annot_hypnoSFO.csv')
    .assign(merge=lambda frame: frame.index)
    .iloc[:, [0, 1, 2, 7, 14]]
)

In [3]:
# Ground-truth region annotation for the C185 clusters (tab-separated);
# only the first two columns are kept.
gt_hm185 = pd.read_csv(
    '/beegfs/scratch/bruening_scratch/lsteuernagel/projects/analysis_projects/volumetric_analysis/hypoMap_region_annotation_withSpatial_C185.txt',
    sep="\t",
).iloc[:, :2]

In [4]:
# Keep the first entry per cluster, then renumber the rows 0..n-1
# (the evaluation functions look rows up by these integer labels).
gt_hm185 = gt_hm185.drop_duplicates(subset='cluster')
gt_hm185.index = np.arange(len(gt_hm185))

In [29]:
# Score the fitted mapping against the ground-truth regions (one 'model_eval' value per cluster).
final_eval = eval_function(ad_map, ann_hypnoSFO, gt_hm185) 

Unnamed: 0,cluster,Region_ground_truth,model_eval
0,C185-65: Unassigned.Mixed.GABA-2,Medial preoptic nucleus,0.016048
1,C185-71: Vip.Vipr2.GABA-2,Suprachiasmatic nucleus,0.195754
2,C185-72: Fam122b.Vipr2.GABA-2,Suprachiasmatic nucleus,0.006874
3,C185-73: Cck.Vipr2.GABA-2,Suprachiasmatic nucleus,0.023255
4,C185-11: Cbln2.Trh.GLU-2,Paraventricular hypothalamic nucleus,0.077735
...,...,...,...
63,C185-134: Frzb.Tanycytes,Arcuate hypothalamic nucleus,0.237568
64,C185-51: Tac2.GLU-5,Arcuate hypothalamic nucleus,0.062594
65,C185-61: Prkch.GLU-8,Lateral mammillary nucleus,0.178093
66,C185-64: Meis2.Mixed.GABA-2,Zona incerta,0.514649


## Permutation test

### Permutation test on voxel randomised prediction matrix

In [30]:
def voxelperm_annot_function(d_matrix, annot, ref, random_state=None, cluster_label='C185_named'):
    """Evaluate a voxel-permuted copy of the mapping against the ground truth.

    The rows of the transposed mapping matrix are shuffled and then re-labelled
    with the original row positions, so each annotation is paired with the
    values of a randomly chosen row. The shuffled table is scored exactly like
    the real one: for every row of ``ref`` the cluster's values are summed over
    all annotated rows whose ``'name'`` contains the ground-truth region name
    (substring match, so child regions count too).

    Parameters
    ----------
    d_matrix : AnnData-like
        Object exposing ``.X`` and ``.obs`` (output of ``mode='clusters'``).
    annot : pandas.DataFrame
        Annotation table with ``'merge'`` (row position) and ``'name'`` columns.
    ref : pandas.DataFrame
        Ground truth with ``'cluster'`` and ``'Region_ground_truth'`` columns.
        It is not modified, and its index may be arbitrary.
    random_state : optional
        Passed to ``DataFrame.sample``; set it to make the permutation
        reproducible. ``None`` (default) keeps the previous unseeded behaviour.
    cluster_label : str, optional
        Name of the ``obs`` column holding the cluster names. Defaults to
        ``'C185_named'``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``ref`` with an added ``'randmodel_eval'`` column.
    """
    # From AnnData to pandas: rows of the transposed matrix, one column per cluster.
    d = pd.DataFrame(np.transpose(d_matrix.X))
    d.columns = d_matrix.obs[cluster_label]
    d = d.loc[:, d.columns.isin(ref.loc[:, 'cluster'])]  # only clusters with a reference

    # Shuffle the row order, then restore the original labels so the shuffled
    # values are joined to annotations of other positions.
    d_perm = d.sample(frac=1, axis=0, random_state=random_state)
    d_perm.index = d.index

    d_perm['merge'] = d_perm.index  # row position, used as the join key
    # Join explicitly on the helper key instead of on every shared column name.
    d_annot = pd.merge(d_perm, annot, on='merge').drop('merge', axis=1)

    region_names = d_annot['name']
    model_eval = []
    # Walk the reference rows by position so the result does not depend on
    # ``ref`` carrying a 0..n-1 index.
    for cluster, region in zip(ref['cluster'], ref['Region_ground_truth']):
        # Literal substring match over all annotated rows at once; rows without
        # a region name never match.
        in_region = region_names.str.contains(region, regex=False, na=False)
        model_eval.append(d_annot.loc[in_region, cluster].sum())

    copy2_ref = ref.copy()
    copy2_ref['randmodel_eval'] = model_eval
    return copy2_ref

In [31]:
voxpermutation_iters = 200
# Seed NumPy's global generator (used by DataFrame.sample when no random_state is
# given) so the permutation baseline is reproducible across runs.
np.random.seed(0)

# Collect one Series of per-cluster scores per permutation and assemble the table
# once at the end, instead of writing columns into an empty frame through .iloc.
voxperm_scores = {}
for i in range(voxpermutation_iters):
    dvoxperm_evaluation = voxelperm_annot_function(ad_map, ann_hypnoSFO, gt_hm185) #gt_hm286 or gt_hm286n or gt_hm185
    voxperm_scores['x{}'.format(i + 1)] = dvoxperm_evaluation['randmodel_eval']
voxpermut_evals = pd.concat(voxperm_scores, axis=1)  # rows: clusters, columns: x1..xN

# Mean score over all permutations = expected score of a random voxel assignment.
voxpermut_eval_tot = voxpermut_evals.mean(axis=1)
final_eval['voxpermut_eval'] = voxpermut_eval_tot
final_eval

Unnamed: 0,cluster,Region_ground_truth,model_eval,voxpermut_eval
0,C185-65: Unassigned.Mixed.GABA-2,Medial preoptic nucleus,0.016048,0.012267
1,C185-71: Vip.Vipr2.GABA-2,Suprachiasmatic nucleus,0.195754,0.007443
2,C185-72: Fam122b.Vipr2.GABA-2,Suprachiasmatic nucleus,0.006874,0.006415
3,C185-73: Cck.Vipr2.GABA-2,Suprachiasmatic nucleus,0.023255,0.006155
4,C185-11: Cbln2.Trh.GLU-2,Paraventricular hypothalamic nucleus,0.077735,0.017306
...,...,...,...,...
63,C185-134: Frzb.Tanycytes,Arcuate hypothalamic nucleus,0.237568,0.012890
64,C185-51: Tac2.GLU-5,Arcuate hypothalamic nucleus,0.062594,0.013042
65,C185-61: Prkch.GLU-8,Lateral mammillary nucleus,0.178093,0.004280
66,C185-64: Meis2.Mixed.GABA-2,Zona incerta,0.514649,0.131358


In [33]:
# Persist the evaluation table (model score plus permutation baseline per cluster).
# NOTE(review): unlike the other outputs, this file name omits the "minres" tag — confirm it is intended.
final_eval.to_csv("/beegfs/home/pmatyskova/project/feval_tangram_hm185cor_markers_filled0.csv") 