In [1]:
### Load libraries
import os
import sem.es as es
import numpy as np
import pandas as pd
import datetime

In [2]:
# Input locations: the Tabula Muris UMI count matrix and its per-cell metadata table.
file_metadata = "/scratch/data-for_fast_access/pub-others/tabula_muris_180920/tabula_muris.metadata.csv"
file_data = "/scratch/data-for_fast_access/pub-others/tabula_muris_180920/tabula_muris.umi.csv.gz"

# Per-cell metadata table.
df_metadata = pd.read_csv(file_metadata)
# Count matrix: this is the slow step (12-14 min for tabula_muris with pandas).
df_data = pd.read_csv(file_data)

# Mapping MGI to Ensembl gene names

In [6]:
# Index the count matrix by the 'gene' column so rows can be relabelled below.
df_counts = df_data.set_index('gene')

# Gene-name -> Ensembl lookup table; sep=None asks the python engine to detect the delimiter.
gene_name_mapping = pd.read_csv(
    "/projects/timshel/sc-genetics/sc-genetics/data/gene_annotations/Mus_musculus.GRCm38.90.gene_name_version2ensembl.txt.gz",
    sep=None,
    engine='python',
)

# Lower-case the symbols on both sides so the match is case-insensitive.
gene_name_mapping["gene_name_optimal"] = gene_name_mapping["gene_name_optimal"].map(str.lower)
df_counts.index = df_counts.index.map(str.lower)

# Count how many of the dataset's genes are present in the mapping table (True) or absent (False).
in_mapping_bool = df_counts.index.isin(gene_name_mapping["gene_name_optimal"])
print('How many genes can be mapped from MGI name to Ensembl mouse names?')
print(pd.Series(in_mapping_bool).value_counts())

How many genes can be mapped from MGI name to Ensembl mouse names?
True     20874
False     2467
dtype: int64


In [7]:
# Lookup from lower-cased gene symbol to Ensembl gene ID; if a symbol occurs more than
# once in the table, the last occurrence wins.
mapping_dict = dict(
    zip(gene_name_mapping["gene_name_optimal"], gene_name_mapping["ensembl_gene_id"].values)
)
# Relabel the rows with Ensembl IDs. Symbols missing from the lookup presumably end up
# with a missing label (the blank row labels in the preview) -- they are not dropped here.
df_counts.index = df_counts.index.map(mapping_dict)
df_counts.head(10)

Unnamed: 0_level_0,A1.B000126.3_39_F.1.1,A1.B003283.3_38_F.1.1,A1.MAA000435.3_10_M.1.1,A1.MAA000549.3_8_M.1.1,A1.MAA000614.3_10_M.1.1,A1.MAA000938.3_8_M.1.1,A10.B003283.3_38_F.1.1,A11.B000126.3_39_F.1.1,A12.B000126.3_39_F.1.1,A12.B003283.3_38_F.1.1,...,O6.MAA001847.3_39_F.1.1,O7.MAA001847.3_39_F.1.1,P2.MAA001847.3_39_F.1.1,P3.MAA000839.3_11_M.1.1,P4.MAA000526.3_9_M.1.1,P4.MAA000839.3_11_M.1.1,P5.MAA000526.3_9_M.1.1,P5.MAA001847.3_39_F.1.1,P6.MAA001847.3_39_F.1.1,P9.MAA001847.3_39_F.1.1
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000109644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
,265,1026,0,48,26,87,36,245,0,245,...,74,0,164,0,51,315,1,0,193,0
,1,35,4,0,186,2,0,0,0,0,...,72,4,7,0,0,0,0,15,0,33
,0,0,0,0,0,0,0,0,0,0,...,11,0,0,0,0,3,0,1,0,0
,0,0,0,10,0,0,110,0,0,0,...,0,0,0,0,0,0,0,0,53,0
,30,0,24,0,0,0,0,2,0,76,...,0,0,51,0,72,0,0,31,8,0
,0,247,0,0,165,0,0,0,0,0,...,0,37,0,0,3,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000007777,0,63,0,56,0,15,64,16,0,0,...,0,36,0,0,0,0,0,62,40,60


In [8]:
## Inspect metadata: table shape and the number of distinct 'tissue_celltype' labels
## (the executed output of this cell reports 115 of them).
# N.B. the current implementation only accepts 1 by n_cells numpy arrays for the annotation,
# so the metadata rows have to be matched to the cell IDs (columns) of the counts dataframe.

print(df_metadata.shape)
# Despite its name, n_cell_types holds the per-cell labels as strings; the count is derived below.
n_cell_types = df_metadata["tissue_celltype"].astype(str)
print(len(np.unique(n_cell_types)))
df_metadata.head()

(44949, 8)
115


Unnamed: 0,cell_id,nGene,nReads,tissue,subtissue_clean,celltype,tissue_celltype,tissue_subtissue_celltype
0,A1.B000126.3_39_F.1.1,3125,599257,Skin,Telogen,epidermal_cell,Skin.epidermal_cell,Skin.Telogen.epidermal_cell
1,A1.B003283.3_38_F.1.1,5543,2585048,Skin,Telogen,epidermal_cell,Skin.epidermal_cell,Skin.Telogen.epidermal_cell
2,A1.MAA000435.3_10_M.1.1,5023,1748535,Skin,Anagen,basal_cell_of_epidermis,Skin.basal_cell_of_epidermis,Skin.Anagen.basal_cell_of_epidermis
3,A1.MAA000549.3_8_M.1.1,3846,309793,Skin,Anagen,epidermal_cell,Skin.epidermal_cell,Skin.Anagen.epidermal_cell
4,A1.MAA000614.3_10_M.1.1,3249,1044110,Skin,Telogen,basal_cell_of_epidermis,Skin.basal_cell_of_epidermis,Skin.Telogen.basal_cell_of_epidermis


# Removing Brain_Non-Myeloid cells (neurons and the other non-myeloid brain cell types)

In [9]:
# Drop every cell whose tissue is 'Brain_Non-Myeloid' from both the metadata and the counts.
keep_cell = df_metadata["tissue"] != 'Brain_Non-Myeloid'
no_neuron_meta = df_metadata[keep_cell]

# Select the count columns by cell_id label instead of by a positional boolean mask.
# The positional mask silently mislabels cells if the metadata rows and the count columns
# are not in the same order; label selection keeps counts and annotation aligned and
# raises a KeyError if a metadata cell_id has no matching column.
no_neuron_counts = df_counts.loc[:, no_neuron_meta["cell_id"].values]
sample = no_neuron_counts.columns.values[:10]

# Annotation vector (one 'tissue_celltype' string per retained cell, in the same order
# as the columns of no_neuron_counts).
anno_str = np.array(no_neuron_meta['tissue_celltype']).astype(str)
print(anno_str)

['Skin.epidermal_cell' 'Skin.epidermal_cell'
 'Skin.basal_cell_of_epidermis' ... 'Lung.epithelial_cell_of_lung'
 'Lung.epithelial_cell_of_lung' 'Lung.epithelial_cell_of_lung']


In [10]:
# Sorted, de-duplicated 'tissue_celltype' labels that remain after removing Brain_Non-Myeloid.
# Later cells use this array to select columns from previously computed ESmu tables.
non_neurons = np.unique(no_neuron_meta["tissue_celltype"])
print(non_neurons)

['Bladder.bladder_cell' 'Bladder.bladder_urothelial_cell'
 'Brain_Myeloid.macrophage' 'Brain_Myeloid.microglial_cell' 'Fat.B_cell'
 'Fat.T_cell' 'Fat.endothelial_cell'
 'Fat.mesenchymal_stem_cell_of_adipose' 'Fat.myeloid_cell'
 'Fat.natural_killer_cell' 'Fat.unknown_cell_type'
 'Heart.cardiac_muscle_cell' 'Heart.endocardial_cell'
 'Heart.endothelial_cell' 'Heart.erythrocyte' 'Heart.fibroblast'
 'Heart.leukocyte' 'Heart.myofibroblast_cell'
 'Heart.professional_antigen_presenting_cell' 'Heart.smooth_muscle_cell'
 'Heart.unknown_cell_type' 'Kidney.endothelial_cell'
 'Kidney.epithelial_cell_of_proximal_tubule'
 'Kidney.kidney_collecting_duct_epithelial_cell' 'Kidney.leukocyte'
 'Kidney.macrophage'
 'Large_Intestine.Brush_cell_of_epithelium_proper_of_large_intestine'
 'Large_Intestine.enterocyte_of_epithelium_of_large_intestine'
 'Large_Intestine.enteroendocrine_cell'
 'Large_Intestine.epithelial_cell_of_large_intestine'
 'Large_Intestine.large_intestine_goblet_cell' 'Limb_Muscle.B_cell'
 '

In [11]:
# Tissues still present after the filter (sorted, unique).
np.unique(no_neuron_meta["tissue"])

array(['Bladder', 'Brain_Myeloid', 'Fat', 'Heart', 'Kidney',
       'Large_Intestine', 'Limb_Muscle', 'Liver', 'Lung', 'Mammary_Gland',
       'Marrow', 'Pancreas', 'Skin', 'Spleen', 'Thymus', 'Tongue',
       'Trachea'], dtype=object)

In [12]:
## Create machine
# Builds the es Machine from the filtered count matrix and prints wall-clock timestamps
# before and after construction so the cost of this step is visible in the output.
# N.B. default args for es.object.Machine(preprocess=True)
# NOTE(review): the preview shows float values where the input held integer counts, so the
# default preprocessing presumably normalises/transforms the counts -- confirm in sem.es.
print("Creating Machine ...")
print("    ", datetime.datetime.now().time())
machine = es.object.Machine(no_neuron_counts)
print("    ", datetime.datetime.now().time())
# Shape of the matrix held by the machine after construction.
print("    ", machine.df.shape)
machine.df.head()

Creating Machine ...
     20:04:04.694745
     20:05:57.283243
     (22947, 41548)


Unnamed: 0_level_0,A1.B000126.3_39_F.1.1,A1.B003283.3_38_F.1.1,A1.MAA000435.3_10_M.1.1,A1.MAA000549.3_8_M.1.1,A1.MAA000614.3_10_M.1.1,A1.MAA000938.3_8_M.1.1,A10.B003283.3_38_F.1.1,A11.B000126.3_39_F.1.1,A12.B000126.3_39_F.1.1,A12.B003283.3_38_F.1.1,...,O6.MAA001847.3_39_F.1.1,O7.MAA001847.3_39_F.1.1,P2.MAA001847.3_39_F.1.1,P3.MAA000839.3_11_M.1.1,P4.MAA000526.3_9_M.1.1,P4.MAA000839.3_11_M.1.1,P5.MAA000526.3_9_M.1.1,P5.MAA001847.3_39_F.1.1,P6.MAA001847.3_39_F.1.1,P9.MAA001847.3_39_F.1.1
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000109644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,1.690491,1.603214,0.0,0.935867,0.222356,1.365487,0.260079,1.869196,0.0,1.172834,...,0.850037,0.0,1.409904,0.0,0.850122,1.517171,0.114043,0.0,1.450732,0.0
,0.01655,0.12698,0.022619,0.0,1.022962,0.064918,0.0,0.0,0.0,0.0,...,0.83444,0.082718,0.124099,0.0,0.0,0.0,0.0,0.237875,0.0,0.441302
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.181613,0.0,0.0,0.0,0.0,0.033336,0.0,0.017745,0.0,0.0
,0.0,0.0,0.0,0.279748,0.0,0.0,0.645847,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.640245,0.0


In [13]:
### Add annotation
# N.B. default args for add_annotation(run_anova=True, map_genes=True, verbose=False)
# map_genes=True is passed explicitly here: the verbose log of this run reports
# "Mapping gene id's to ortholog gene id's" and "Removed 8199 unmapped genes".
# NOTE(review): an earlier note said the mapping is hard-coded to a specific gene list and
# that map_genes should therefore be False -- confirm which behaviour is intended.

# Key under which the annotation is registered; result tables are named 'cell_type.*'.
name_anno = "cell_type"

print("Adding Annotation ...")
print("    ", datetime.datetime.now().time())
machine.add_annotation(name_anno, anno_str, map_genes=True, verbose=True)
print("    ", datetime.datetime.now().time())
# Shape after annotation (rows shrink when unmapped genes are removed).
print("    ", machine.df.shape)

Adding Annotation ...
     20:06:01.506308
Mapping gene id's to ortholog gene id's ...
/nfsdata/projects/alegbe/sc-genetics/src/benchmark
Removed 8199 unmapped genes ...
0 pct unmapped genes
     20:09:46.762650
     (14205, 41548)


In [14]:
### Compute ESw, ESw* and ESmu
# compute() defaults, as recorded in the original notes:
#   compute(self, annotations: list=None, esms: list={"ges", "si", "ss", "tstat", "zstw"},
#           verbose: bool=False, compute_meta: bool=False)
# Without an explicit `esms` the implementation runs every ES metric, so the set is
# spelled out here (the metrics reportedly used in BMI brain; "zstw" is left out).
# compute_meta=True is set as the notes recommend; in this run the log then shows
# "Computing esw_mu ..." and a 'cell_type.esmu' table among the computed results.

esm_list = ["ges", "si", "ss", "tstat"]

print("Computing ESws ...")
print("    ", datetime.datetime.now().time())
machine.compute(
    annotations=[name_anno],
    esms=set(esm_list),
    compute_meta=True,
    verbose=True,
)
print("    ", datetime.datetime.now().time())

Computing ESws ...
     20:10:07.207682
Computing TSTAT ...
Computing FDR ...
Computing esw_s ...
Computing GES ...
Computing FDR ...
Computing esw_s ...
Computing SS ...
Computing FDR ...
Computing esw_s ...
Computing SI ...
Computing FDR ...
Computing esw_s ...
Computing esw_mu ...
Computed ['cell_type.tstat.esw', 'cell_type.tstat.esw_null', 'cell_type.tstat.pvals', 'cell_type.tstat.qvals', 'cell_type.tstat.esw_s', 'cell_type.ges.esw', 'cell_type.ges.esw_null', 'cell_type.ges.pvals', 'cell_type.ges.qvals', 'cell_type.ges.esw_s', 'cell_type.ss.esw', 'cell_type.ss.esw_null', 'cell_type.ss.pvals', 'cell_type.ss.qvals', 'cell_type.ss.esw_s', 'cell_type.si.esw', 'cell_type.si.esw_null', 'cell_type.si.pvals', 'cell_type.si.qvals', 'cell_type.si.esw_s', 'cell_type.esmu'] ...
     20:12:08.073937


# Inspect and save results

In [15]:
### Sanity check: preview the first rows of the ESmu table (genes x cell types).
esmu_preview = machine.metrics["cell_type.esmu"].head()
esmu_preview

Unnamed: 0_level_0,Bladder.bladder_cell,Bladder.bladder_urothelial_cell,Brain_Myeloid.macrophage,Brain_Myeloid.microglial_cell,Fat.B_cell,Fat.T_cell,Fat.endothelial_cell,Fat.mesenchymal_stem_cell_of_adipose,Fat.myeloid_cell,Fat.natural_killer_cell,...,Spleen.macrophage,Thymus.DN1_thymic_pro-T_cell,Thymus.immature_T_cell,Thymus.leukocyte,Tongue.basal_cell_of_epidermis,Tongue.keratinocyte,Trachea.blood_cell,Trachea.endothelial_cell,Trachea.epithelial_cell,Trachea.mesenchymal_cell
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000081791,0.074908,0.0,0.0,0.057586,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.338069,0.002024,0.0,0.0,0.0,0.0
ENSG00000162929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136251,0.0,0.0,...,0.0,0.0,0.0,0.0,0.042104,0.0,0.0,0.066347,0.0,0.245316
ENSG00000168887,0.0,0.005577,0.0,0.033071,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.185867,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000162384,0.367245,0.198995,0.0,0.0,0.0,0.0,0.0,0.220827,0.0,0.0,...,0.0,0.0,0.0,0.0,0.051145,0.115346,0.0,0.0,0.0,0.13446
ENSG00000154274,0.0,0.701247,0.387731,0.886106,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
### Save everything in machine.metrics
# machine.metrics is a dictionary (metric name -> DataFrame) that holds the results.
# results include esw, esw_null, pvals, qvals and ESmu.
print("Saving results to disk ...")
current_date = '190801'
# Use a dedicated label for the output directory. The original reassigned `name_anno`,
# which is the annotation key passed to add_annotation()/compute(); overwriting it meant
# that re-running those cells afterwards would use "no_neurons" as the annotation name.
out_label = "no_neurons"
dir_path = "out_{}".format(out_label)
os.makedirs(dir_path, exist_ok=True) # make dir if it doesn't already exist

### Save results: one gzipped CSV per metric, named <metric>.<date>.mapped.csv.gz
for m, df in machine.metrics.items():
    fp = "{}/{}.{}.mapped.csv.gz".format(dir_path, m, current_date)
    print(fp)
    df.to_csv(fp, compression="gzip")

Saving results to disk ...
out_no_neurons/cell_type.tstat.esw.190801.mapped.csv.gz
out_no_neurons/cell_type.tstat.esw_null.190801.mapped.csv.gz
out_no_neurons/cell_type.tstat.pvals.190801.mapped.csv.gz
out_no_neurons/cell_type.tstat.qvals.190801.mapped.csv.gz
out_no_neurons/cell_type.tstat.esw_s.190801.mapped.csv.gz
out_no_neurons/cell_type.ges.esw.190801.mapped.csv.gz
out_no_neurons/cell_type.ges.esw_null.190801.mapped.csv.gz
out_no_neurons/cell_type.ges.pvals.190801.mapped.csv.gz
out_no_neurons/cell_type.ges.qvals.190801.mapped.csv.gz
out_no_neurons/cell_type.ges.esw_s.190801.mapped.csv.gz
out_no_neurons/cell_type.ss.esw.190801.mapped.csv.gz
out_no_neurons/cell_type.ss.esw_null.190801.mapped.csv.gz
out_no_neurons/cell_type.ss.pvals.190801.mapped.csv.gz
out_no_neurons/cell_type.ss.qvals.190801.mapped.csv.gz
out_no_neurons/cell_type.ss.esw_s.190801.mapped.csv.gz
out_no_neurons/cell_type.si.esw.190801.mapped.csv.gz
out_no_neurons/cell_type.si.esw_null.190801.mapped.csv.gz
out_no_neurons

# CELLECT ESmu

In [17]:
# Binary ESmu gene sets from the freshly computed metrics: every non-zero ESmu entry
# becomes 1, and only those (annotation, gene, 1) rows are written.
es_metric = 'ESmu'
binary_or_cont = 'binary'
name_of_dataset = 'tabula_muris-no_neuron'
bench_date = 'benchmark' + current_date

esmu_df = (
    machine.metrics["cell_type.esmu"]
    .where(lambda frame: frame == 0, other=1)  # zeros stay 0, every other entry becomes 1
    .rename_axis('gene')
    .reset_index()
)
# Wide (gene x annotation) -> long (one row per gene/annotation pair).
esmu_long_df = esmu_df.melt(id_vars=['gene'], var_name='annotation', value_name='specificity')
is_specific = esmu_long_df['specificity'] > 0
multi_geneset_celltypes = esmu_long_df.loc[is_specific, ['annotation', 'gene', 'specificity']]

# Tab-separated, no header row, no index column.
geneset_path = '../../data/benchmark_multigenesets/multi_geneset.{}.{}.{}.{}.txt'.format(name_of_dataset, es_metric, binary_or_cont, bench_date)
multi_geneset_celltypes.to_csv(geneset_path, header=None, index=False, sep='\t')
multi_geneset_celltypes.head()

In [None]:
# Continuous ESmu gene sets from the freshly computed metrics: raw ESmu values, keeping
# only entries > 0. Relies on name_of_dataset and bench_date already being set in the kernel.
es_metric = 'ESmu'
binary_or_cont = 'continuous'

esmu_df = (
    machine.metrics["cell_type.esmu"]
    .copy()
    .rename_axis('gene')
    .reset_index()
)
# Wide (gene x annotation) -> long (one row per gene/annotation pair).
esmu_long_df = esmu_df.melt(id_vars=['gene'], var_name='annotation', value_name='specificity')
is_specific = esmu_long_df['specificity'] > 0
multi_geneset_celltypes = esmu_long_df.loc[is_specific, ['annotation', 'gene', 'specificity']]

# Tab-separated, no header row, no index column.
geneset_path = '../../data/benchmark_multigenesets/multi_geneset.{}.{}.{}.{}.txt'.format(name_of_dataset, es_metric, binary_or_cont, bench_date)
multi_geneset_celltypes.to_csv(geneset_path, header=None, index=False, sep='\t')
multi_geneset_celltypes.head()

In [None]:
# Squared ESmu gene sets from the freshly computed metrics: ESmu**2, keeping only
# entries > 0. Relies on name_of_dataset and bench_date already being set in the kernel.
es_metric = 'ESmu'
binary_or_cont = 'continuous-squared'

esmu_df = (
    machine.metrics["cell_type.esmu"]
    .copy()
    .rename_axis('gene')
    .pow(2)  # square while genes are still the index, so only numeric columns are affected
    .reset_index()
)
# Wide (gene x annotation) -> long (one row per gene/annotation pair).
esmu_long_df = esmu_df.melt(id_vars=['gene'], var_name='annotation', value_name='specificity')
is_specific = esmu_long_df['specificity'] > 0
multi_geneset_celltypes = esmu_long_df.loc[is_specific, ['annotation', 'gene', 'specificity']]

# Tab-separated, no header row, no index column.
geneset_path = '../../data/benchmark_multigenesets/multi_geneset.{}.{}.{}.{}.txt'.format(name_of_dataset, es_metric, binary_or_cont, bench_date)
multi_geneset_celltypes.to_csv(geneset_path, header=None, index=False, sep='\t')
multi_geneset_celltypes.head()

In [18]:
# Background gene set: every unique gene label in machine.df, all with specificity 1,
# under a single annotation name derived from the dataset name.
# NOTE(review): this file is written to the current directory, unlike the per-cell-type
# gene sets, which go to ../../data/benchmark_multigenesets/ -- confirm this is intended.
all_genes_label = 'all_genes_in_dataset.{}'.format(name_of_dataset)
multi_geneset_all_genes = pd.DataFrame(
    data={
        'annotation': all_genes_label,
        "gene": np.unique(machine.df.index),
        'specificity': 1,
    }
)
all_genes_path = './multi_geneset.all_genes_in_dataset.{}.txt'.format(name_of_dataset)
multi_geneset_all_genes.to_csv(all_genes_path, header=None, index=False, sep='\t')

# Load in existing CELLEX data and then drop the neurons

In [20]:
# From here on the gene sets come from a previously saved ESmu table rather than from
# `machine`; name_of_dataset is switched so the output files get a distinct name.
name_of_dataset = 'tabula_muris-no_neuron-post_CELLEX'
esmu_df = pd.read_csv('out_relevance/cell_type.esmu.190722.mapped.csv.gz', index_col=0)

# Shape restricted to the retained cell types, then the shape of the full table.
print(esmu_df.loc[:, non_neurons].shape)
print(esmu_df.shape)
esmu_df.head()

(14302, 108)
(14302, 115)


Unnamed: 0_level_0,Bladder.bladder_cell,Bladder.bladder_urothelial_cell,Brain_Myeloid.macrophage,Brain_Myeloid.microglial_cell,Brain_Non-Myeloid.Bergmann_glial_cell,Brain_Non-Myeloid.astrocyte,Brain_Non-Myeloid.brain_pericyte,Brain_Non-Myeloid.endothelial_cell,Brain_Non-Myeloid.neuron,Brain_Non-Myeloid.oligodendrocyte,...,Spleen.macrophage,Thymus.DN1_thymic_pro-T_cell,Thymus.immature_T_cell,Thymus.leukocyte,Tongue.basal_cell_of_epidermis,Tongue.keratinocyte,Trachea.blood_cell,Trachea.endothelial_cell,Trachea.epithelial_cell,Trachea.mesenchymal_cell
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000081791,0.02842,0.0,0.0,0.043131,0.0,0.021647,0.147595,0.377622,0.039707,0.24004,...,0.0,0.0,0.0,0.0,0.331036,0.0,0.0,0.0,0.0,0.0
ENSG00000162929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407688,0.219948,...,0.0,0.0,0.0,0.0,0.038015,0.0,0.0,0.049549,0.0,0.213655
ENSG00000168887,0.0,0.0,0.0,0.056487,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.192736,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000162384,0.34127,0.175932,0.0,0.0,0.0,0.0,0.0,0.252456,0.0,0.0,...,0.0,0.0,0.0,0.0,0.060192,0.104673,0.0,0.0,0.0,0.110991
ENSG00000154274,0.0,0.679072,0.422476,0.900619,0.359992,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Binary gene sets from the saved ESmu table, restricted to the retained cell types:
# every non-zero entry becomes 1. Relies on name_of_dataset and bench_date from earlier cells.
es_metric = 'ESmu'
binary_or_cont = 'binary'

esmu_df = (
    pd.read_csv('out_relevance/cell_type.esmu.190722.mapped.csv.gz', index_col=0)
    .loc[:, non_neurons]
    .where(lambda frame: frame == 0, other=1)  # zeros stay 0, every other entry becomes 1
    .rename_axis('gene')
    .reset_index()
)
# Wide (gene x annotation) -> long (one row per gene/annotation pair).
esmu_long_df = esmu_df.melt(id_vars=['gene'], var_name='annotation', value_name='specificity')
is_specific = esmu_long_df['specificity'] > 0
multi_geneset_celltypes = esmu_long_df.loc[is_specific, ['annotation', 'gene', 'specificity']]

# Tab-separated, no header row, no index column.
geneset_path = '../../data/benchmark_multigenesets/multi_geneset.{}.{}.{}.{}.txt'.format(name_of_dataset, es_metric, binary_or_cont, bench_date)
multi_geneset_celltypes.to_csv(geneset_path, header=None, index=False, sep='\t')
multi_geneset_celltypes.head()

Unnamed: 0,annotation,gene,specificity
0,Bladder.bladder_cell,ENSG00000081791,1.0
3,Bladder.bladder_cell,ENSG00000162384,1.0
6,Bladder.bladder_cell,ENSG00000110696,1.0
10,Bladder.bladder_cell,ENSG00000137720,1.0
14,Bladder.bladder_cell,ENSG00000149179,1.0


In [22]:
# Continuous gene sets from the saved ESmu table, restricted to the retained cell types:
# raw ESmu values, keeping only entries > 0. Relies on name_of_dataset and bench_date
# from earlier cells.
es_metric = 'ESmu'
binary_or_cont = 'continuous'

esmu_df = (
    pd.read_csv('out_relevance/cell_type.esmu.190722.mapped.csv.gz', index_col=0)
    .loc[:, non_neurons]
    .rename_axis('gene')
    .reset_index()
)
# Wide (gene x annotation) -> long (one row per gene/annotation pair).
esmu_long_df = esmu_df.melt(id_vars=['gene'], var_name='annotation', value_name='specificity')
is_specific = esmu_long_df['specificity'] > 0
multi_geneset_celltypes = esmu_long_df.loc[is_specific, ['annotation', 'gene', 'specificity']]

# Tab-separated, no header row, no index column.
geneset_path = '../../data/benchmark_multigenesets/multi_geneset.{}.{}.{}.{}.txt'.format(name_of_dataset, es_metric, binary_or_cont, bench_date)
multi_geneset_celltypes.to_csv(geneset_path, header=None, index=False, sep='\t')
multi_geneset_celltypes.head()

Unnamed: 0,annotation,gene,specificity
0,Bladder.bladder_cell,ENSG00000081791,0.02842
3,Bladder.bladder_cell,ENSG00000162384,0.34127
6,Bladder.bladder_cell,ENSG00000110696,0.031943
10,Bladder.bladder_cell,ENSG00000137720,0.046799
14,Bladder.bladder_cell,ENSG00000149179,0.218278


In [23]:
# Squared gene sets from the saved ESmu table, restricted to the retained cell types:
# ESmu**2, keeping only entries > 0. Relies on name_of_dataset and bench_date from
# earlier cells.
es_metric = 'ESmu'
binary_or_cont = 'continuous-squared'

esmu_df = (
    pd.read_csv('out_relevance/cell_type.esmu.190722.mapped.csv.gz', index_col=0)
    .loc[:, non_neurons]
    .rename_axis('gene')
    .pow(2)  # square while genes are still the index, so only numeric columns are affected
    .reset_index()
)
# Wide (gene x annotation) -> long (one row per gene/annotation pair).
esmu_long_df = esmu_df.melt(id_vars=['gene'], var_name='annotation', value_name='specificity')
is_specific = esmu_long_df['specificity'] > 0
multi_geneset_celltypes = esmu_long_df.loc[is_specific, ['annotation', 'gene', 'specificity']]

# Tab-separated, no header row, no index column.
geneset_path = '../../data/benchmark_multigenesets/multi_geneset.{}.{}.{}.{}.txt'.format(name_of_dataset, es_metric, binary_or_cont, bench_date)
multi_geneset_celltypes.to_csv(geneset_path, header=None, index=False, sep='\t')
multi_geneset_celltypes.head()

Unnamed: 0,annotation,gene,specificity
0,Bladder.bladder_cell,ENSG00000081791,0.000808
3,Bladder.bladder_cell,ENSG00000162384,0.116465
6,Bladder.bladder_cell,ENSG00000110696,0.00102
10,Bladder.bladder_cell,ENSG00000137720,0.00219
14,Bladder.bladder_cell,ENSG00000149179,0.047645
