# Explore genes - TF

The steps include:

* calculation of the TF-target distances 
* TF-target top-50 table (compared with SCENIC)
* TF annotation on UMAP-plots
* extraction of gene modules based on TF-target
* cross-species target-set intersection of homologous TFs


In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import scanpy as sc
from scipy import sparse
from scipy.special import softmax

import networkx as nx
import torch

In [2]:
import sys
sys.path.append('../')

import came
from came import pipeline, pp, pl

Using backend: pytorch


## Load CAME results

In [3]:
# Directory holding the saved CAME results
# (an alternative result directory is kept commented out below)
came_resdir = Path("../_case_res/testis-('testis_human', 'testis_mouse')(08-01 00.53.55)")
# came_resdir = Path("../_temp/('Baron_human', 'Baron_mouse')-(07-15 23.57.51)")

# Restore the data pair, the model, and the predictor stored as 'predictor.json'
dpair, model = came.load_dpair_and_model(came_resdir)
predictor = came.Predictor.load(came_resdir / 'predictor.json')

# Feature dict (requested with scale=True) and the heterogeneous cell-gene graph
feat_dict = dpair.get_feature_dict(scale=True)
g = dpair.get_whole_net()

# Sample ids of the reference (1) and query (2) datasets
obs_ids1 = dpair.obs_ids1
obs_ids2 = dpair.obs_ids2
# Gene ids of the reference (1) and query (2) datasets
var_ids1 = dpair.var_ids1
var_ids2 = dpair.var_ids2

# Class labels stored on the predictor
classes = predictor.classes

[*] Setting dataset names:
	0-->testis_human
	1-->testis_mouse
[*] Setting aligned features for observation nodes (self._features)
[*] Setting un-aligned features (`self._ov_adjs`) for making links connecting observation and variable nodes
[*] Setting adjacent matrix connecting variables from these 2 datasets (`self._vv_adj`)
Index(['nGene', 'nUMI', 'orig.ident', 'percent.mito', 'indiv', 'CellType',
       'percent.x', 'percent.y', 'percent.autosome', 'GiniAll', 'GiniNon0',
       'cell_ontology_class'],
      dtype='object')
Index(['Dataset', 'CellType', 'nGene', 'nUMI', '%MT', '%ChrX', '%ChrY',
       '%Autosome', 'GiniAll', 'GiniNon0', 'GiniHVG', 'nReads', 'nUMIVsnGene',
       'nReadsVsnUMI', '%AllCells_sumL/all', '%AllCells_G1S/all',
       '%AllCells_S/all', '%AllCells_G2M/all', '%AllCells_M/all',
       '%AllCells_MG1/all', '%ActiveCyclingCells_sumL/all',
       '%ActiveCyclingCells_G1S/sumL', '%ActiveCyclingCells_S/sumL',
       '%ActiveCyclingCells_G2M/sumL', '%ActiveCyclingCe

## Compute / save / reload the hidden states

In [4]:
# all hidden states
# (accessed elsewhere in this notebook as hidden_list[i]['cell'] / hidden_list[i]['gene'])
hidden_list = came.model.get_all_hidden_states(model, feat_dict, g)

In [5]:
# shapes of the 'cell' and 'gene' hidden states in the first entry of `hidden_list`
hidden_list[0]['cell'].shape, hidden_list[0]['gene'].shape

((24891, 128), (883, 128))

Save the hidden states into an .h5 file:

In [5]:
# write `hidden_list` to 'hidden_list.h5' inside the result directory
came.save_hidden_states(hidden_list, came_resdir / 'hidden_list.h5')

Reload the hidden states from the .h5 file:

In [6]:
# rebind `hidden_list` from 'hidden_list.h5' inside the result directory
hidden_list = came.load_hidden_states(came_resdir / 'hidden_list.h5')
# hidden_list[0]

## Get TF information

In [6]:
# Folder containing the TF tables.
# NOTE(review): this is an absolute, user-specific path; it must be adapted
# before the notebook can run on another machine.
tfdir = Path('/Users/xingyan/Data/TF')

# Lists of TF names: the 'Symbol' column of each species' CSV table
human_tf = pd.read_csv(tfdir / 'fantomTFs-human.csv')['Symbol'].tolist()
mouse_tf = pd.read_csv(tfdir / 'fantomTFs-mouse.csv')['Symbol'].tolist()


In [10]:
# node-ids and names of TF
# NOTE(review): the second positional argument (0 / 1) is presumably the
# dataset index (0 = human, 1 = mouse) -- confirm against
# `get_vnode_ids_by_name`.
tfids1, tfnames1 = dpair.get_vnode_ids_by_name(human_tf, 0, rm_unseen=True)
tfids2, tfnames2 = dpair.get_vnode_ids_by_name(mouse_tf, 1, rm_unseen=True)


# number of TF names returned for each dataset
len(tfnames1), len(tfnames2)

# annotate TFs on gene UMAP

# tfnames1, tfnames1_

(22, 22)

In [15]:
# Index into `hidden_list`: element 1 of [0, -1], i.e. -1
ilayer = [0, -1][1]
# NOTE(review): `metric`, `algorithm` and `k` are assigned here but are not
# passed to `came.ana.tf_cross_knn` below, so they have no effect within this
# cell -- confirm whether they were meant to be forwarded as arguments.
metric = 'cosine'
algorithm='brute'
k = 30

# get hidden states
h_gene = hidden_list[ilayer]['gene']
# bare expression without assignment (left over from interactive inspection)
h_gene

# Two result tables, one per TF list (human_tf, mouse_tf)
df_tf_targets1, df_tf_targets2 = came.ana.tf_cross_knn(
        h_gene, dpair, human_tf, mouse_tf,
    )

In [16]:
df_tf_targets1

Unnamed: 0_level_0,TF_name,knn,knn_cross
TF_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EGR1,EGR1,"[EGR1, MT-ND2, SUB1, IFITM2, RPS5, RIF1, PABPC...","[Rpl14, Rarres2, Rp9, Hdac2, Rif1, Hnrnpa2b1, ..."
SOX5,SOX5,"[SOX5, YBX3, KIF2B, LCA5L, IZUMO4, SIRT2, MS4A...","[Sirt2, Enkur, Ccer1, Rnf32, Tmigd3, 1700086L1..."
CREB3L4,CREB3L4,"[CREB3L4, IFITM3, SERPING1, PPIG, HELLS, SSB, ...","[Hmgn1, 1700020N18Rik, Dnajc21, Fth1, Smc4, Cr..."
DMRTB1,DMRTB1,"[DMRTB1, CENPF, TBPL1, SON, SYCP2, SYCP1, SBNO...","[Vdac2, Malat1, Dnajc21, Hsp90aa1, Son, Thoc7,..."
DNAJC21,DNAJC21,"[DNAJC21, MT-ND2, DDTL, RPS3A, RPS8, SYCP2, RI...","[Rarres2, mt-Nd1, Rif1, Srp14, Rp9, Rdx, Hsp90..."
FOS,FOS,"[FOS, EGR1, IFITM2, SUB1, RPS5, RPL21, CCP110,...","[Hdac2, Rarres2, Rpl14, Rif1, Hnrnpm, Rps5, Eg..."
JUN,JUN,"[JUN, SRSF11, CKS2, RPSA, RP9, ART3, ZC3H13, H...","[mt-Nd2, Hells, Prdx1, mt-Nd4, Sycp2, Psma7, G..."
JUNB,JUNB,"[JUNB, RB1CC1, IFITM3, IGFBP6, IGFBP7, HMGN1, ...","[Junb, Pcm1, Hmgn1, Rps2, Rb1cc1, Igfbp7, Rps1..."
KDM5B,KDM5B,"[KDM5B, DDT, ZBTB38, TBPL1, MNS1, SWI5, AHNAK,...","[Cenpj, Gsn, Kdm5b, Ddt, Swi5, Luc7l2, Zbtb38,..."
LYAR,LYAR,"[LYAR, TUBA3D, ART3, SETX, CKS2, TBPL1, COL1A2...","[Golga4, Lyar, Dnajc21, Acyp1, Sycp2, Tbpl1, H..."


In [17]:
df_tf_targets2

Unnamed: 0_level_0,TF_name,knn,knn_cross
TF_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Spz1,Spz1,"[Spz1, Spata18, Hook1, Akap4, Proca1, 1700009N...","[GSG1, SPATA18, TPPP2, C17orf107, AKAP1, CCSER..."
Sox5,Sox5,"[Sox5, Tekt2, Lyzl6, BC048502, Gtsf1l, Tsga8, ...","[SPACA4, SPACA3, ACRV1, FAM209A, SPATA31A6, MS..."
Egr1,Egr1,"[Egr1, Hsp90ab1, Hectd1, Rps2, Rps19, Atxn7l3b...","[SYCP2, HSP90AB1, PARP1, HSPA5, RPS2, BRD7, RP..."
Zc3h13,Zc3h13,"[Zc3h13, Rpl18a, Dazl, Cox6c, Brd7, Hspa8, Hel...","[MLLT10, PTMA, HDAC2, NASP, SMC3, SMC4, HNRNPD..."
Zbtb38,Zbtb38,"[Zbtb38, Luc7l2, Rarres2, Rif1, mt-Nd1, Hells,...","[PPIG, ZBTB38, U2SURP, MT-CO1, GSN, ATXN7L3B, ..."
Ybx1,Ybx1,"[Ybx1, Spata1, Phospho2, Adam2, 1110004E09Rik,...","[C15orf48, KIF9, SPESP1, LRRC34, MNS1, RPL39L,..."
Tsc22d4,Tsc22d4,"[Tsc22d4, Pcm1, Hmgn1, Hmgb1, Smc4, Pnn, Sptbn...","[PCM1, TSC22D4, B2M, DHX36, CALM1, JUNB, IFITM..."
Tfam,Tfam,"[Tfam, H2afb1, Prm1, Tmem239, 4930544D05Rik, P...","[AKAP1, CCDC91, CCDC179, GLUL, TPPP2, RP11-322..."
Tbpl1,Tbpl1,"[Tbpl1, Art3, Fbp1, Brd7, Tmem97, Rbm25, Wbp11...","[CDC42EP3, TBPL1, RBM25, ART3, HSP90AB1, SETX,..."
Smarca5,Smarca5,"[Smarca5, Mllt10, Rpl21, Golga4, Rps27a, Sycp2...","[DAZL, NMT2, SON, SRSF11, NAP1L1, MLLT10, RPL2..."


In [18]:
# plt.hist(knn_dists12.flatten())

In [19]:
# Read the human-to-mouse 1-to-1 gene-match table, keeping its first two columns
df_varmap = pd.read_csv(
    '../came/sample_data/gene_matches_1v1_human2mouse.csv'
).iloc[:, :2]
# Series indexed by the first column, holding the values of the second column
name_map = df_varmap.set_index(df_varmap.columns[0]).iloc[:, 0]

def intersect(set1, set2, name_map=None):
    """Return the elements shared by `set1` and `set2`.

    Parameters
    ----------
    set1, set2
        Iterables of hashable names.
    name_map
        Optional mapping translating names in `set1` to names in `set2`.
        Any object with a ``.get(key, default)`` method works (e.g. a dict
        or a pandas Series). Names of `set1` that have no entry in
        `name_map` are dropped before intersecting.

    Returns
    -------
    set
        The members of `set2` that are matched by the (mapped) `set1`.
    """
    if name_map is not None:
        set1 = {name_map.get(x, None) for x in set1}
        # Unmapped names come back as None; discard it so that a None in
        # `set2` is never reported as a shared element. `discard` (unlike
        # `remove`) does not raise when every name was mapped.
        set1.discard(None)
    return set(set1).intersection(set2)

In [20]:
# TF-regulons --> sets

# target-set-intersection cross species
# For every TF name of the first dataset, look up its counterpart in
# `name_map`; when that counterpart is also among the TF names of the second
# dataset, intersect the two 'knn' lists (names from the first dataset are
# translated through `name_map` inside `intersect`).
# The former `tf1 = tfnames1[0]` was dropped: the loop overwrote it at once,
# and it raised IndexError when `tfnames1` was empty.
for tf1 in tfnames1:
    tf2 = name_map.get(tf1, None)
    if tf2 in tfnames2:
        common = intersect(df_tf_targets1.loc[tf1, 'knn'], df_tf_targets2.loc[tf2, 'knn'], name_map)
    else:
        # no counterpart in `name_map`, or the counterpart is not in `tfnames2`
        common = None

    print(f'{(tf1, tf2)}:\t{common}')
# TF-centered modules




('EGR1', 'Egr1'):	{'Ncl', 'Sub1', 'Hsp90ab1', 'Hnrnph3', 'Atxn7l3b', 'Nmt2', 'mt-Nd1', 'Rif1', 'Egr1'}
('SOX5', 'Sox5'):	{'Enkur', 'Ccdc89', 'Lyzl6', '1700010I14Rik', 'Phf7', 'Gtsf1l', 'Sox5'}
('CREB3L4', 'Creb3l4'):	{'Creb3l4', 'Tsc22d4'}
('DMRTB1', 'Dmrtb1'):	{'Aprt', 'Golga4', 'Rb1cc1', 'Dmrtb1', 'Cox6c'}
('DNAJC21', 'Dnajc21'):	{'Tbpl1', 'Dnajc21', 'Son', 'mt-Nd2', 'Sycp2', 'Rif1'}
('FOS', 'Fos'):	{'Fkbp3', 'Fos', 'Atxn7l3b', 'Ccp110', 'Srp14', 'Rpl21', 'Rps5', 'Igfbp6', 'Rif1', 'Egr1'}
('JUN', 'Jun'):	{'Mphosph8', 'Rps21', 'Rbm39', 'Cks2', 'Jun', 'Rpsa', 'Hspa5', 'mt-Nd5', 'Rpl12'}
('JUNB', 'Junb'):	{'Rps21', 'Junb', 'Setx', 'Sbno1', 'Ik', 'Rb1cc1', 'Igfbp7'}
('KDM5B', 'Kdm5b'):	{'Kdm5b', 'Cenpj', 'Rarres2', 'Zbtb38'}
('LYAR', 'Lyar'):	{'Acyp1', 'Hells', 'Tuba3a', 'Lyar', 'Golga4', 'Art3', 'Sycp2'}
('MATR3', None):	None
('MESP1', 'Mesp1'):	{'Mesp1', 'Hells', 'mt-Nd2', '1700102P08Rik', 'Art3', 'Rif1'}
('MLLT10', 'Mllt10'):	{'Hnrnpm', 'Atxn7l3b', 'Rpl21', 'Mllt10', 'Rpl12'}
('PARP1'

In [87]:


# list the entries of the '../came/sample_data' folder
os.listdir('../came/sample_data')


['TF.rar',
 'gene_matches_1v1_human2mouse.csv',
 '.DS_Store',
 'raw-Baron_mouse.h5ad',
 'gene_matches_1v1_mouse2human.csv',
 'raw-Baron_human.h5ad',
 'gene_matches_mouse2human.csv',
 'gene_matches_human2mouse.csv']