In [16]:
import pandas as pd
import glob
import os
import csv
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

try:
    code_dir
except NameError:
    code_dir = os.getcwd()
    source_dir = code_dir.replace("codes_local", "0_sources")
    out_dir = code_dir.replace("codes_local", "1_ChIP_peak_distribution_RNAseqChIPseqJaccardIndex")

In [2]:
gene_list_file = source_dir + "/gene_list.csv"
gene_list_df = pd.read_csv(gene_list_file)
print(set(gene_list_df['labels']))

chip_anno_dir = '/media/pipkin/Rocket2/T_Cell_ChIP/202012_ChIP/3_peak_annotations'
chip_anno_files = glob.glob("%s/*annoDf.csv"%chip_anno_dir)
chip_names = [x.split("/")[-1].replace("_annoDf.csv", "") for x in chip_anno_files]

out_file = out_dir + "/ChIP_peak_chisq"
out_sum_pval_file = out_dir + "/sum_ChIP_peak_chisq.csv"

{'sc_P6_cl13_up', 'sc_d8_cl13_up', 'd5_KLRG1lo_Runx3_repressed', 'sc_P5_arm_up', 'd5_KLRG1hi_all', 'nascent_Il2_promoted', 'sc_P4_arm_up', 'sc_P3_arm_up', 'sc_P2_arm_up', 'nascent_48h_up', 'sc_P7_arm_up', 'd5_KLRG1lo_all', 'sc_all', 'd5_KLRG1hi_Runx3_repressed', 'nascent_48h_dn', 'sc_P3_cl13_up', 'd5_KLRG1hi_Runx3_promoted', 'sc_P7_cl13_up', 'd5_KLRG1lo_Runx3_promoted', 'sc_P9_cl13_up', 'sc_d8_arm_up', 'nascent_all', 'sc_d5_cl13_up', 'sc_P9_arm_up', 'sc_P6_arm_up', 'sc_P2_cl13_up', 'sc_d5_arm_up', 'nascent_Il2_repressed', 'sc_P5_cl13_up', 'sc_P4_cl13_up'}


In [3]:
chip_df = pd.DataFrame({"ChIP": chip_names})
chip_genes_n = []
for chip_i_file in chip_anno_files:
    chip_i_df = pd.read_csv(chip_i_file)
    chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
    chip_genes_n.append(len(chip_i_genes))
chip_df["ChIP gene #"] = chip_genes_n

In [26]:
jaccard_df = chip_df.copy()
for gene_list_label in set(gene_list_df['labels']):
    print(gene_list_label)
    genes_df = pd.DataFrame()
    gene_set = set(gene_list_df[gene_list_df['labels'] == gene_list_label]['gene_names'].tolist())
    jaccard_list = []
    overlap_list = []
    for chip_i_file in chip_anno_files:
        chip_i_name = chip_i_file.split("/")[-1].replace("_annoDf.csv", "")
        chip_i_df = pd.read_csv(chip_i_file)
        chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
        
        intersect_genes = gene_set & chip_i_genes
        union_genes = gene_set | chip_i_genes
        
        jaccard_list.append(len(intersect_genes)/ len(union_genes))
        genes_df[chip_i_name] = pd.Series(list(intersect_genes))
        overlap_list.append("|".join(list(intersect_genes)))
    jaccard_df[gene_list_label] = ["{:.4f}".format(x) for x in jaccard_list]
    genes_df.to_csv("%s/%s_RNAseq_ChIPseq_interseqGenes.csv"%(out_dir, gene_list_label), index=False)
jaccard_df.to_csv("%s/RNAseq_ChIPseq_Jaccard.csv"%out_dir, index=False)

sc_P6_cl13_up
sc_d8_cl13_up
d5_KLRG1lo_Runx3_repressed
sc_P5_arm_up
d5_KLRG1hi_all
nascent_Il2_promoted
sc_P4_arm_up


  genes_df[chip_i_name] = pd.Series(list(intersect_genes))


sc_P3_arm_up
sc_P2_arm_up
nascent_48h_up
sc_P7_arm_up
d5_KLRG1lo_all
sc_all
d5_KLRG1hi_Runx3_repressed
nascent_48h_dn
sc_P3_cl13_up
d5_KLRG1hi_Runx3_promoted
sc_P7_cl13_up
d5_KLRG1lo_Runx3_promoted
sc_P9_cl13_up
sc_d8_arm_up
nascent_all
sc_d5_cl13_up
sc_P9_arm_up
sc_P6_arm_up
sc_P2_cl13_up
sc_d5_arm_up
nascent_Il2_repressed
sc_P5_cl13_up
sc_P4_cl13_up


### Cluster heatmap

In [21]:
jaccard_df = pd.read_csv("%s/RNAseq_ChIPseq_Jaccard.csv"%out_dir)
jaccard_heatmap_df = jaccard_df[list(set(gene_list_df['labels']))]
jaccard_heatmap_df.index = jaccard_df['ChIP']

jaccard_clustermap = sns.clustermap(jaccard_heatmap_df, figsize=(30,30))
jaccard_clustermap.savefig("%s/RNAseq_ChIPseq_Jaccard.jpg"%out_dir)
plt.close()

### Cl13 key genes

In [33]:
cl13_d5_genes = gene_list_df[gene_list_df['labels'] == 'sc_d5_cl13_up']['gene_names'].tolist()

In [34]:
cl13_d5_chip_intersection_df = pd.read_csv("%s/sc_d5_cl13_up_RNAseq_ChIPseq_interseqGenes.csv"%out_dir)

In [90]:
nfat1_genes = cl13_d5_chip_intersection_df['2015_IMMUNITY_Martinez___Nfat1_invitro-CD8-PI-1h-WT'].tolist()
nfat1_genes = set(x for x in nfat1_genes if str(x) != 'nan')

tbet_genes = cl13_d5_chip_intersection_df['2015_JEM_Dominguez___Tbet_LCMVarm-D8-CD8-Il12-WT'].tolist()
tbet_genes = set(x for x in tbet_genes if str(x) != 'nan')

jund_genes = cl13_d5_chip_intersection_df['2016_NAT_IMMUNOL_Roychoudhuri___JunD_invitro-stim-CD8-WT'].tolist()
jund_genes = set(x for x in jund_genes if str(x) != 'nan')

tcf1_genes = cl13_d5_chip_intersection_df['2016_NAT_IMMUNOL_Xing___Tcf1_naive-CD8-WT'].tolist()
tcf1_genes = set(x for x in tcf1_genes if str(x) != 'nan')

brd4_genes = cl13_d5_chip_intersection_df['2021_GoldrathLab_Brd4___BRD4_NAV-CD8'].tolist()
brd4_genes = set(x for x in brd4_genes if str(x) != 'nan')

cbfb_genes = cl13_d5_chip_intersection_df['2017_NAT_IMMUNOL_Shan___CBFb_Naive_P14-CD8-WT'].tolist()
cbfb_genes = set(x for x in cbfb_genes if str(x) != 'nan')

In [92]:
il2_repressed_genes = gene_list_df[gene_list_df['labels'] == 'nascent_Il2_repressed']['gene_names'].tolist()
il2_promoted_genes = gene_list_df[gene_list_df['labels'] == 'nascent_Il2_promoted']['gene_names'].tolist()

In [93]:
len(nfat1_genes & tbet_genes & jund_genes & set(il2_repressed_genes))

151

In [94]:
nfat1_genes & tbet_genes & jund_genes & set(il2_repressed_genes)

{'A630001G21Rik',
 'Abhd2',
 'Adora2a',
 'Aff3',
 'Aftph',
 'Aim2',
 'Ap1s3',
 'Arhgap30',
 'Arhgef3',
 'Arid5b',
 'Arl5c',
 'Asap1',
 'Atg16l2',
 'B4galnt1',
 'Bcl2a1d',
 'Bin2',
 'Ccdc148',
 'Ccr6',
 'Cd226',
 'Cd274',
 'Cd3d',
 'Cd3e',
 'Cd44',
 'Cd52',
 'Cd84',
 'Cd9',
 'Cdkn1b',
 'Celf2',
 'Chst12',
 'Commd3',
 'Coq8a',
 'Csf1',
 'Cstad',
 'Ctsw',
 'Cxcr3',
 'Dnajc1',
 'Dnajc15',
 'Dock2',
 'Elk3',
 'Emb',
 'Ephx1',
 'Ets1',
 'Evl',
 'Fam129a',
 'Fam3c',
 'Fam53b',
 'Flnb',
 'Frmd4a',
 'Fryl',
 'Fyco1',
 'Galm',
 'Gata3',
 'Gfod1',
 'Gimap4',
 'Gimap5',
 'Gimap8',
 'Gnptab',
 'Gpr18',
 'Gramd1b',
 'Gramd3',
 'Haao',
 'Heg1',
 'Hmox2',
 'Id3',
 'Ifi211',
 'Ifi213',
 'Ifi27l2a',
 'Ifih1',
 'Ifnar1',
 'Ikzf2',
 'Ikzf3',
 'Il6st',
 'Inpp4b',
 'Ipcef1',
 'Irf9',
 'Kif13b',
 'Lax1',
 'Lbh',
 'Lclat1',
 'Lcp1',
 'Lgals3bp',
 'Limd2',
 'Lrch1',
 'Lsp1',
 'Ly6e',
 'Lyst',
 'Mast4',
 'Mcoln2',
 'Mcoln3',
 'Mgat5',
 'Mmd',
 'Ms4a4b',
 'Ms4a6b',
 'Ms4a6d',
 'Myl12b',
 'Myo1e',
 'Ncoa7',
 'Nfi

In [97]:
len(cbfb_genes & brd4_genes & tcf1_genes & set(il2_promoted_genes))

56

In [98]:
cbfb_genes & brd4_genes & tcf1_genes & set(il2_promoted_genes)

{'Abtb2',
 'Aebp2',
 'BC004004',
 'Bcl2',
 'Cap1',
 'Chsy1',
 'Clic4',
 'Clptm1l',
 'Crem',
 'Csrnp1',
 'Cytip',
 'Dock10',
 'Eif1',
 'Eif2a',
 'Ell2',
 'Ern1',
 'Fam49b',
 'Fam71b',
 'Fndc3a',
 'Gadd45b',
 'Glud1',
 'Gpd2',
 'Gpr146',
 'Gxylt1',
 'Hivep3',
 'Ifng',
 'Il18r1',
 'Kpna1',
 'Kras',
 'Lin54',
 'Mfhas1',
 'Ndfip1',
 'Nktr',
 'Nr4a3',
 'Odc1',
 'Osbpl3',
 'Otulin',
 'P2rx7',
 'Pfkp',
 'Pim1',
 'Prag1',
 'Ptger4',
 'Ptprk',
 'Rasa2',
 'Rbpj',
 'Rnf19b',
 'Rnf216',
 'Skil',
 'Slc38a1',
 'Srgn',
 'Tcf4',
 'Tes',
 'Themis',
 'Tmem131l',
 'Tnfrsf1b',
 'Zfp608'}