In [16]:
import pandas as pd
import glob
import os
import csv
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

try:
    code_dir
except NameError:
    code_dir = os.getcwd()
    source_dir = code_dir.replace("codes_local", "0_sources")
    out_dir = code_dir.replace("codes_local", "1_ChIP_peak_distribution_RNAseqChIPseqJaccardIndex")

In [2]:
gene_list_file = source_dir + "/gene_list.csv"
gene_list_df = pd.read_csv(gene_list_file)
print(set(gene_list_df['labels']))

chip_anno_dir = '/media/pipkin/Rocket2/T_Cell_ChIP/202012_ChIP/3_peak_annotations'
chip_anno_files = glob.glob("%s/*annoDf.csv"%chip_anno_dir)
chip_names = [x.split("/")[-1].replace("_annoDf.csv", "") for x in chip_anno_files]

out_file = out_dir + "/ChIP_peak_chisq"
out_sum_pval_file = out_dir + "/sum_ChIP_peak_chisq.csv"

{'sc_P6_cl13_up', 'sc_d8_cl13_up', 'd5_KLRG1lo_Runx3_repressed', 'sc_P5_arm_up', 'd5_KLRG1hi_all', 'nascent_Il2_promoted', 'sc_P4_arm_up', 'sc_P3_arm_up', 'sc_P2_arm_up', 'nascent_48h_up', 'sc_P7_arm_up', 'd5_KLRG1lo_all', 'sc_all', 'd5_KLRG1hi_Runx3_repressed', 'nascent_48h_dn', 'sc_P3_cl13_up', 'd5_KLRG1hi_Runx3_promoted', 'sc_P7_cl13_up', 'd5_KLRG1lo_Runx3_promoted', 'sc_P9_cl13_up', 'sc_d8_arm_up', 'nascent_all', 'sc_d5_cl13_up', 'sc_P9_arm_up', 'sc_P6_arm_up', 'sc_P2_cl13_up', 'sc_d5_arm_up', 'nascent_Il2_repressed', 'sc_P5_cl13_up', 'sc_P4_cl13_up'}


In [3]:
chip_df = pd.DataFrame({"ChIP": chip_names})
chip_genes_n = []
for chip_i_file in chip_anno_files:
    chip_i_df = pd.read_csv(chip_i_file)
    chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
    chip_genes_n.append(len(chip_i_genes))
chip_df["ChIP gene #"] = chip_genes_n

In [26]:
jaccard_df = chip_df.copy()
for gene_list_label in set(gene_list_df['labels']):
    print(gene_list_label)
    genes_df = pd.DataFrame()
    gene_set = set(gene_list_df[gene_list_df['labels'] == gene_list_label]['gene_names'].tolist())
    jaccard_list = []
    overlap_list = []
    for chip_i_file in chip_anno_files:
        chip_i_name = chip_i_file.split("/")[-1].replace("_annoDf.csv", "")
        chip_i_df = pd.read_csv(chip_i_file)
        chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
        
        intersect_genes = gene_set & chip_i_genes
        union_genes = gene_set | chip_i_genes
        
        jaccard_list.append(len(intersect_genes)/ len(union_genes))
        genes_df[chip_i_name] = pd.Series(list(intersect_genes))
        overlap_list.append("|".join(list(intersect_genes)))
    jaccard_df[gene_list_label] = ["{:.4f}".format(x) for x in jaccard_list]
    genes_df.to_csv("%s/%s_RNAseq_ChIPseq_interseqGenes.csv"%(out_dir, gene_list_label), index=False)
jaccard_df.to_csv("%s/RNAseq_ChIPseq_Jaccard.csv"%out_dir, index=False)

sc_P6_cl13_up
sc_d8_cl13_up
d5_KLRG1lo_Runx3_repressed
sc_P5_arm_up
d5_KLRG1hi_all
nascent_Il2_promoted
sc_P4_arm_up


  genes_df[chip_i_name] = pd.Series(list(intersect_genes))


sc_P3_arm_up
sc_P2_arm_up
nascent_48h_up
sc_P7_arm_up
d5_KLRG1lo_all
sc_all
d5_KLRG1hi_Runx3_repressed
nascent_48h_dn
sc_P3_cl13_up
d5_KLRG1hi_Runx3_promoted
sc_P7_cl13_up
d5_KLRG1lo_Runx3_promoted
sc_P9_cl13_up
sc_d8_arm_up
nascent_all
sc_d5_cl13_up
sc_P9_arm_up
sc_P6_arm_up
sc_P2_cl13_up
sc_d5_arm_up
nascent_Il2_repressed
sc_P5_cl13_up
sc_P4_cl13_up


### Cluster heatmap

In [21]:
jaccard_df = pd.read_csv("%s/RNAseq_ChIPseq_Jaccard.csv"%out_dir)
jaccard_heatmap_df = jaccard_df[list(set(gene_list_df['labels']))]
jaccard_heatmap_df.index = jaccard_df['ChIP']

jaccard_clustermap = sns.clustermap(jaccard_heatmap_df, figsize=(30,30))
jaccard_clustermap.savefig("%s/RNAseq_ChIPseq_Jaccard.jpg"%out_dir)
plt.close()

### Cl13 & Arm key genes

In [122]:
cl13_d5_chip_intersection_df = pd.read_csv("%s/sc_d5_cl13_up_RNAseq_ChIPseq_interseqGenes.csv"%out_dir)
arm_d5_chip_intersection_df = pd.read_csv("%s/sc_d5_arm_up_RNAseq_ChIPseq_interseqGenes.csv"%out_dir)

il2_repressed_genes = gene_list_df[gene_list_df['labels'] == 'nascent_Il2_repressed']['gene_names'].tolist()
il2_promoted_genes = gene_list_df[gene_list_df['labels'] == 'nascent_Il2_promoted']['gene_names'].tolist()

nfat1_genes = cl13_d5_chip_intersection_df['2015_IMMUNITY_Martinez___Nfat1_invitro-CD8-PI-1h-WT'].tolist()
nfat1_genes = set(x for x in nfat1_genes if str(x) != 'nan')

tbet_genes = cl13_d5_chip_intersection_df['2015_JEM_Dominguez___Tbet_LCMVarm-D8-CD8-Il12-WT'].tolist()
tbet_genes = set(x for x in tbet_genes if str(x) != 'nan')

jund_genes = cl13_d5_chip_intersection_df['2016_NAT_IMMUNOL_Roychoudhuri___JunD_invitro-stim-CD8-WT'].tolist()
jund_genes = set(x for x in jund_genes if str(x) != 'nan')

tcf1_genes = arm_d5_chip_intersection_df['2016_NAT_IMMUNOL_Xing___Tcf1_naive-CD8-WT'].tolist()
tcf1_genes = set(x for x in tcf1_genes if str(x) != 'nan')

brd4_genes = arm_d5_chip_intersection_df['2021_GoldrathLab_Brd4___BRD4_NAV-CD8'].tolist()
brd4_genes = set(x for x in brd4_genes if str(x) != 'nan')

cbfb_genes = arm_d5_chip_intersection_df['2017_NAT_IMMUNOL_Shan___CBFb_Naive_P14-CD8-WT'].tolist()
cbfb_genes = set(x for x in cbfb_genes if str(x) != 'nan')

avg_expr_file = '/media/pipkin/ROCKET-PRO/CD8_DEV_SC/0_Acute-Chronic/1_Scanpy/0_Scanpy_out_resampled/1_avg_expr/cell_type_simp_mean_scaled_expr.csv'
avg_expr_df = pd.read_csv(avg_expr_file).set_index("gene_name")

In [129]:
cl13_key_genes = nfat1_genes & tbet_genes & jund_genes & set(il2_repressed_genes)
arm_key_genes = cbfb_genes & brd4_genes & tcf1_genes & set(il2_promoted_genes)

key_genes_df = pd.DataFrame()
key_genes_df['Cl13'] = pd.Series(list(cl13_key_genes))
key_genes_df['Arm'] = pd.Series(list(arm_key_genes))
key_genes_df.to_csv("/media/pipkin/ROCKET-PRO/CD8_DEV_SC/3_ChIP/chip_regulated_key_genes_Cl13-Arm-d5.csv",
                   index=False)

In [124]:
cl13_avg_expr_df = avg_expr_df.loc[[x for x in cl13_key_genes if x in avg_expr_df.index]]
cl13_avg_expr_df.sort_values("C5", ascending=False).index.tolist()

['Cd9',
 'Ifi27l2a',
 'Nfkbiz',
 'Traf1',
 'Serpinb6b',
 'Xcl1',
 'Bcl2a1d',
 'Ikzf2',
 'Cd226',
 'Asap1',
 'Ly6e',
 'Tnfsf8',
 'Pou2f2',
 'Ifih1',
 'Emb',
 'Serpinb9',
 'Cd52',
 'Gpr18',
 'Lsp1',
 'Tspan13',
 'Lgals3bp',
 'Zbp1',
 'Smg6',
 'Rora',
 'Ccr6',
 'Myo1e',
 'Cxcr3',
 'Sla',
 'Ppp1r16b',
 'Gimap4',
 'Ms4a4b',
 'Inpp4b',
 'Cd274',
 'Gimap5',
 'Ms4a6d',
 'Ets1',
 'Tnip1',
 'Aff3',
 'Id3',
 'Stat4',
 'Pecam1',
 'Rasa3',
 'Mast4',
 'Cd44',
 'Resf1',
 'Ipcef1',
 'Irf9',
 'Fam129a',
 'Nrip1',
 'Aftph',
 'Nfkbie',
 'Gfod1',
 'Gramd1b',
 'Gata3',
 'Rasgef1b',
 'Gramd3',
 'Elk3',
 'Arl5c',
 'Aim2',
 'Lyst',
 'Ncoa7',
 'Frmd4a',
 'Dnajc1',
 'Nsmaf',
 'Lrch1',
 'Ephx1',
 'Coq8a',
 'Ripor2',
 'Usp3',
 'Tbc1d4',
 'Mgat5',
 'A630001G21Rik',
 'Abhd2',
 'Stat1',
 'Spry2',
 'Prkcb',
 'Ifi213',
 'Haao',
 'Ifnar1',
 'Parp14',
 'Kif13b',
 'Heg1',
 'Il6st',
 'Pacsin1',
 'Arid5b',
 'Lax1',
 'Ctsw',
 'Cstad',
 'Rnasel',
 'Ikzf3',
 'Nfia',
 'Csf1',
 'Xdh',
 'Gimap8',
 'Arhgef3',
 'Rab37',
 'Vsir',
 

In [125]:
arm_avg_expr_df = avg_expr_df.loc[[x for x in arm_key_genes if x in avg_expr_df.index]]
arm_avg_expr_df.sort_values("C5", ascending=False).index.tolist()

['Nme1',
 'Ppa1',
 'Hsp90aa1',
 'Ldha',
 'Hnrnpab',
 'Manf',
 'Hsp90ab1',
 'Vim',
 'Susd1',
 'Tgm2',
 'Pcgf5',
 'Selp',
 'Amd2',
 'Hspa9',
 'Rev3l',
 'Ciart',
 'Rps20',
 'Elovl6',
 'Rgs1',
 'Fgd2',
 'Klf6',
 'Setbp1',
 'Wwp1',
 'Abl2',
 'Tnfrsf10b',
 'Trio',
 'Prf1',
 'Slc3a2',
 'Tgfbr3',
 'Camsap2',
 'Atp2b4',
 'Rffl',
 'Gadd45g',
 'Slc16a6',
 'Irs2',
 'Malt1',
 'Chd7',
 'Znrf3',
 'Rplp1',
 'Crim1',
 'Klf7',
 'Rap2a',
 'Rps19',
 'Gzmb',
 'Tsc22d3',
 'Rps7',
 'Rps9',
 'Il7r',
 'Hspa8']