In [16]:
import pandas as pd
import glob
import os
import csv
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

try:
    code_dir
except NameError:
    code_dir = os.getcwd()
    source_dir = code_dir.replace("codes_local", "0_sources")
    out_dir = code_dir.replace("codes_local", "1_ChIP_peak_distribution_RNAseqChIPseqJaccardIndex")

In [2]:
gene_list_file = source_dir + "/gene_list.csv"
gene_list_df = pd.read_csv(gene_list_file)
print(set(gene_list_df['labels']))

chip_anno_dir = '/media/pipkin/Rocket2/T_Cell_ChIP/202012_ChIP/3_peak_annotations'
chip_anno_files = glob.glob("%s/*annoDf.csv"%chip_anno_dir)
chip_names = [x.split("/")[-1].replace("_annoDf.csv", "") for x in chip_anno_files]

out_file = out_dir + "/ChIP_peak_chisq"
out_sum_pval_file = out_dir + "/sum_ChIP_peak_chisq.csv"

{'sc_P6_cl13_up', 'sc_d8_cl13_up', 'd5_KLRG1lo_Runx3_repressed', 'sc_P5_arm_up', 'd5_KLRG1hi_all', 'nascent_Il2_promoted', 'sc_P4_arm_up', 'sc_P3_arm_up', 'sc_P2_arm_up', 'nascent_48h_up', 'sc_P7_arm_up', 'd5_KLRG1lo_all', 'sc_all', 'd5_KLRG1hi_Runx3_repressed', 'nascent_48h_dn', 'sc_P3_cl13_up', 'd5_KLRG1hi_Runx3_promoted', 'sc_P7_cl13_up', 'd5_KLRG1lo_Runx3_promoted', 'sc_P9_cl13_up', 'sc_d8_arm_up', 'nascent_all', 'sc_d5_cl13_up', 'sc_P9_arm_up', 'sc_P6_arm_up', 'sc_P2_cl13_up', 'sc_d5_arm_up', 'nascent_Il2_repressed', 'sc_P5_cl13_up', 'sc_P4_cl13_up'}


In [3]:
chip_df = pd.DataFrame({"ChIP": chip_names})
chip_genes_n = []
for chip_i_file in chip_anno_files:
    chip_i_df = pd.read_csv(chip_i_file)
    chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
    chip_genes_n.append(len(chip_i_genes))
chip_df["ChIP gene #"] = chip_genes_n

In [26]:
jaccard_df = chip_df.copy()
for gene_list_label in set(gene_list_df['labels']):
    print(gene_list_label)
    genes_df = pd.DataFrame()
    gene_set = set(gene_list_df[gene_list_df['labels'] == gene_list_label]['gene_names'].tolist())
    jaccard_list = []
    overlap_list = []
    for chip_i_file in chip_anno_files:
        chip_i_name = chip_i_file.split("/")[-1].replace("_annoDf.csv", "")
        chip_i_df = pd.read_csv(chip_i_file)
        chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
        
        intersect_genes = gene_set & chip_i_genes
        union_genes = gene_set | chip_i_genes
        
        jaccard_list.append(len(intersect_genes)/ len(union_genes))
        genes_df[chip_i_name] = pd.Series(list(intersect_genes))
        overlap_list.append("|".join(list(intersect_genes)))
    jaccard_df[gene_list_label] = ["{:.4f}".format(x) for x in jaccard_list]
    genes_df.to_csv("%s/%s_RNAseq_ChIPseq_interseqGenes.csv"%(out_dir, gene_list_label), index=False)
jaccard_df.to_csv("%s/RNAseq_ChIPseq_Jaccard.csv"%out_dir, index=False)

sc_P6_cl13_up
sc_d8_cl13_up
d5_KLRG1lo_Runx3_repressed
sc_P5_arm_up
d5_KLRG1hi_all
nascent_Il2_promoted
sc_P4_arm_up


  genes_df[chip_i_name] = pd.Series(list(intersect_genes))


sc_P3_arm_up
sc_P2_arm_up
nascent_48h_up
sc_P7_arm_up
d5_KLRG1lo_all
sc_all
d5_KLRG1hi_Runx3_repressed
nascent_48h_dn
sc_P3_cl13_up
d5_KLRG1hi_Runx3_promoted
sc_P7_cl13_up
d5_KLRG1lo_Runx3_promoted
sc_P9_cl13_up
sc_d8_arm_up
nascent_all
sc_d5_cl13_up
sc_P9_arm_up
sc_P6_arm_up
sc_P2_cl13_up
sc_d5_arm_up
nascent_Il2_repressed
sc_P5_cl13_up
sc_P4_cl13_up


### Cluster heatmap

In [21]:
jaccard_df = pd.read_csv("%s/RNAseq_ChIPseq_Jaccard.csv"%out_dir)
jaccard_heatmap_df = jaccard_df[list(set(gene_list_df['labels']))]
jaccard_heatmap_df.index = jaccard_df['ChIP']

jaccard_clustermap = sns.clustermap(jaccard_heatmap_df, figsize=(30,30))
jaccard_clustermap.savefig("%s/RNAseq_ChIPseq_Jaccard.jpg"%out_dir)
plt.close()

0        Ppm1h
1         Cd3e
2        Runx2
3         Nek7
4          Lbh
5      Apobec3
6        Ptk2b
7        Hif1a
8         Cltc
9      S100a11
10       Rbm25
11     Tmem154
12        Ybx3
13        Icos
14        Cd3g
15       Slfn2
16       Fam3c
17       Stap1
18       Sept7
19       Gata3
20    B4galnt1
21         Tox
dtype: object