In [3]:
import pandas as pd
import glob
import os
import csv
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
%matplotlib inline

# Resolve the project directories only once per kernel session: if `code_dir`
# already exists (e.g. the cell is re-run after a chdir), keep the current values.
try:
    code_dir
except NameError:
    code_dir = os.getcwd()
    # Sibling directories are derived by swapping the "codes_local" path
    # component for the name of the corresponding project folder.
    source_dir, out_dir, out_dir_simp = [
        code_dir.replace("codes_local", sibling_name)
        for sibling_name in ("0_sources", "2_ChIP_peak_jaccard", "2_ChIP_peak_jaccard_simp")
    ]

In [4]:
# Load the gene-list table and show the distinct values of its 'labels' column.
gene_list_file = source_dir + "/gene_list.csv"
gene_list_df = pd.read_csv(gene_list_file)
gene_list_labels_uniq = set(gene_list_df['labels'])
print(gene_list_labels_uniq)

# Discover the per-ChIP peak annotation tables.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable base directory like the other paths in this notebook.
chip_anno_dir = '/media/pipkin/Rocket2/T_Cell_ChIP/202012_ChIP/3_peak_annotations'
# glob returns matches in arbitrary (filesystem-dependent) order; sort so that
# chip_names -- and everything derived from its order downstream -- is
# reproducible across runs and machines.
chip_anno_files = sorted(glob.glob("%s/*annoDf.csv"%chip_anno_dir))
# ChIP name = file name without the "_annoDf.csv" suffix; basename() instead of
# splitting on "/" keeps this correct regardless of the OS path separator.
chip_names = [os.path.basename(x).replace("_annoDf.csv", "") for x in chip_anno_files]

{'sc_d8_cl13_up', 'nascent_Il2_repressed', 'sc_P4_arm_up', 'nascent_Il2_promoted', 'nascent_all', 'sc_P2_cl13_up', 'sc_P9_cl13_up', 'sc_d8_arm_up', 'd5_KLRG1hi_Runx3_promoted', 'sc_P9_arm_up', 'sc_P3_arm_up', 'd5_KLRG1lo_all', 'nascent_48h_up', 'nascent_48h_dn', 'sc_P4_cl13_up', 'sc_P5_arm_up', 'd5_KLRG1lo_Runx3_repressed', 'sc_P2_arm_up', 'd5_KLRG1hi_all', 'sc_all', 'sc_P6_arm_up', 'sc_P6_cl13_up', 'd5_KLRG1lo_Runx3_promoted', 'sc_P7_arm_up', 'sc_d5_arm_up', 'sc_P3_cl13_up', 'd5_KLRG1hi_Runx3_repressed', 'sc_d5_cl13_up', 'sc_P7_cl13_up', 'sc_P5_cl13_up'}


In [19]:
# Build a one-row-per-ChIP summary table: number of distinct annotated gene
# symbols, experiment type (histone mark vs. TF) and CD4/CD8 origin.
chip_df = pd.DataFrame({"ChIP": chip_names})
chip_genes_n = []
for chip_i_file in chip_anno_files:
    # Only the SYMBOL column is needed, so avoid loading the full annotation table.
    chip_i_symbols = pd.read_csv(chip_i_file, usecols=['SYMBOL'])['SYMBOL']
    # nunique() ignores missing values, i.e. counts distinct non-NaN symbols.
    chip_genes_n.append(chip_i_symbols.nunique())
chip_df["ChIP gene #"] = chip_genes_n

# The second "___"-separated field of the name is checked for an "H3" prefix.
# NOTE(review): assumes every ChIP name contains "___" -- a name without it
# raises IndexError here; confirm against the naming scheme of the input files.
chip_df['type'] = ["Histone" if x.split("___")[1].startswith("H3") else "TF" for x in chip_names]
chip_df['CD4_CD8'] = ["CD8" if "CD8" in x else "CD4" if "CD4" in x else "" for x in chip_names]

# Make sure the output directory exists before writing the summary.
os.makedirs(out_dir_simp, exist_ok=True)
chip_df.to_csv("%s/chip_sum.csv"%out_dir_simp, index=False)

## Manually annotate datasets to be used in simplified plots

## 3. Summarize Jaccard index between ChIPs for different RNA-seq associations

In [6]:
# Use only non-redundant (unordered, no self-pairs) combinations of ChIP
# experiments; each pair is encoded as a single "<ChIP_1>---<ChIP_2>" key.
uniq_chip_cbs = ["---".join(chip_pair) for chip_pair in itertools.combinations(chip_names, 2)]

In [8]:
# Collect, for every "*jaccard.csv" table in out_dir, the Jaccard index of each
# unique ChIP pair into one wide table (rows: ChIP pairs, columns: one per file).
# Sorted because glob returns files in arbitrary order, which would otherwise
# make the column order of the output non-reproducible.
jaccard_files = sorted(glob.glob("%s/*jaccard.csv"%out_dir))
jaccard_all_df = pd.DataFrame({"ChIP_combination": uniq_chip_cbs})
for jaccard_file in jaccard_files:
    # Column name = file name without the "_jaccard.csv" suffix.
    jaccard_name = os.path.basename(jaccard_file).replace("_jaccard.csv", "")
    jaccard_df = pd.read_csv(jaccard_file)

    # Wide matrix -> long format: one row per (ChIP_1, ChIP_2) pair.
    jaccard_df_melt = pd.melt(jaccard_df, id_vars=['ChIP', 'ChIP gene #'])
    jaccard_df_melt.columns = ['ChIP_1', 'ChIP gene #', 'ChIP_2', 'jaccard_index']
    # Same "<ChIP_1>---<ChIP_2>" key format as uniq_chip_cbs, built column-wise.
    jaccard_df_melt['ChIP_combination'] = jaccard_df_melt['ChIP_1'] + "---" + jaccard_df_melt['ChIP_2']
    jaccard_df_melt = jaccard_df_melt.set_index("ChIP_combination")[['jaccard_index']]
    # Keep only the non-redundant pairs, in the order of uniq_chip_cbs
    # (.loc raises KeyError if a pair is missing from this table).
    jaccard_df_melt = jaccard_df_melt.loc[uniq_chip_cbs]

    jaccard_all_df[jaccard_name] = jaccard_df_melt['jaccard_index'].tolist()
jaccard_all_df.to_csv("%s/All_jaccard_index.csv"%out_dir, index=False)

In [10]:
# Matrix for clustering: ChIP pairs become the row index, leaving only the
# per-file Jaccard columns as data.
jaccard_all_clustermap_df = jaccard_all_df.set_index("ChIP_combination", drop=True)

In [11]:
# Hierarchically cluster the ChIP-pair x Jaccard-table matrix and save the figure.
jaccard_all_clustermap = sns.clustermap(jaccard_all_clustermap_df, figsize=(30,30))
jaccard_all_clustermap.savefig("%s/All_jaccard_index_clustermap.jpg"%out_dir)
plt.close()

# Export the clustered leaf order of both axes so other plots can reuse it.
# `reordered_ind` holds the positional indices of the leaves in dendrogram order,
# so the labels can be looked up directly instead of parsing the 'ivl' strings.
# NOTE(review): the output column is named "ChIP" for both files although the
# column-order file lists matrix column names and the row-order file lists
# ChIP pairs; header kept unchanged so existing consumers keep working.
for axis_tag, axis_dendrogram, axis_labels in (
    ("Col", jaccard_all_clustermap.dendrogram_col, jaccard_all_clustermap_df.columns),
    ("Row", jaccard_all_clustermap.dendrogram_row, jaccard_all_clustermap_df.index),
):
    jaccard_all_order = axis_labels[axis_dendrogram.reordered_ind].tolist()
    jaccard_all_order_df = pd.DataFrame({"ChIP": jaccard_all_order})
    jaccard_all_order_df.to_csv("%s/All_jaccard_index_clustermap%sOrder.csv"%(out_dir, axis_tag))

