In [33]:
import pandas as pd
import glob
import os
import csv
import numpy as np
from scipy.stats import chi2_contingency

# Derive the project directories from the current working directory, but only
# when `code_dir` has not been defined yet in this session, so that re-running
# the cell leaves previously set values untouched.
if "code_dir" not in globals():
    code_dir = os.getcwd()
    # The sibling directories are obtained by substituting the "codes_local"
    # substring of the working-directory path.
    source_dir, out_dir = (
        code_dir.replace("codes_local", replacement)
        for replacement in ("0_sources", "1_ChIP_peak_distribution")
    )

In [49]:
# Gene-label table; the 'labels' column is printed here to show which gene-set
# labels are available.
gene_list_file = source_dir + "/gene_list.csv"
gene_list_df = pd.read_csv(gene_list_file)
print(set(gene_list_df['labels']))

# NOTE(review): machine-specific absolute path -- the notebook only runs where
# this mount exists; consider making it configurable.
chip_anno_dir = '/media/pipkin/Rocket2/T_Cell_ChIP/202012_ChIP/3_peak_annotations'
# glob returns matches in a filesystem-dependent order; sorting keeps the order
# of the ChIP annotation files (and anything built by iterating over them)
# reproducible across runs and machines.
chip_anno_files = sorted(glob.glob("%s/*annoDf.csv"%chip_anno_dir))

# Output locations: detailed per-test report and summary table of p-values.
out_file = out_dir + "/ChIP_peak_chisq.csv"
out_sum_pval_file = out_dir + "/ChIP_peak_chisq_pval.csv"

{'sc_d8_arm_up', 'sc_d5_cl13_up', 'd5_KLRG1lo_all', 'd5_KLRG1hi_all', 'nascent_48h_up', 'nascent_all', 'd5_KLRG1hi_Runx3_promoted', 'sc_d8_cl13_up', 'd5_KLRG1lo_Runx3_repressed', 'd5_KLRG1lo_Runx3_promoted', 'd5_KLRG1hi_Runx3_repressed', 'sc_all', 'nascent_48h_dn', 'sc_d5_arm_up', 'nascent_Il2_repressed', 'nascent_Il2_promoted'}


In [50]:
# Each entry is a [target label, source label] pair of gene-set labels.
label_combinations = [
    # nascent gene sets
    ['nascent_48h_up', 'nascent_all'],
    ['nascent_48h_dn', 'nascent_all'],
    ['nascent_Il2_promoted', 'nascent_48h_up'],
    ['nascent_Il2_repressed', 'nascent_48h_dn'],
    # sc gene sets
    ['sc_d5_arm_up', 'sc_all'],
    ['sc_d5_cl13_up', 'sc_all'],
    ['sc_d8_arm_up', 'sc_all'],
    ['sc_d8_cl13_up', 'sc_all'],
    # d5 KLRG1hi / KLRG1lo gene sets
    ['d5_KLRG1hi_Runx3_promoted', 'd5_KLRG1hi_all'],
    ['d5_KLRG1hi_Runx3_repressed', 'd5_KLRG1hi_all'],
    ['d5_KLRG1lo_Runx3_promoted', 'd5_KLRG1lo_all'],
    ['d5_KLRG1lo_Runx3_repressed', 'd5_KLRG1lo_all'],
]


# Summary table: one row per ChIP annotation file, labelled by the file name
# with its "_annoDf.csv" suffix removed.
chip_labels = [anno_path.split("/")[-1].replace("_annoDf.csv", "")
               for anno_path in chip_anno_files]
out_sum_pval_file_df = pd.DataFrame({"ChIP": chip_labels})

# Number of distinct non-missing entries in each file's 'SYMBOL' column.
chip_genes_n = [
    len({symbol for symbol in pd.read_csv(anno_path)['SYMBOL'] if str(symbol) != 'nan'})
    for anno_path in chip_anno_files
]
out_sum_pval_file_df["ChIP gene #"] = chip_genes_n
    

# For every (target, source) label pair and every ChIP annotation file, test
# whether genes with a ChIP peak are distributed differently between the target
# gene set and the remaining source genes (2x2 chi-square test). A detailed
# record per test goes to `out_file`; p-values are collected as one column per
# label pair in `out_sum_pval_file_df`, which is written to `out_sum_pval_file`.

# Parse each annotation file once up front: the gene sets do not depend on the
# label pair, so re-reading the CSV inside the comparison loop is wasted work.
chip_gene_sets = []
for chip_i_file in chip_anno_files:
    chip_i_name = chip_i_file.split("/")[-1].replace("_annoDf.csv", "")
    chip_i_df = pd.read_csv(chip_i_file)
    # Missing symbols are read as NaN; drop them from the gene set.
    chip_i_genes = set(x for x in chip_i_df['SYMBOL'] if str(x) != 'nan')
    chip_gene_sets.append((chip_i_name, chip_i_genes))

# newline="" is what the csv module expects of files it writes to; without it
# some platforms translate the writer's line terminator and emit blank rows.
with open(out_file, "w", newline="") as fout:
    wfout = csv.writer(fout, delimiter = ",")

    for label_target, label_source in label_combinations:
        cp_name = "%s__in__%s"%(label_target, label_source)

        # Gene sets for this label pair (independent of the ChIP file).
        target_genes = set(gene_list_df[gene_list_df['labels'] == label_target]['gene_names'].tolist())
        source_genes = set(gene_list_df[gene_list_df['labels'] == label_source]['gene_names'].tolist())
        non_target_genes = source_genes - target_genes

        cp_p_list = []
        for chip_i_name, chip_i_genes in chip_gene_sets:
            # Calculate contingency table
            target_chip_genes = target_genes & chip_i_genes
            target_non_chip_genes = target_genes - target_chip_genes

            non_target_chip_genes = non_target_genes & chip_i_genes
            non_target_non_chip_genes = non_target_genes - non_target_chip_genes

            chisq_array = np.array([[len(target_chip_genes), len(target_non_chip_genes)],
                                    [len(non_target_chip_genes), len(non_target_non_chip_genes)]])

            # Perform chisq test
            try:
                chi2, p, dof, expected = chi2_contingency(chisq_array, correction=True)
            except ValueError:
                # chi2_contingency rejects tables whose expected frequencies
                # contain a zero (e.g. an empty row or column). Record the test
                # as undefined instead of aborting all remaining comparisons.
                chi2, p = np.nan, np.nan

            # Append pval
            cp_p_list.append(p)

            # Write test result
            wfout.writerow(["ChIP", chip_i_name])
            wfout.writerow(["Target genes", label_target])
            wfout.writerow(["Source genes", label_source])
            wfout.writerow([])
            wfout.writerow(["", "ChIP", "None ChIP"])
            wfout.writerow(["Target", len(target_chip_genes), len(target_non_chip_genes)])
            wfout.writerow(["Other", len(non_target_chip_genes), len(non_target_non_chip_genes)])
            wfout.writerow(["chisq", chi2])
            wfout.writerow(["p-value", p])
            wfout.writerow([])
            wfout.writerow([])

        # One summary column per label pair, p-values formatted to 4 decimals.
        out_sum_pval_file_df[cp_name] = ["{:.4f}".format(x) for x in cp_p_list]

# The summary does not need the detailed report's file handle, so write it
# after that file has been closed.
out_sum_pval_file_df.to_csv(out_sum_pval_file, index=False)