In [1]:
import pandas as pd
import glob
import os
import csv
import numpy as np
from scipy.stats import chi2_contingency

# Resolve the working directories only once per kernel session.
# If `code_dir` is already bound, the bare name lookup succeeds and nothing
# below is reassigned; otherwise all three paths are derived from the
# current working directory.
try:
    code_dir
except NameError:
    code_dir = os.getcwd()
    # Both sibling directories are obtained by swapping the "codes_local"
    # component of the path for the corresponding folder name.
    source_dir, out_dir = (
        code_dir.replace("codes_local", sibling)
        for sibling in ("0_sources", "1_ChIP_peak_distribution")
    )

In [2]:
# Load the gene-list table and show which labels it contains.
gene_list_file = source_dir + "/gene_list.csv"
gene_list_df = pd.read_csv(gene_list_file)
print(set(gene_list_df['labels']))
print([x for x in set(gene_list_df['labels']) if 'sc_P' in x and 'arm' in x])

# Collect the per-ChIP peak annotation tables.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable location so the notebook runs elsewhere.
chip_anno_dir = '/media/pipkin/Rocket2/T_Cell_ChIP/202012_ChIP/3_peak_annotations'
# glob.glob() returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the row order of every downstream output reproducible between runs.
chip_anno_files = sorted(glob.glob("%s/*annoDf.csv"%chip_anno_dir))

# Output locations: `out_file` is a filename prefix (a per-comparison suffix is
# appended later), `out_sum_pval_file` is the single summary table.
out_file = out_dir + "/ChIP_peak_chisq"
out_sum_pval_file = out_dir + "/sum_ChIP_peak_chisq.csv"

{'sc_P3_cl13_up', 'd5_KLRG1lo_Runx3_promoted', 'sc_P5_cl13_up', 'sc_P6_arm_up', 'sc_P3_arm_up', 'sc_P5_arm_up', 'sc_P4_arm_up', 'sc_P7_arm_up', 'sc_P2_cl13_up', 'sc_P6_cl13_up', 'd5_KLRG1hi_Runx3_promoted', 'sc_P9_arm_up', 'nascent_Il2_repressed', 'sc_d5_arm_up', 'd5_KLRG1lo_Runx3_repressed', 'sc_all', 'nascent_48h_up', 'd5_KLRG1hi_Runx3_repressed', 'sc_d8_arm_up', 'nascent_48h_dn', 'nascent_Il2_promoted', 'sc_P7_cl13_up', 'sc_d8_cl13_up', 'sc_P4_cl13_up', 'nascent_all', 'sc_d5_cl13_up', 'sc_P9_cl13_up', 'sc_P2_arm_up', 'd5_KLRG1hi_all', 'd5_KLRG1lo_all'}
['sc_P6_arm_up', 'sc_P3_arm_up', 'sc_P5_arm_up', 'sc_P4_arm_up', 'sc_P7_arm_up', 'sc_P9_arm_up', 'sc_P2_arm_up']


In [3]:
# Pairs of gene-list labels to compare against each other: [target1, target2].
label_combinations = [
    ["nascent_48h_up", "nascent_48h_dn"],
    ["nascent_Il2_promoted", "nascent_Il2_repressed"],
    ["sc_d5_arm_up", "sc_d5_cl13_up"],
    ["sc_d8_arm_up", "sc_d8_cl13_up"],
    ["d5_KLRG1hi_Runx3_promoted", "d5_KLRG1hi_Runx3_repressed"],
    ["d5_KLRG1lo_Runx3_promoted", "d5_KLRG1lo_Runx3_repressed"],
]

# Add one arm-vs-cl13 pair per "sc_P<n>" label.
# NOTE(review): the labels printed from gene_list.csv also include sc_P7_*,
# but 7 is not in this list — confirm the omission is intentional.
label_combinations.extend(
    ["sc_P%s_arm_up" % cluster, "sc_P%s_cl13_up" % cluster]
    for cluster in (2, 3, 4, 5, 6, 9)
)
label_combinations

[['nascent_48h_up', 'nascent_48h_dn'],
 ['nascent_Il2_promoted', 'nascent_Il2_repressed'],
 ['sc_d5_arm_up', 'sc_d5_cl13_up'],
 ['sc_d8_arm_up', 'sc_d8_cl13_up'],
 ['d5_KLRG1hi_Runx3_promoted', 'd5_KLRG1hi_Runx3_repressed'],
 ['d5_KLRG1lo_Runx3_promoted', 'd5_KLRG1lo_Runx3_repressed'],
 ['sc_P2_arm_up', 'sc_P2_cl13_up'],
 ['sc_P3_arm_up', 'sc_P3_cl13_up'],
 ['sc_P4_arm_up', 'sc_P4_cl13_up'],
 ['sc_P5_arm_up', 'sc_P5_cl13_up'],
 ['sc_P6_arm_up', 'sc_P6_cl13_up'],
 ['sc_P9_arm_up', 'sc_P9_cl13_up']]

In [4]:
# For every pair of gene-list labels and every ChIP annotation table, build the
# 2x2 contingency table (label x ChIP-annotated / not annotated), run a
# chi-square test, write a per-pair report CSV, and collect p-values and
# ChIP-annotated percentages into one summary table.


def _percent_of(count, total):
    """Return `count` as a percentage of `total`, or NaN when `total` is 0."""
    return count / total * 100 if total else np.nan


# Read each annotation table exactly once and cache its set of gene symbols;
# the same sets are reused for every label pair below.
chip_names = []
chip_gene_sets = []
for chip_i_file in chip_anno_files:
    chip_i_df = pd.read_csv(chip_i_file)
    # basename() instead of split("/") so the name is right on any OS.
    chip_names.append(os.path.basename(chip_i_file).replace("_annoDf.csv", ""))
    # Missing symbols are read as NaN by pandas; drop them before building the set.
    chip_gene_sets.append(set(chip_i_df['SYMBOL'].dropna()))
chip_genes_n = [len(chip_i_genes) for chip_i_genes in chip_gene_sets]

out_sum_pval_file_df = pd.DataFrame({"ChIP": chip_names})
out_sum_pval_file_df["ChIP gene #"] = chip_genes_n

for label_target1, label_target2 in label_combinations:
    cp_name = "%s__in__%s"%(label_target1, label_target2)
    out_file_cp = out_file + "--" + cp_name + ".csv"

    # The two target gene sets depend only on the label pair, not on the ChIP
    # file, so compute them once per pair.
    target1_genes = set(gene_list_df[gene_list_df['labels'] == label_target1]['gene_names'].tolist())
    target2_genes = set(gene_list_df[gene_list_df['labels'] == label_target2]['gene_names'].tolist())

    cp_p_list = []
    cp_1_pctg = []
    cp_2_pctg = []
    # newline="" is what the csv module requires of its file object; without
    # it, extra blank rows appear on platforms that translate line endings.
    with open(out_file_cp, "w", newline="") as fout:
        wfout = csv.writer(fout, delimiter=",")
        for chip_i_name, chip_i_genes in zip(chip_names, chip_gene_sets):
            # Calculate contingency table
            target1_chip_n = len(target1_genes & chip_i_genes)
            target2_chip_n = len(target2_genes & chip_i_genes)
            target1_non_chip_n = len(target1_genes) - target1_chip_n
            target2_non_chip_n = len(target2_genes) - target2_chip_n

            chisq_array = np.array([[target1_chip_n, target1_non_chip_n],
                                    [target2_chip_n, target2_non_chip_n]])
            # NaN (instead of ZeroDivisionError) when a label has no genes.
            target1_chip_pctg = _percent_of(target1_chip_n, len(target1_genes))
            target1_nonchip_pctg = _percent_of(target1_non_chip_n, len(target1_genes))
            target2_chip_pctg = _percent_of(target2_chip_n, len(target2_genes))
            target2_nonchip_pctg = _percent_of(target2_non_chip_n, len(target2_genes))

            # Perform chisq test
            try:
                chi2, p, dof, expected = chi2_contingency(chisq_array, correction=True)
            except ValueError:
                # chi2_contingency raises when an expected frequency is zero
                # (an empty row or column); report the test as undefined
                # rather than aborting every remaining comparison.
                chi2, p = np.nan, np.nan

            # Append pval
            cp_p_list.append(p)
            cp_1_pctg.append(target1_chip_pctg)
            cp_2_pctg.append(target2_chip_pctg)

            # Write test result
            wfout.writerow(["ChIP", chip_i_name])
            wfout.writerow(["Target1 genes", label_target1])
            wfout.writerow(["Target2 genes", label_target2])
            wfout.writerow([])
            wfout.writerow(["", "ChIP", "None ChIP"])
            wfout.writerow(["Target1",
                            "{} ({:.1f}%)".format(target1_chip_n, target1_chip_pctg),
                            "{} ({:.1f}%)".format(target1_non_chip_n, target1_nonchip_pctg)])
            wfout.writerow(["Target2",
                            "{} ({:.1f}%)".format(target2_chip_n, target2_chip_pctg),
                            "{} ({:.1f}%)".format(target2_non_chip_n, target2_nonchip_pctg)])
            wfout.writerow(["chisq", chi2])
            wfout.writerow(["p-value", p])
            wfout.writerow([])
            wfout.writerow([])

    # One p-value column per comparison and one percentage column per label.
    out_sum_pval_file_df["%s_pval"%cp_name] = ["{:.4f}".format(x) for x in cp_p_list]
    out_sum_pval_file_df["%s_chip_pctg"%label_target1] = ["{:.4f}".format(x) for x in cp_1_pctg]
    out_sum_pval_file_df["%s_chip_pctg"%label_target2] = ["{:.4f}".format(x) for x in cp_2_pctg]

# Write the summary once, after all comparisons have been added.
out_sum_pval_file_df.to_csv(out_sum_pval_file, index=False)