In [13]:
## 分析染色体上各个TAD的大小
import pandas as pd
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
# Read the selected-TAD table of every tissue and stack them with a single
# concat (instead of re-copying a growing frame on every iteration).
tad_frames = []
for t in tissues:
    input_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/{}_choosed.bed".format(t)
    df = pd.read_csv(input_f, sep="\t")
    tad_frames.append(df)
df_all_TAD = pd.concat(tad_frames, axis=0)
# TAD length = end coordinate - start coordinate
df_all_TAD['length'] = df_all_TAD['e'] - df_all_TAD['s']
# Keep the first row seen for every (chromosome, length) combination
df_TAD_size = df_all_TAD.drop_duplicates(keep='first', subset=['chr', 'length'])
print(df_TAD_size.head())
# NOTE(review): the file name says "J669" while the directory and one later
# reader use "J668" - confirm which spelling is intended before renaming.
df_TAD_size.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/J669_TAD_dropduplicated.bed", sep="\t", index=False)

        chr         s         e  order  length
0  Ghir_A01   7100000   7240000      2  140000
1  Ghir_A01   7240000   7420000      2  180000
2  Ghir_A01   8120000   8400000      2  280000
3  Ghir_A01   8400000   8620000      2  220000
6  Ghir_A01  14940000  15240000      2  300000


In [1]:
## 计算染色上不同大小TAD滑动窗口下所包含的loop数量分布

## 函数，确定区间内的loops数目
## loop_DF: loop 文件
## C: 染色体名称
## S: 起始位置
## E: 终止位置
def loop_number(loop_DF, C, S, E):
    """Count the loops of ``loop_DF`` that lie completely inside one interval.

    Parameters
    ----------
    loop_DF : pandas.DataFrame
        Loop table; the columns ``chrom1``, ``start1`` and ``end2`` are read.
    C : chromosome name compared with ``chrom1``.
    S : interval start; a loop qualifies when ``start1 >= S``.
    E : interval end; a loop qualifies when ``end2 <= E``.

    Returns
    -------
    int
        Number of rows meeting all three conditions.
    """
    on_chromosome = loop_DF['chrom1'] == C
    starts_inside = loop_DF['start1'] >= S
    ends_inside = loop_DF['end2'] <= E
    return int((on_chromosome & starts_inside & ends_inside).sum())

## 函数，滑动计算TAD大小区间所包含的loops数目
#! ChrN: TAD所属染色体，Chr_Size: 染色体大小， TAD_Size: TAD大小， slid_bin: 滑动Bin的大小， loop_Df: loop 文件
def TAD_slid(ChrN, Chr_Size, TAD_Size, slid_bin, loop_Df):
    """Slide a TAD-sized window along one chromosome and count loops per window.

    Windows start at 0, ``slid_bin``, 2*``slid_bin``, ... while the start is
    below ``Chr_Size - TAD_Size``; each window spans ``[start, start + TAD_Size]``.
    A loop is counted when ``start1 >= start`` and ``end2 <= start + TAD_Size``.

    Parameters
    ----------
    ChrN : chromosome name; only rows with ``chrom1 == ChrN`` are considered.
    Chr_Size : chromosome length used as the sliding limit.
    TAD_Size : window width (converted with ``int``).
    slid_bin : step between consecutive window starts.
    loop_Df : pandas.DataFrame with columns ``chrom1``, ``start1``, ``end2``.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``"<ChrN>-<TAD_Size>"`` with one column per window.
        The label previously used ``Chr_Size``, which gave every TAD size on
        a chromosome the same id; it now identifies the window width and uses
        the ``chr-length`` format that the downstream ``TAD_id`` columns use.
    """
    tad_size = int(TAD_Size)
    TAD_id = "{0}-{1}".format(ChrN, tad_size)
    df_loop_chr = loop_Df[loop_Df['chrom1'] == ChrN]
    # Pull the coordinates out once; every window then needs only two array
    # comparisons instead of a fresh DataFrame.query call.
    loop_starts = df_loop_chr['start1'].to_numpy()
    loop_ends = df_loop_chr['end2'].to_numpy()
    lN_list = []
    for window_start in range(0, Chr_Size - tad_size, slid_bin):
        window_end = window_start + tad_size
        inside = (loop_starts >= window_start) & (loop_ends <= window_end)
        lN_list.append(int(inside.sum()))
    return pd.DataFrame({TAD_id: lN_list}).T

## 批量计算不同大小TAD 所包含的loops数目
#! TAD_df: 不同大小TAD的dataframe， slid_bin：滑框大小， loop文件
def TAD_LN(TAD_df, slid_bin, loop_df, output_f):
    """Run ``TAD_slid`` for every row of ``TAD_df`` and write the stacked result.

    Parameters
    ----------
    TAD_df : pandas.DataFrame
        One row per TAD size; the columns ``chr`` and ``length`` are read.
    slid_bin : step of the sliding window, forwarded to ``TAD_slid``.
    loop_df : loop table forwarded to ``TAD_slid``.
    output_f : path of the tab-separated file to write (index included).

    Raises
    ------
    KeyError
        If a ``chr`` value is not one of the 26 chromosomes listed below.
    """
    print(slid_bin)
    # The chromosome list previously contained 'Ghir_A07' twice, so zip()
    # paired every chromosome from A07 onward with its neighbour's size and
    # dropped 'Ghir_D13'. The first eight sizes were also truncated; the
    # values below are the full list that was kept in a commented-out line.
    chrom_name = ['Ghir_A01', 'Ghir_A02', 'Ghir_A03', 'Ghir_A04', 'Ghir_A05', 'Ghir_A06', 'Ghir_A07', 'Ghir_A08', 'Ghir_A09', 'Ghir_A10', 'Ghir_A11', 'Ghir_A12', 'Ghir_A13', 'Ghir_D01', 'Ghir_D02', 'Ghir_D03', 'Ghir_D04', 'Ghir_D05', 'Ghir_D06', 'Ghir_D07', 'Ghir_D08', 'Ghir_D09', 'Ghir_D10', 'Ghir_D11', 'Ghir_D12', 'Ghir_D13']
    chrom_size = [117757855, 108092100, 113059412, 85149810, 109419610, 124056332, 97782242, 122376713, 82102766, 114851205, 123212122, 107672493, 108376418, 63209094, 69838496, 52700144, 56430500, 62933208, 66869448, 59257364, 69039221, 52819285, 68005838, 72942580, 62694377, 63343036]
    Chr_dic = dict(zip(chrom_name, chrom_size))
    per_tad = []
    for index, row in TAD_df.iterrows():
        chrN, TAD_size = row['chr'], row['length']
        print(chrN, TAD_size)
        per_tad.append(TAD_slid(chrN, Chr_dic[chrN], TAD_size, slid_bin, loop_df))
    # pd.concat rejects an empty list; an empty TAD table still yields an
    # (empty) output file.
    df_result = pd.concat(per_tad, axis=0) if per_tad else pd.DataFrame()
    df_result.to_csv(output_f, sep="\t")

# 将并行结果合并为一个文件
def concat(output_f):
    """Merge the per-chromosome ``<chrom>_outputf.bed`` files into one table.

    The files are read from the current working directory in chromosome
    order (A01..A13, D01..D13), stacked row-wise and written tab-separated
    to ``output_f`` without the index.

    Fixes: the frames are now combined with ``pd.concat`` (the former
    ``pd.DataFrame([...], axis=0)`` call raises TypeError), and the chromosome
    list no longer repeats 'Ghir_A07' or omits 'Ghir_A13'.
    """
    chrom_name = ['Ghir_A01', 'Ghir_A02', 'Ghir_A03', 'Ghir_A04', 'Ghir_A05', 'Ghir_A06', 'Ghir_A07', 'Ghir_A08', 'Ghir_A09', 'Ghir_A10', 'Ghir_A11', 'Ghir_A12', 'Ghir_A13', 'Ghir_D01', 'Ghir_D02', 'Ghir_D03', 'Ghir_D04', 'Ghir_D05', 'Ghir_D06', 'Ghir_D07', 'Ghir_D08', 'Ghir_D09', 'Ghir_D10', 'Ghir_D11', 'Ghir_D12', 'Ghir_D13']
    frames = []
    for c in chrom_name:
        input_f = "{}_outputf.bed".format(c)
        frames.append(pd.read_csv(input_f, sep="\t"))
    df_result = pd.concat(frames, axis=0)
    df_result.to_csv(output_f, sep="\t", index=False)

## 计算不同大小TAD在每个组织中所包含的loop互作, 并行加速
import pandas as pd
import sys
import argparse
import multiprocessing as mp
def compute():
    """Count loops under sliding TAD-sized windows, one process per chromosome.

    The TAD table, the loop table and the step size are hard-coded below.
    For each chromosome the matching rows of both tables are handed to
    ``TAD_LN`` in a separate ``multiprocessing.Process`` that writes
    ``<chrom>_outputf.bed``; all processes are started first and then joined.

    NOTE(review): the worker target is defined in this notebook; confirm that
    child processes can import it on the platform this runs on.
    """
    TF = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/test.txt"
    LF = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/final_loops/anther_final_loop.bed"
    BS = 18000
    # Destination of the merged table; only used by the disabled concat step.
    OF = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/anther_TAD_loop_number.bed"
    df_TAD = pd.read_csv(TF, sep="\t")
    df_loop = pd.read_csv(LF, sep="\t")
    chrom_name = ['Ghir_A01', 'Ghir_A02', 'Ghir_A03', 'Ghir_A04', 'Ghir_A05', 'Ghir_A06', 'Ghir_A07', 'Ghir_A08', 'Ghir_A09', 'Ghir_A10', 'Ghir_A11', 'Ghir_A12','Ghir_A13', 'Ghir_D01', 'Ghir_D02', 'Ghir_D03', 'Ghir_D04', 'Ghir_D05', 'Ghir_D06', 'Ghir_D07', 'Ghir_D08', 'Ghir_D09', 'Ghir_D10', 'Ghir_D11', 'Ghir_D12', 'Ghir_D13']
    datas = []
    for chrom in chrom_name:
        print(chrom)
        worker = mp.Process(
            target=TAD_LN,
            args=(
                df_TAD[df_TAD['chr'] == chrom].copy(),
                BS,
                df_loop[df_loop['chrom1'] == chrom].copy(),
                "{}_outputf.bed".format(chrom),
            ),
        )
        datas.append(worker)
        worker.start()
    # Wait for every chromosome before returning
    for worker in datas:
        worker.join()
    #concat(OF)
if __name__ == "__main__":
    compute()
# tissues = ['anther']
# for i in tissues:
#     input_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/final_loops/{}_final_loop.bed".format(i)
#     df_loop = pd.read_csv(input_f, sep="\t")
#     df_TAD_size = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/test.txt", sep="\t")
#     print(df_TAD_size)
#     df_TAD_LN = TAD_LN(df_TAD_size, 18000, df_loop, 'result.txt')
# if __name__ == "__main__":
#     main(sys.argv[1:])
    

Ghir_A01


In [None]:
import pandas as pd
#df_TAD_loop.rename(columns={'Unnamed: 0':'chr'}, inplace=True)
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
# The TAD id column is identical for every tissue, so it is built once here
# rather than re-reading the same file on every loop iteration.
df_TAD = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/J669_TAD_dropduplicated.bed", sep="\t")
df_TAD['TAD_id'] = df_TAD['chr'].map(str) + "-" + df_TAD['length'].map(str)
df_TAD = df_TAD[['TAD_id']]
for t in tissues:
    df_TAD_loop = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/{}_TAD_loop_number.bed".format(t), sep="\t")
    # Ids are attached by index label (axis=1 concat), replacing the
    # unnamed first column.
    # NOTE(review): assumes both tables list the TADs in the same order - confirm.
    df_result = pd.concat([df_TAD, df_TAD_loop], axis=1).drop(columns="Unnamed: 0")
    #print(df_result.head())
    df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/{}_TAD_loop_number2.bed".format(t), sep="\t", index=False)


In [30]:
# One entry per chromosome: the previous list named 'Ghir_A07' twice (its row
# was exported twice) and left out 'Ghir_A13'.
chrom_name = ['Ghir_A01', 'Ghir_A02', 'Ghir_A03', 'Ghir_A04', 'Ghir_A05', 'Ghir_A06', 'Ghir_A07', 'Ghir_A08', 'Ghir_A09', 'Ghir_A10', 'Ghir_A11', 'Ghir_A12', 'Ghir_A13', 'Ghir_D01', 'Ghir_D02', 'Ghir_D03', 'Ghir_D04', 'Ghir_D05', 'Ghir_D06', 'Ghir_D07', 'Ghir_D08', 'Ghir_D09', 'Ghir_D10', 'Ghir_D11', 'Ghir_D12', 'Ghir_D13']
# NOTE(review): the de-duplicated table is written elsewhere in this notebook
# under a "J669_" name - confirm this "J668_" file is the same table.
df = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/J668_TAD_dropduplicated.bed", sep="\t")
# Keep the first row of every chromosome
first_rows = []
for c in chrom_name:
    df_c = df.query("chr.str.contains(@c)", engine='python')
    first_rows.append(df_c.head(1))
df_result = pd.concat(first_rows, axis=0)
df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/J668_TAD_dropduplicated_txt.bed", sep="\t", index=False)

In [None]:
## 分析每个TAD中所包含的loops 数目
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
chrom_name = ['Ghir_A01', 'Ghir_A02', 'Ghir_A03', 'Ghir_A04', 'Ghir_A05', 'Ghir_A06', 'Ghir_A07', 'Ghir_A08', 'Ghir_A09', 'Ghir_A10', 'Ghir_A11', 'Ghir_A12', 'Ghir_A13', 'Ghir_D01', 'Ghir_D02', 'Ghir_D03', 'Ghir_D04', 'Ghir_D05', 'Ghir_D06', 'Ghir_D07', 'Ghir_D08', 'Ghir_D09', 'Ghir_D10', 'Ghir_D11', 'Ghir_D12', 'Ghir_D13']
def TAD_loop(C, S, E, loop_df):
    """Return how many loops of ``loop_df`` fall inside the TAD ``C:S-E``.

    A row counts when ``chrom1 == C``, ``start1 >= S`` and ``end2 <= E``.
    ``loop_df`` is only read, never modified.
    """
    same_chrom = loop_df['chrom1'] == C
    within = (loop_df['start1'] >= S) & (loop_df['end2'] <= E)
    return int((same_chrom & within).sum())
# Build one table holding, for the TADs of every tissue `t`, the number of
# loops of every tissue `j` that fall inside each TAD.
df_result = pd.DataFrame()
for t in tissues:
    # TAD coordinates (chr, s, e) of tissue t
    TAD_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/{0}_choosed.bed".format(t)
    df_TAD = pd.read_csv(TAD_f, sep="\t")
    df_TAD = df_TAD[['chr', 's', 'e']].copy()
    df_TAD_tmp = pd.DataFrame()
    for j in tissues:
        print(j)
        df_loop_tmp = pd.DataFrame()
        # Loop table of tissue j
        loop_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/final_loops/{}_final_loop.bed".format(j)
        df_loop = pd.read_csv(loop_f, sep="\t")
        for c in chrom_name:
            # Restrict both tables to one chromosome before counting
            df_loop_chr = df_loop.query("chrom1 == @c").copy()
            df_TAD_chr = df_TAD.query("chr == @c").copy()
            for index, row in df_TAD_chr.iterrows():
                C, S, E = row['chr'], row['s'], row['e']
                LN = TAD_loop(C, S, E, df_loop_chr)
                # The count is stored in a column named after the loop tissue
                df_TAD_chr.loc[index, j] = LN
            df_loop_tmp = pd.concat([df_loop_tmp, df_TAD_chr], axis=0)
        print(df_loop_tmp.head())
        # First loop tissue seeds the table; later ones are joined on the TAD
        # coordinates. NOTE(review): the emptiness test also fires if an
        # earlier merge produced zero rows, restarting the accumulation.
        if df_TAD_tmp.shape[0] == 0:
            df_TAD_tmp = df_loop_tmp.copy()
        else:
            df_TAD_tmp = pd.merge(df_TAD_tmp, df_loop_tmp, how='inner', on=['chr', 's', 'e'])
        print(df_TAD_tmp.head())
    # Record which tissue the TADs came from
    df_TAD_tmp['tissues'] = [t]*df_TAD_tmp.shape[0]
    df_result = pd.concat([df_result, df_TAD_tmp], axis=0, ignore_index=True)
df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_loop_number.csv", index=False)

In [None]:
## 分析每个TAD中所包含的loops 数目 (与基因相关的loops)
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
chrom_name = ['Ghir_A01', 'Ghir_A02', 'Ghir_A03', 'Ghir_A04', 'Ghir_A05', 'Ghir_A06', 'Ghir_A07', 'Ghir_A08', 'Ghir_A09', 'Ghir_A10', 'Ghir_A11', 'Ghir_A12', 'Ghir_A13', 'Ghir_D01', 'Ghir_D02', 'Ghir_D03', 'Ghir_D04', 'Ghir_D05', 'Ghir_D06', 'Ghir_D07', 'Ghir_D08', 'Ghir_D09', 'Ghir_D10', 'Ghir_D11', 'Ghir_D12', 'Ghir_D13']
def TAD_loop(C, S, E, loop_df):
    """Return how many loops of ``loop_df`` fall inside the TAD ``C:S-E``.

    A row counts when ``chrom1 == C``, ``start1 >= S`` and ``end2 <= E``.
    ``loop_df`` is only read, never modified.
    """
    same_chrom = loop_df['chrom1'] == C
    within = (loop_df['start1'] >= S) & (loop_df['end2'] <= E)
    return int((same_chrom & within).sum())
# Same counting as for all loops, but using the per-tissue "gene_loop" tables:
# for the TADs of every tissue `t`, count the loops of every tissue `j`
# that fall inside each TAD.
df_result = pd.DataFrame()
for t in tissues:
    # TAD coordinates (chr, s, e) of tissue t
    TAD_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/{0}_choosed.bed".format(t)
    df_TAD = pd.read_csv(TAD_f, sep="\t")
    df_TAD = df_TAD[['chr', 's', 'e']].copy()
    df_TAD_tmp = pd.DataFrame()
    for j in tissues:
        print(j)
        df_loop_tmp = pd.DataFrame()
        # Gene-associated loop table of tissue j
        loop_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/final_loops/{}_gene_loop.bed".format(j)
        df_loop = pd.read_csv(loop_f, sep="\t")
        for c in chrom_name:
            # Restrict both tables to one chromosome before counting
            df_loop_chr = df_loop.query("chrom1 == @c").copy()
            df_TAD_chr = df_TAD.query("chr == @c").copy()
            for index, row in df_TAD_chr.iterrows():
                C, S, E = row['chr'], row['s'], row['e']
                LN = TAD_loop(C, S, E, df_loop_chr)
                # The count is stored in a column named after the loop tissue
                df_TAD_chr.loc[index, j] = LN
            df_loop_tmp = pd.concat([df_loop_tmp, df_TAD_chr], axis=0)
        print(df_loop_tmp.head())
        # First loop tissue seeds the table; later ones are joined on the TAD
        # coordinates. NOTE(review): the emptiness test also fires if an
        # earlier merge produced zero rows, restarting the accumulation.
        if df_TAD_tmp.shape[0] == 0:
            df_TAD_tmp = df_loop_tmp.copy()
        else:
            df_TAD_tmp = pd.merge(df_TAD_tmp, df_loop_tmp, how='inner', on=['chr', 's', 'e'])
        print(df_TAD_tmp.head())
    # Record which tissue the TADs came from
    df_TAD_tmp['tissues'] = [t]*df_TAD_tmp.shape[0]
    df_result = pd.concat([df_result, df_TAD_tmp], axis=0, ignore_index=True)
df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_loop_number.csv", index=False)

In [62]:
### 分析各TAD所包含loops的比例
#1、宽数据转换为长数据
import pandas as pd
def TAD_loop_wideTolong():
    """Summarise the sliding-window loop counts of every tissue in long form.

    For each tissue the wide table (first column = TAD id, one further
    column per window) is melted, NaN cells are dropped, and for every
    (TAD id, loop count) pair the number of windows showing that count is
    divided by the total number of windows of that TAD id.

    The result has the columns ``TAD_id``, ``LN`` (loop count), ``number``
    (windows with that count), ``slid_number`` (all windows), ``ratio`` and
    ``tissues``; it is written to ``J668_slid_TAD_loop_number.csv``.
    The per-tissue tables are collected in a list and concatenated once.
    """
    tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
    per_tissue = []
    for t in tissues:
        input_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/{}_TAD_loop_number.bed".format(t)
        wide = pd.read_csv(input_f, sep="\t").rename(columns={'Unnamed: 0': 'TADid'})
        # One row per (TAD id, window); each row contributes 1 to the tallies
        long = wide.melt(id_vars=['TADid']).dropna().assign(number=1)
        windows_per_count = long.groupby(by=['TADid', 'value'], as_index=False)['number'].sum()
        windows_per_tad = long.groupby(by=['TADid'], as_index=False)['number'].sum()
        df_result = pd.merge(windows_per_count, windows_per_tad, how='left', on=['TADid'])
        df_result.columns = ['TAD_id', 'LN', 'number', 'slid_number']
        df_result['ratio'] = df_result['number'] / df_result['slid_number']
        df_result['tissues'] = t
        per_tissue.append(df_result)
    df_return = pd.concat(per_tissue, axis=0)
    df_return.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_slid_TAD_loop_number.csv", index=False)
TAD_loop_wideTolong()

##2、分析每个真实TAD所包含的loop出现的概率
def TAD_loop_ratio():
    """Attach to every real TAD the sliding-window frequency of its loop count.

    Reads the per-TAD loop counts (``J668_TAD_loop_number.csv``) and the
    sliding-window summary (``J668_slid_TAD_loop_number.csv``). Each TAD gets
    the id ``"<chr>-<e - s>"``; for every tissue column ``t`` the ``ratio`` of
    the summary row with the same id, the same loop count and
    ``tissues == t`` is looked up (left join, so misses become NaN) and
    stored as ``<t>_ratio``.

    Returns
    -------
    pandas.DataFrame
        Keys ``chr, s, e, TAD_id, tissues`` plus ``<t>`` and ``<t>_ratio``
        for each of the twelve tissues.
    """
    tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
    key_cols = ['chr', 's', 'e', 'TAD_id', 'tissues']
    observed = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_loop_number.csv")
    background = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_slid_TAD_loop_number.csv")
    observed['length'] = observed['e'] - observed['s']
    observed['TAD_id'] = observed['chr'].map(str) + "-" + observed['length'].map(str)
    df_return = pd.DataFrame()
    for t in tissues:
        # Lookup table for this tissue: (TAD_id, loop count) -> ratio
        lookup = background.loc[background['tissues'] == t, ['TAD_id', 'LN', 'ratio']]
        lookup = lookup.rename(columns={'LN': t, 'ratio': "{}_ratio".format(t)})
        per_tissue = pd.merge(observed[['chr', 's', 'e', 'TAD_id', t, 'tissues']], lookup, how='left', on=['TAD_id', t])
        if df_return.shape[0] != 0:
            df_return = pd.merge(df_return, per_tissue, how='inner', on=key_cols)
        else:
            df_return = per_tissue.copy()
    return df_return

df_TAD_LNR = TAD_loop_ratio()
df_TAD_LNR.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_LNR.csv", index=False)

In [220]:
### 分析各TAD所包含loops的比例 （用所有与基因相关的loops）
#1、宽数据转换为长数据
import pandas as pd
def TAD_loop_wideTolong():
    """Summarise the sliding-window gene-loop counts of every tissue in long form.

    For each tissue the wide table (first column = TAD id, one further
    column per window) is melted, NaN cells are dropped, and for every
    (TAD id, loop count) pair the number of windows showing that count is
    divided by the total number of windows of that TAD id.

    The result has the columns ``TAD_id``, ``LN`` (loop count), ``number``
    (windows with that count), ``slid_number`` (all windows), ``ratio`` and
    ``tissues``; it is written to ``J668_slid_TAD_gene_loop_number.csv``.
    The per-tissue tables are collected in a list and concatenated once.
    """
    tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
    per_tissue = []
    for t in tissues:
        input_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/{}_TAD_gene_loop_number.bed".format(t)
        wide = pd.read_csv(input_f, sep="\t").rename(columns={'Unnamed: 0': 'TADid'})
        # One row per (TAD id, window); each row contributes 1 to the tallies
        long = wide.melt(id_vars=['TADid']).dropna().assign(number=1)
        windows_per_count = long.groupby(by=['TADid', 'value'], as_index=False)['number'].sum()
        windows_per_tad = long.groupby(by=['TADid'], as_index=False)['number'].sum()
        df_result = pd.merge(windows_per_count, windows_per_tad, how='left', on=['TADid'])
        df_result.columns = ['TAD_id', 'LN', 'number', 'slid_number']
        df_result['ratio'] = df_result['number'] / df_result['slid_number']
        df_result['tissues'] = t
        per_tissue.append(df_result)
    df_return = pd.concat(per_tissue, axis=0)
    df_return.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_slid_TAD_gene_loop_number.csv", index=False)
TAD_loop_wideTolong()

##2、分析每个真实TAD所包含的loop出现的概率
def TAD_loop_ratio():
    """Attach to every real TAD the sliding-window frequency of its gene-loop count.

    Reads the per-TAD gene-loop counts (``J668_TAD_gene_loop_number.csv``)
    and the sliding-window summary (``J668_slid_TAD_gene_loop_number.csv``).
    Each TAD gets the id ``"<chr>-<e - s>"``; for every tissue column ``t``
    the ``ratio`` of the summary row with the same id, the same loop count
    and ``tissues == t`` is looked up (left join, so misses become NaN) and
    stored as ``<t>_ratio``.

    Returns
    -------
    pandas.DataFrame
        Keys ``chr, s, e, TAD_id, tissues`` plus ``<t>`` and ``<t>_ratio``
        for each of the twelve tissues.
    """
    tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
    key_cols = ['chr', 's', 'e', 'TAD_id', 'tissues']
    observed = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_loop_number.csv")
    background = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_slid_TAD_gene_loop_number.csv")
    observed['length'] = observed['e'] - observed['s']
    observed['TAD_id'] = observed['chr'].map(str) + "-" + observed['length'].map(str)
    df_return = pd.DataFrame()
    for t in tissues:
        # Lookup table for this tissue: (TAD_id, loop count) -> ratio
        lookup = background.loc[background['tissues'] == t, ['TAD_id', 'LN', 'ratio']]
        lookup = lookup.rename(columns={'LN': t, 'ratio': "{}_ratio".format(t)})
        per_tissue = pd.merge(observed[['chr', 's', 'e', 'TAD_id', t, 'tissues']], lookup, how='left', on=['TAD_id', t])
        if df_return.shape[0] != 0:
            df_return = pd.merge(df_return, per_tissue, how='inner', on=key_cols)
        else:
            df_return = per_tissue.copy()
    return df_return
df_TAD_LNR = TAD_loop_ratio()
# The file is named "..._gene_LNR.csv" because that is the name the later
# left/right classification cell reads from this directory (the previous
# "J668_TAD_LNR.csv" was never read from TAD_gene_loop).
df_TAD_LNR.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_LNR.csv", index=False)

In [76]:
# 计算不同大小的TAD滑动在染色体上所包含的loops出现次数最多的loops数目
import pandas as pd
# 函数，得到出现次数最多的loop number 的值
def max_number_LN(df):
    """Return the ``LN`` value of the row with the largest ``number``.

    The index of ``df`` is reset in place (old index discarded) before the
    lookup; when several rows share the maximum, the first one is used.
    """
    df.reset_index(drop=True, inplace=True)
    most_frequent_row = df.loc[df['number'].idxmax()]
    return most_frequent_row['LN']
# 函数，计算不同大小TAD在不同组织中出现最多次数的loop number
def TAD_max_LN():
    """Find, per TAD id and tissue, the loop count seen in the most windows.

    Reads ``J668_slid_TAD_loop_number.csv`` and returns one row per distinct
    (``TAD_id``, ``tissues``) pair with an added ``LN`` column holding the
    value chosen by ``max_number_LN`` for that pair.
    """
    df_STAD_LN = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_slid_TAD_loop_number.csv")
    df_TADid_tissue = df_STAD_LN[['TAD_id', 'tissues']].drop_duplicates(keep='first').copy()
    for pair in df_TADid_tissue.itertuples():
        matches = (df_STAD_LN['TAD_id'] == pair.TAD_id) & (df_STAD_LN['tissues'] == pair.tissues)
        # A copy is passed because max_number_LN resets the index in place
        df_TADid_tissue.loc[pair.Index, 'LN'] = max_number_LN(df_STAD_LN[matches].copy())
    return df_TADid_tissue
df_TADid_tissue = TAD_max_LN()
df_TADid_tissue.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_size_max_LN.csv", index=False)

In [221]:
# 计算不同大小的TAD滑动在染色体上所包含的loops出现次数最多的loops数目 (包含基因的loop)
import pandas as pd
# 函数，得到出现次数最多的loop number 的值
def max_number_LN(df):
    """Return the ``LN`` value of the row with the largest ``number``.

    The index of ``df`` is reset in place (old index discarded) before the
    lookup; when several rows share the maximum, the first one is used.
    """
    df.reset_index(drop=True, inplace=True)
    most_frequent_row = df.loc[df['number'].idxmax()]
    return most_frequent_row['LN']
# 函数，计算不同大小TAD在不同组织中出现最多次数的loop number
def TAD_max_LN():
    """Find, per TAD id and tissue, the gene-loop count seen in the most windows.

    Reads ``J668_slid_TAD_gene_loop_number.csv`` and returns one row per
    distinct (``TAD_id``, ``tissues``) pair with an added ``LN`` column
    holding the value chosen by ``max_number_LN`` for that pair.
    """
    df_STAD_LN = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_slid_TAD_gene_loop_number.csv")
    df_TADid_tissue = df_STAD_LN[['TAD_id', 'tissues']].drop_duplicates(keep='first').copy()
    for pair in df_TADid_tissue.itertuples():
        matches = (df_STAD_LN['TAD_id'] == pair.TAD_id) & (df_STAD_LN['tissues'] == pair.tissues)
        # A copy is passed because max_number_LN resets the index in place
        df_TADid_tissue.loc[pair.Index, 'LN'] = max_number_LN(df_STAD_LN[matches].copy())
    return df_TADid_tissue
df_TADid_tissue = TAD_max_LN()
df_TADid_tissue.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_size_max_LN.csv", index=False)

In [None]:
## 
# 1、判断TAD所包含的loop数目属于正态分布的左边还是右边
# 2、判断TAD所包含loop数目概率是否更高
import pandas as pd
df_TADLNR = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_LNR.csv")
df_TADidT = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_size_max_LN.csv")
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
tad_keys = ['chr', 's', 'e', 'TAD_id', 'tissues']
# For the TADs of each tissue t, add for every tissue i the most frequent
# sliding-window loop count (<i>_max_LN) next to the observed count and ratio.
df_TAD = pd.DataFrame()
for t in tissues:
    tads_of_t = df_TADLNR[df_TADLNR['tissues'] == t]
    df_loop = pd.DataFrame()
    for i in tissues:
        choosed_columns = tad_keys + [i, '{}_ratio'.format(i)]
        df_TAD_chr = df_TADidT.loc[df_TADidT['tissues'] == i, ['TAD_id', 'LN']]
        df_tmp = pd.merge(tads_of_t[choosed_columns], df_TAD_chr, how='inner', on=['TAD_id'])
        df_tmp = df_tmp.rename(columns={'LN': '{}_max_LN'.format(i)})
        # An empty accumulator is (re)seeded with the current table
        if df_loop.shape[0] != 0:
            df_loop = pd.merge(df_loop, df_tmp, how='inner', on=tad_keys)
        else:
            df_loop = df_tmp.copy()
    df_TAD = pd.concat([df_TAD, df_loop], axis=0)
print(df_TAD.head())
## 分析TAD所包含的loop是属于正态分布左边还是右边
def calculate_TADloop_distribution(df, tags):
    """Label each row by where its loop count sits relative to ``<tags>_max_LN``.

    Adds the string column ``<tags>_tags`` to ``df`` in place and returns the
    same object: ``'-1'`` when ``<tags>_max_LN - <tags>`` is positive,
    ``'0'`` when it is zero and ``'1'`` otherwise. Any ``value`` column is
    removed, as that name is used for the intermediate difference.
    """
    def _side(delta):
        if delta > 0:
            return '-1'
        if delta == 0:
            return '0'
        return '1'

    difference = df['{}_max_LN'.format(tags)] - df[tags]
    df['{}_tags'.format(tags)] = difference.apply(_side)
    df.drop(columns='value', errors='ignore', inplace=True)
    return df
# Tag every tissue column separately, then join the per-tissue tables on the
# TAD key columns and export the combined table.
tad_key_columns = ['chr', 's', 'e', 'TAD_id', 'tissues']
df_result = pd.DataFrame()
for t in tissues:
    choose_columns = tad_key_columns + [t, "{}_ratio".format(t), "{}_max_LN".format(t)]
    df_TAD_T = df_TAD[choose_columns].copy()
    df_tmp = calculate_TADloop_distribution(df_TAD_T, t)
    if df_result.shape[0] != 0:
        df_result = pd.merge(df_result, df_tmp, how='inner', on=tad_key_columns)
    else:
        df_result = df_tmp.copy()
df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/J668_TAD_LNRT.csv", index=False)

In [223]:
# 1、判断TAD所包含的loop数目属于正态分布的左边还是右边 (包含基因的loops)
# 2、判断TAD所包含loop数目概率是否更高
import pandas as pd
df_TADLNR = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_LNR.csv")
df_TADidT = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_size_max_LN.csv")
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
tad_keys = ['chr', 's', 'e', 'TAD_id', 'tissues']
# For the TADs of each tissue t, add for every tissue i the most frequent
# sliding-window gene-loop count (<i>_max_LN) next to the observed count and ratio.
df_TAD = pd.DataFrame()
for t in tissues:
    tads_of_t = df_TADLNR[df_TADLNR['tissues'] == t]
    df_loop = pd.DataFrame()
    for i in tissues:
        choosed_columns = tad_keys + [i, '{}_ratio'.format(i)]
        df_TAD_chr = df_TADidT.loc[df_TADidT['tissues'] == i, ['TAD_id', 'LN']]
        df_tmp = pd.merge(tads_of_t[choosed_columns], df_TAD_chr, how='inner', on=['TAD_id'])
        df_tmp = df_tmp.rename(columns={'LN': '{}_max_LN'.format(i)})
        # An empty accumulator is (re)seeded with the current table
        if df_loop.shape[0] != 0:
            df_loop = pd.merge(df_loop, df_tmp, how='inner', on=tad_keys)
        else:
            df_loop = df_tmp.copy()
    df_TAD = pd.concat([df_TAD, df_loop], axis=0)
#print(df_TAD.head())
## 分析TAD所包含的loop是属于正态分布左边还是右边
def calculate_TADloop_distribution(df, tags):
    """Label each row by where its loop count sits relative to ``<tags>_max_LN``.

    Adds the string column ``<tags>_tags`` to ``df`` in place and returns the
    same object: ``'-1'`` when ``<tags>_max_LN - <tags>`` is positive,
    ``'0'`` when it is zero and ``'1'`` otherwise. Any ``value`` column is
    removed, as that name is used for the intermediate difference.
    """
    def _side(delta):
        if delta > 0:
            return '-1'
        if delta == 0:
            return '0'
        return '1'

    difference = df['{}_max_LN'.format(tags)] - df[tags]
    df['{}_tags'.format(tags)] = difference.apply(_side)
    df.drop(columns='value', errors='ignore', inplace=True)
    return df
# Tag every tissue column separately, then join the per-tissue tables on the
# TAD key columns and export the combined table.
tad_key_columns = ['chr', 's', 'e', 'TAD_id', 'tissues']
df_result = pd.DataFrame()
for t in tissues:
    choose_columns = tad_key_columns + [t, "{}_ratio".format(t), "{}_max_LN".format(t)]
    df_TAD_T = df_TAD[choose_columns].copy()
    df_tmp = calculate_TADloop_distribution(df_TAD_T, t)
    if df_result.shape[0] != 0:
        df_result = pd.merge(df_result, df_tmp, how='inner', on=tad_key_columns)
    else:
        df_result = df_tmp.copy()
df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_LNRT.csv", index=False)

In [225]:
##计算哪些TAD是表现出组织高度富集loops的TAD
import pandas as pd
def judgement_TAD(df, tag1, tag2):
    """Flag the TADs whose ``<tag1>_tags`` value exceeds ``<tag2>_tags``.

    A ``tags`` column (``<tag1>_tags - <tag2>_tags``) is added to ``df`` in
    place. Rows with a positive difference are marked ``'Yes'`` in
    ``<tag2>_judge``, all others ``'No'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr, s, e, TAD_id, tissues, <tag2>_judge``; the ``'Yes'``
        rows come first and each key combination appears once.
    """
    key_columns = ['chr', 's', 'e', 'TAD_id', 'tissues']
    judge_column = '{}_judge'.format(tag2)
    df['tags'] = df["{}_tags".format(tag1)] - df["{}_tags".format(tag2)]
    # Reference tissue further to the right than the compared tissue
    flagged = df[df['tags'] > 0].copy()
    flagged[judge_column] = ['Yes'] * flagged.shape[0]
    # Stacking the flagged rows first lets drop_duplicates keep their 'Yes';
    # the unflagged copies have no judge value and are filled with 'No'.
    stacked = pd.concat([flagged, df], axis=0).fillna("No")
    stacked = stacked.drop_duplicates(keep='first', subset=key_columns)
    return stacked[key_columns + [judge_column]]
df = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/J668_TAD_gene_LNRT.csv")
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
tad_keys = ['chr', 's', 'e', 'TAD_id', 'tissues']
# For the TADs of each tissue i, compare against every other tissue j and
# collect one <j>_judge column per comparison; one CSV is written per tissue i.
for i in tissues:
    df_tmp = pd.DataFrame()
    for j in tissues:
        if i == j:
            continue
        choose_columns = tad_keys + ['{}_ratio'.format(i), "{}_tags".format(i), "{}_ratio".format(j), "{}_tags".format(j)]
        df_choosed = df.loc[df['tissues'] == i, choose_columns].copy()
        df_result = judgement_TAD(df_choosed, i, j)
        if df_tmp.shape[0] != 0:
            df_tmp = pd.merge(df_tmp, df_result, how='inner', on=tad_keys)
        else:
            df_tmp = df_result.copy()
    output_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/final_result/{}_TAD_gene_tags.csv".format(i)
    df_tmp.to_csv(output_f, index=False)

In [195]:
###确定合适的组织高互作TAD
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
for t in tissues:
    input_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_loop/final_result/{}_TAD_tags.csv".format(t)
    # Yes/No flags become 1/0 so they can be summed per TAD
    df = pd.read_csv(input_f).replace('Yes', 1).replace("No", 0)
    # Every column from position 5 onward is summed
    # NOTE(review): presumably these are the per-tissue judge columns - confirm.
    df['number'] = df.iloc[:, 5:].sum(axis=1)
    df_tmp = df[['chr', 's', 'e', 'number']].copy()
    df_TSG_TAD = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TSG_TAD/TSG_TAD_region/{}_TAD_TSG_region.bed".format(t), sep="\t", names=['chr', 's', 'e'])
    # Right join: one row per TSG TAD region, with its flag count when known
    df_result = pd.merge(df_tmp, df_TSG_TAD, how='right', on=['chr', 's', 'e'])
    n_above_threshold = int((df_result['number'] > 7).sum())
    print(n_above_threshold, df_TSG_TAD.shape[0], t)

82 307 anther
75 154 cotyledon
72 154 leaf
518 776 ovule
13 49 fiber_10DPA
25 52 fiber_20DPA
238 334 radicle
162 327 root
43 127 stem
24 91 stigma
47 165 hypocotyl
17 53 petal


In [None]:
## 分析组织特异高loop富集TAD 区间所包含的基因
tissues = ['anther', 'cotyledon', 'leaf', 'ovule', 'fiber_10DPA', 'fiber_20DPA', 'radicle', 'root', 'stem', 'stigma', 'hypocotyl', 'petal']
df_gene = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/new_RNA_Seq/RNA_Seq_data/J668_all_Tissue_mean_TPM_long.csv")
for t in tissues:
    input_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/final_result/{}_TAD_gene_tags.csv".format(t)
    df_TAD_loop = pd.read_csv(input_f).replace('Yes', 1).replace("No", 0)
    # Keep the TADs flagged in every column from position 5 onward
    flagged_everywhere = (df_TAD_loop.iloc[:, 5:] == 1).all(axis=1)
    df_need_TAD = df_TAD_loop.loc[flagged_everywhere, :]
    print(df_need_TAD.shape[0], t)
    df_need_TAD.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/final_result/TAD_high_loop_gene/{}_TAD_gene_high_loop.csv".format(t), index=False)

    # Build the "chr-s-e" key used to look the TADs up in the TAD/gene table
    df_need_TAD = df_need_TAD[['chr', 's', 'e']].copy()
    df_need_TAD['tags'] = df_need_TAD['chr'].map(str) + "-" + df_need_TAD['s'].map(str) + "-" + df_need_TAD['e'].map(str)
    print(df_need_TAD.head())
    TAD_gene_f = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TSG_TAD/TAD_gene/{}_TAD_gene.csv".format(t)
    df_TAD_gene = pd.read_csv(TAD_gene_f)
    df_need_TAD_gene = pd.merge(df_need_TAD[['tags']], df_TAD_gene, how='inner', on=['tags'])[['GeneId']].copy()
    # Expression rows of the genes located in the selected TADs
    df_result = pd.merge(df_gene, df_need_TAD_gene, how='inner', on=['GeneId'])
    print(df_need_TAD.shape[0], t)
    df_result.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/final_result/TAD_high_loop_gene/{}_TAD_gene_high_loop_gene.csv".format(t), index=False)

In [239]:
#计算这些基因可表达概率
import pandas as pd
def _count_by_expression(frame):
    """Add ``n`` (1 when TPM >= 0.5, else 0) and ``number`` (always 1) to
    ``frame`` in place and return the row count per (tissues, n)."""
    def _is_expressed(tpm):
        return 1 if tpm >= 0.5 else 0

    frame['n'] = frame['TPM'].apply(_is_expressed)
    frame['number'] = 1
    return frame.groupby(by=['tissues', 'n'], as_index=False)['number'].sum()


# Genes located in the selected cotyledon TADs
df = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/loops/TAD_gene_loop/final_result/TAD_high_loop_gene/cotyledon_TAD_gene_high_loop_gene.csv")
df_result = _count_by_expression(df)

# All genes, as the background
df_all = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/new_RNA_Seq/RNA_Seq_data/J668_all_Tissue_mean_TPM_long.csv")
df_result2 = _count_by_expression(df_all)

# number_x = selected genes, number_y = all genes, per tissue and class
df_result3 = pd.merge(df_result, df_result2, how='inner', on=['tissues', 'n'])
df_result3['ratio'] = df_result3['number_x'] / df_result3['number_y']
print(df_result3)


        tissues  n  number_x  number_y     ratio
0        anther  0        11     30023  0.000366
1        anther  1        15     40176  0.000373
2     cotyledon  0         9     26950  0.000334
3     cotyledon  1        17     43249  0.000393
4   fiber_10DPA  0        13     31468  0.000413
5   fiber_10DPA  1        13     38731  0.000336
6   fiber_20DPA  0        12     32749  0.000366
7   fiber_20DPA  1        14     37450  0.000374
8     hypocotyl  0         9     23354  0.000385
9     hypocotyl  1        17     46845  0.000363
10         leaf  0         9     24731  0.000364
11         leaf  1        17     45468  0.000374
12        ovule  0        10     25941  0.000385
13        ovule  1        16     44258  0.000362
14        petal  0        11     31145  0.000353
15        petal  1        15     39054  0.000384
16      radicle  0         8     22113  0.000362
17      radicle  1        18     48086  0.000374
18         root  0         8     22682  0.000353
19         root  1  