In [2]:
## 分析确定两个亚基因组同源TAD
import pandas as pd
import numpy as np
# 1、确定TAD中所包含的同源基因
def TAD_contain_HG(df_TAD, df_HG, chrs=None):
    """Map every TAD to the homologous genes that overlap it.

    Parameters
    ----------
    df_TAD : pandas.DataFrame
        TAD intervals with the columns ``chrom``, ``start`` and ``end``.
    df_HG : pandas.DataFrame
        Gene positions with the columns ``C`` (chromosome), ``S`` (start),
        ``E`` (end) and ``GeneId``.
    chrs : iterable of str, optional
        Chromosomes to process. When omitted, the four chromosomes that were
        previously hard-coded here are used (``Ghir_A01``, ``Ghir_A02``,
        ``Ghir_D01``, ``Ghir_D02``). Pass the full list (e.g. ``Ghir_A01`` ..
        ``Ghir_A13`` plus ``Ghir_D01`` .. ``Ghir_D13``) for a whole-genome run;
        TADs on chromosomes that are not listed are skipped.

    Returns
    -------
    dict
        ``{"<chrom>-<start>-<end>": [GeneId, ...]}``. A TAD without any
        overlapping gene maps to an empty list. If the same interval occurs
        twice, the later occurrence overwrites the earlier one.
    """
    if chrs is None:
        # Default kept identical to the previous hard-coded subset.
        chrs = ['Ghir_A01', 'Ghir_A02', 'Ghir_D01', 'Ghir_D02']

    data_dict = {}
    for c in chrs:
        df_TAD_chr = df_TAD[df_TAD['chrom'] == c]
        df_HG_chr = df_HG[df_HG['C'] == c]
        for chrom, start, end in zip(df_TAD_chr['chrom'], df_TAD_chr['start'], df_TAD_chr['end']):
            TAD_id = "{0}-{1}-{2}".format(chrom, start, end)
            # Closed-interval overlap: the gene starts before the TAD ends
            # and ends after the TAD starts. The gene table is already
            # restricted to this chromosome, so no extra chromosome test.
            overlaps = (df_HG_chr['S'] <= end) & (df_HG_chr['E'] >= start)
            data_dict[TAD_id] = df_HG_chr.loc[overlaps, 'GeneId'].tolist()
    return data_dict
# 2、将字典转换为dataframe
def dict_To_dataframe(data_dict):
    """Turn a mapping into a two-column DataFrame.

    Each ``key: value`` entry of ``data_dict`` becomes one row, with the key
    in column ``TADid`` and the value in column ``GeneId``. Rows follow the
    iteration order of the dictionary.
    """
    records = [(tad_id, genes) for tad_id, genes in data_dict.items()]
    return pd.DataFrame(records, columns=['TADid', 'GeneId'])
# 3. Look up the homologous gene pairs for the genes contained in one TAD
def find_HG(gene_list, tags, df_hg_pairs):
    """Collect the homologous-pair rows that match a set of gene ids.

    Parameters
    ----------
    gene_list : sequence
        Only the first element is used; it must be the list of gene ids.
    tags : str
        Name of the column in ``df_hg_pairs`` to match the gene ids against
        (the callers in this notebook pass ``'GeneId_At'`` or ``'GeneId_Dt'``).
    df_hg_pairs : pandas.DataFrame
        Table of homologous gene pairs containing the column ``tags``.

    Returns
    -------
    list or None
        ``None`` when no gene id matches. Otherwise every value of every
        matching row, with the rows sorted and then flattened row by row
        into a single list (the ``tags`` column comes first in each row).
    """
    query_genes = pd.DataFrame({tags: gene_list[0]})
    matched = query_genes.merge(df_hg_pairs, how='inner', on=[tags])
    if len(matched) == 0:
        return None
    # Sort the matching rows, then unroll the 2-D table into one flat list.
    ordered_rows = matched.values.tolist()
    ordered_rows.sort()
    return np.array(ordered_rows).ravel().tolist()


## 计算两个列表的相似性
def list_similary(list1, list2):
    """Return the Jaccard similarity of two collections.

    Both inputs are converted to sets; the result is
    ``len(intersection) / len(union)``, a float between 0 and 1.

    When both inputs are empty the union is empty as well. Instead of raising
    ``ZeroDivisionError`` the function returns 0.0, so two collections with no
    elements are never reported as similar.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

## 分析两个TAD 所包含基因的相似程度
def H_similary_TAD(df_At, df_Dt, chr_pairs=None):
    """Score every At-TAD / Dt-TAD combination on paired chromosomes.

    Parameters
    ----------
    df_At, df_Dt : pandas.DataFrame
        One row per TAD. Column ``TADid`` holds the TAD identifier; the first
        remaining column holds the gene list of that TAD.
    chr_pairs : iterable of (str, str), optional
        ``(At chromosome, Dt chromosome)`` pairs to compare. A TAD belongs to
        a chromosome when its ``TADid`` contains the chromosome name.
        Defaults to the two pairs that were previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Columns ``At_TAD_id``, ``At_TAD_HG``, ``Dt_TAD_id``, ``Dt_TAD_HG``
        and ``similary``: one row for every At/Dt TAD combination within a
        chromosome pair. The gene lists are stored as their ``str()``
        representation and ``similary`` is the value returned by
        ``list_similary``. Rows are numbered consecutively.
    """
    if chr_pairs is None:
        # NOTE(review): the default pairs Ghir_A01 with Ghir_D02 and Ghir_A02
        # with Ghir_D01, whereas the full chromosome lists that used to be
        # commented out here paired A01 with D01, A02 with D02, ... Kept
        # as-is to preserve behaviour; confirm the crossed pairing is intended.
        chr_pairs = [('Ghir_A01', 'Ghir_D02'), ('Ghir_A02', 'Ghir_D01')]

    columns = ['At_TAD_id', 'At_TAD_HG', 'Dt_TAD_id', 'Dt_TAD_HG', 'similary']
    # Collect plain records and build the frame once; appending rows to a
    # DataFrame inside the nested loop is quadratic.
    records = []
    for A_chr, D_chr in chr_pairs:
        at_mask = df_At['TADid'].str.contains(A_chr, regex=False)
        dt_mask = df_Dt['TADid'].str.contains(D_chr, regex=False)
        # Series indexed by TAD id whose values are the gene lists.
        at_tads = df_At[at_mask].set_index('TADid').iloc[:, 0]
        dt_tads = df_Dt[dt_mask].set_index('TADid').iloc[:, 0]
        # The Dt side is reused for every At TAD, so render it only once.
        dt_items = [(dt_id, dt_genes, str(dt_genes)) for dt_id, dt_genes in dt_tads.items()]
        for at_id, at_genes in at_tads.items():
            at_text = str(at_genes)
            for dt_id, dt_genes, dt_text in dt_items:
                records.append([at_id, at_text, dt_id, dt_text,
                                list_similary(at_genes, dt_genes)])
    return pd.DataFrame(records, columns=columns)

# 4、找到所有的TAD所包含的同源基因
def _homologs_per_tad(df_sub, tags, df_HG_pair):
    """Run ``find_HG`` for every TAD of ``df_sub`` and tabulate the results."""
    data_dict = {}
    for tad_id, row in df_sub.set_index('TADid').iterrows():
        # find_HG reads element 0 of its first argument, so hand it a plain
        # list. Indexing the row Series itself with the integer 0 relies on
        # a positional fallback that recent pandas versions deprecate.
        data_dict[tad_id] = find_HG(row.to_list(), tags, df_HG_pair)
    return dict_To_dataframe(data_dict)


def HTAD(df, df_HG_pair):
    """Find the homologous genes of every TAD and score At/Dt TAD pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per TAD with the columns ``TADid`` and ``GeneId`` (the list
        of genes inside the TAD). TADs whose id contains ``Ghir_A`` are
        treated as At TADs, those containing ``Ghir_D`` as Dt TADs.
    df_HG_pair : pandas.DataFrame
        Homologous gene pairs with the columns ``GeneId_At`` and
        ``GeneId_Dt``.

    Returns
    -------
    df_AtDt : pandas.DataFrame
        At TADs followed by Dt TADs, each with the value ``find_HG`` returned
        for it (``None`` when the TAD holds no paired gene).
    df_HGTAD : pandas.DataFrame
        Output of ``H_similary_TAD`` for the TADs that have at least one
        paired gene, restricted to rows with ``similary`` greater than 0.
    """
    is_At = df['TADid'].str.contains('Ghir_A', regex=False)
    is_Dt = df['TADid'].str.contains('Ghir_D', regex=False)

    df_At = _homologs_per_tad(df[is_At], 'GeneId_At', df_HG_pair)
    df_Dt = _homologs_per_tad(df[is_Dt], 'GeneId_Dt', df_HG_pair)
    df_AtDt = pd.concat([df_At, df_Dt], axis=0)

    # TADs without any paired gene (GeneId is None) cannot be compared.
    df_HGTAD = H_similary_TAD(df_At.dropna(how='any'), df_Dt.dropna(how='any'))
    df_HGTAD = df_HGTAD[df_HGTAD['similary'] > 0].copy()
    return df_AtDt, df_HGTAD

#, 'ovule', 'root', 'radicle', 'leaf', 'hypocotyl', 'cotyledon', 'stem', 'fiber_10DPA', 'fiber_20DPA', 'petal', 'stigma'
# The gene-position and gene-pair tables do not depend on the tissue, so they
# are read once here instead of once per loop iteration.
df_HG = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/new_RNA_Seq/homo_gene2/all_homo_gene_position.txt", sep="\t")
df_HG.columns = ['C', 'S', 'E', 'GeneId']
df_HG_pair = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/new_RNA_Seq/homo_gene2/Ghir_homo_gene_pairs.txt", sep="\t")

for t in ['anther','ovule', 'root', 'radicle', 'leaf', 'hypocotyl', 'cotyledon', 'stem', 'fiber_10DPA', 'fiber_20DPA', 'petal', 'stigma']:
    df_TAD = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/TAD_data/{}_choosed.bed".format(t), sep="\t")
    # Drop the 'order' column; the remaining three columns are renamed to
    # the names TAD_contain_HG expects.
    df_TAD = df_TAD.drop('order', axis=1)
    df_TAD.columns = ['chrom', 'start', 'end']
    TADid_Geneid_dict = TAD_contain_HG(df_TAD, df_HG)  # homologous genes inside every TAD
    df_TAD_gene = dict_To_dataframe(TADid_Geneid_dict)  # TAD -> genes mapping as a DataFrame
    df_AtDt, df_HGTAD = HTAD(df_TAD_gene, df_HG_pair)  # per-TAD homologs and At/Dt TAD similarities
    df_AtDt.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/homo_TAD/{}_TAD_HG2.csv".format(t), index=False)
    df_HGTAD.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/homo_TAD/{}_TAD_HG_similary2.csv".format(t), index=False)


In [105]:
# 分析确定同源TAD
# For every tissue keep the TAD pairs whose similarity is at least 0.8, print
# how many remain and write them to a separate CSV file.
# NOTE(review): this reads "*_TAD_HG_similary.csv", while the pipeline cell in
# this notebook writes "*_TAD_HG_similary2.csv" - confirm the intended input.
tissues = ['anther','ovule', 'root', 'radicle', 'leaf', 'hypocotyl', 'cotyledon', 'stem', 'fiber_10DPA', 'fiber_20DPA', 'petal', 'stigma']
for tissue in tissues:
    src_csv = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/homo_TAD/{}_TAD_HG_similary.csv".format(tissue)
    dst_csv = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/homo_TAD/{}_TAD_HG_mt_08.csv".format(tissue)
    df_HTAD = pd.read_csv(src_csv).query("similary>=0.8").copy()
    print(len(df_HTAD))
    df_HTAD.to_csv(dst_csv, index=False)

467
445
477
547
449
454
436
432
477
478
461
446


In [27]:
# 找同源分裂TAD结构
import pandas as pd
df = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/homo_TAD/hypocotyl_TAD_HG_similary.csv")
# Perfectly matching TAD pairs (similarity exactly 1) are not of interest here.
df = df.query("similary!=1").copy()

## Select the At TADs whose similarities to their Dt TADs add up to 1; these
## are taken as one At TAD that is split into several Dt TADs.
df_sta = df.groupby(by=['At_TAD_id'], as_index=False)['similary'].sum()
# The sum is a floating point value, so compare it with a tolerance instead
# of `== 1`; an exact comparison can miss sums such as 0.9999999999999999.
SUM_TOLERANCE = 1e-9
df_one = df_sta.loc[(df_sta['similary'] - 1).abs() < SUM_TOLERANCE, ['At_TAD_id']].copy()

## Homologous bias genes of hypocotyl
df_hypocotyl_bias = pd.read_csv("G:/Billfish/J668_multip_tissue_3D-genome/new_RNA_Seq/homo_gene2/hypocotyl_bias_gene_position.bed", sep="\t")
df_hypocotyl_bias_gene = df_hypocotyl_bias[['GeneId']].copy()

## All (At TAD, Dt TAD) rows that belong to the selected At TADs
df_TADid = pd.merge(df_one, df, how='inner', on=['At_TAD_id'])[['At_TAD_id', 'Dt_TAD_id', 'At_TAD_HG']].copy()

## Report, for every row, how many genes of the At TAD are bias genes
for at_tad_id, gene_text in zip(df_TADid['At_TAD_id'], df_TADid['At_TAD_HG']):
    # At_TAD_HG is the text form of a Python list ("['g1', 'g2']");
    # strip the brackets and quotes to recover the gene ids.
    my_list = [item.strip() for item in gene_text.strip('[]').replace("'", "").split(",")]
    df_tmp_gene = pd.DataFrame({'GeneId': my_list})
    df_overlap = pd.merge(df_tmp_gene, df_hypocotyl_bias_gene, how='inner', on=['GeneId'])
    if df_overlap.shape[0] > 0:
        print(at_tad_id, df_overlap.shape[0])

## Write the positions of the selected TADs; an At TAD and the Dt TADs
## paired with it share the same group number.
df_a = df_TADid[['At_TAD_id']].drop_duplicates(keep='first').copy()
df_a['number'] = range(df_a.shape[0])
df_b = pd.merge(df_TADid, df_a, how='inner', on=['At_TAD_id'])
df_a = df_b[['At_TAD_id', 'number']].rename(columns={'At_TAD_id': 'TAD_id'})
df_b = df_b[['Dt_TAD_id', 'number']].rename(columns={'Dt_TAD_id': 'TAD_id'})
# ignore_index gives the stacked table a unique index, so the column
# assignment below does not have to align on duplicated labels.
df_c = pd.concat([df_a, df_b], axis=0, ignore_index=True)
# TAD ids have the form "<chrom>-<start>-<end>".
df_c[['chrom', 'start', 'end']] = df_c['TAD_id'].str.split("-", expand=True)
df_c = df_c.drop_duplicates(keep='first')
df_c.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/TAD_new/homo_TAD/hypocotyl_example/hypocotyl_TAD_position.bed", sep="\t", index=False, header=False, columns = ['chrom', 'start', 'end', 'number'])

Ghir_A01-74760000-75560000 2
Ghir_A01-74760000-75560000 2
Ghir_A02-1780000-1940000 4
Ghir_A02-1780000-1940000 4
Ghir_A04-61060000-61480000 2
Ghir_A04-61060000-61480000 2
Ghir_A06-109860000-110040000 2
Ghir_A06-109860000-110040000 2
Ghir_A06-122220000-122560000 12
Ghir_A06-122220000-122560000 12
Ghir_A06-20020000-20260000 2
Ghir_A06-20020000-20260000 2
Ghir_A06-23380000-23580000 4
Ghir_A06-23380000-23580000 4
Ghir_A06-23380000-23580000 4
Ghir_A06-9360000-9620000 6
Ghir_A06-9360000-9620000 6
Ghir_A07-89280000-89660000 4
Ghir_A07-89280000-89660000 4
Ghir_A08-17380000-17680000 4
Ghir_A08-17380000-17680000 4
Ghir_A08-41040000-41600000 2
Ghir_A08-41040000-41600000 2
Ghir_A08-83000000-83360000 8
Ghir_A08-83000000-83360000 8
Ghir_A10-47640000-48940000 4
Ghir_A10-47640000-48940000 4
Ghir_A10-81160000-81900000 2
Ghir_A10-81160000-81900000 2
Ghir_A11-52860000-53480000 2
Ghir_A11-52860000-53480000 2
Ghir_A11-60880000-61140000 2
Ghir_A11-60880000-61140000 2
Ghir_A12-53400000-53800000 2
Ghir_A12-534

In [114]:
# Quick sanity check of the Jaccard similarity on small hand-made lists.
list1 = [1]
list2 = [2, 3, 4]
list3 = [1,2,3,4]
# NOTE(review): this cell redefines list_similary, which is also defined in
# the pipeline cell of this notebook; whichever cell runs last wins. Keep the
# two definitions in sync or drop this copy.
def list_similary(list1, list2):
    """Return the Jaccard similarity ``len(intersection) / len(union)``.

    Both inputs are converted to sets first. When both are empty the union
    is empty too; 0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)
a = list_similary(list1, list3)
b = list_similary(list2, list3)
# Pin the expected values so a regression fails loudly on re-run.
assert (a, b) == (0.25, 0.75)
print(a, b)

0.25 0.75
