In [31]:
## De-duplicate the subcompartment states that the bedtools analysis assigned
## to tissue-specific expressed genes.
import pandas as pd

# NOTE(review): the table is read WITHOUT a header row (names=...) but written
# back to the same path WITH one (to_csv default), so running this cell a second
# time would ingest the header line as a data record -- confirm it is only ever
# run once against the raw bedtools output.
subcpt_bed = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/compartment/CALDER2/tissues/J668_tissues_special_gene_subcpt.bed"
bed_columns = ['chr', 'start', 'end', 'GeneId', 'special_tissues', 'tissues', 'TPM', 'subcpt']

df = pd.read_csv(subcpt_bed, sep="\t", names=bed_columns)
# Keep only the last record of every (GeneId, tissues) pair.
df = df.drop_duplicates(subset=['GeneId', 'tissues'], keep='last')
df.to_csv(subcpt_bed, sep="\t", index=False)

In [59]:
## Tissue-specific genes: statistics on which tissue holds the highest
## subcompartment (subcpt) level.
import re
import pandas as pd

# De-duplicated gene/subcompartment table; this read expects a header row.
subcpt_table = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/compartment/CALDER2/tissues/J668_tissues_special_gene_subcpt.bed"
df = pd.read_csv(subcpt_table, sep="\t")

# Tissue names; they double as the column names of the wide table built below.
# Must stay a list: later code calls .copy()/.remove() on it and uses it as a
# column selector.
tissues = [
    'anthers',
    'cotyledon',
    'ovules',
    'fiber_5DPA',
    'fiber_10DPA',
    'fiber_20DPA',
    'hypocotyl',
    'leaf',
    'petals',
    'radicle',
    'root',
    'stem',
    'stigma',
]
## Convert the long-format table into a wide-format table
def longTowide(df):
    df_return = pd.DataFrame()
    for t in tissues:
        df_special = df.query("special_tissues==@t")
        df_special_tissues = pd.DataFrame()
        for j in tissues:
            df_tmp = df_special.query("tissues==@j")[['GeneId', 'subcpt']].copy()
            df_tmp.columns = ['GeneId', j]
            if df_special_tissues.shape[0] == 0:
                df_special_tissues = df_tmp
            else:
                df_special_tissues = pd.merge(df_special_tissues, df_tmp, how='outer', on=['GeneId'])
        df_special_tissues['tags'] = [t]*df_special_tissues.shape[0]
        df_return = pd.concat([df_return, df_special_tissues], axis=0)
    df_return.fillna(0, inplace=True)
    return df_return
#! Select the genes that lie in stable vs. dynamic subcpt regions
def stable_dynamic(df, tissue_names=None):
    """Split the wide table into stable and dynamic genes.

    A row is *stable* when every tissue column holds the same one of the eight
    labels A1-A4 / B1-B4; every other row is *dynamic* (including rows that
    contain the ``0`` placeholder).

    Args:
        df: Wide table with one column per tissue.
        tissue_names: Tissue columns to compare.  Defaults to the module-level
            ``tissues`` list.

    Returns:
        ``(df_stable, df_dynamic)``.  ``df_stable`` is grouped by label in the
        order A1, A2, A3, A4, B1, B2, B3, B4; ``df_dynamic`` keeps the row
        order of ``df`` and is an independent copy.
    """
    names = tissues if tissue_names is None else list(tissue_names)
    labels = ['A1', 'A2', 'A3', 'A4', 'B1', 'B2', 'B3', 'B4']
    calls = df[names]
    # One boolean mask per label: True where all tissues carry that label.
    masks = [(calls == label).all(axis=1) for label in labels]
    df_stable = pd.concat([df.loc[mask, :] for mask in masks], axis=0)
    is_stable = masks[0]
    for mask in masks[1:]:
        is_stable = is_stable | mask
    # Select the complement directly.  An anti-join through
    # concat + drop_duplicates(keep=False) would also throw away dynamic rows
    # that happen to be exact duplicates of each other.
    df_dynamic = df.loc[~is_stable, :].copy()
    return df_stable, df_dynamic
#! Statistical analysis: genes at the highest activity level
def colmax_indexn(df, i, tissue_names=None):
    """Count, for every tissue column, the rows whose value equals ``i``.

    Args:
        df: Table with one column per tissue.
        i: Value to look for (e.g. a row maximum).
        tissue_names: Tissue columns to count over.  Defaults to the
            module-level ``tissues`` list, so the columns can no longer drift
            from that list.

    Returns:
        One-row DataFrame with one integer count per tissue column, in the
        order of the tissue list.
    """
    names = tissues if tissue_names is None else list(tissue_names)
    return pd.DataFrame({name: [int((df[name] == i).sum())] for name in names})

def sta_subcpt(df, tissue_names=None):
    """Compare each gene's subcpt score in its own tissue with the other tissues.

    The labels are first replaced by integer scores (A1=8, A2=7, A3=6, A4=5,
    B4=4, B3=3, B2=2, B1=1).  For every tissue ``t`` the rows tagged ``t`` are
    selected; their score in column ``t`` is reported next to the mean score
    over all remaining tissue columns.

    Side effects (kept deliberately, later cells may rely on them):
        * ``df`` is modified in place: labels are replaced by scores and a
          ``max_values`` column (row-wise maximum over the tissue columns) is
          added.
        * The list of "other" tissues is printed once per tissue.

    NOTE(review): the B labels are scored in reverse numeric order (B4=4 ...
    B1=1) while the A labels are scored A1=8 ... A4=5 -- confirm this matches
    the intended activity ranking.

    Args:
        df: Wide table with one column per tissue plus a ``tags`` column.
        tissue_names: Tissue columns to use.  Defaults to the module-level
            ``tissues`` list.

    Returns:
        DataFrame with the columns ``TSG_subcpt``, ``other_average_subcpt``
        and ``tags``, grouped in tissue order.
    """
    names = tissues if tissue_names is None else list(tissue_names)
    # Higher score = more active subcompartment.
    old_tag = ['A1', 'A2', 'A3', 'A4', 'B4', 'B3', 'B2', 'B1']
    new_tag = [8, 7, 6, 5, 4, 3, 2, 1]
    df.replace(old_tag, new_tag, inplace=True)
    df['max_values'] = df[names].max(axis=1)

    per_tissue = []
    for t in names:
        rows = df.query("tags==@t").copy()
        other_tissues = [name for name in names if name != t]
        print(other_tissues)
        rows['other_average_subcpt'] = rows[other_tissues].mean(axis=1)
        # Copy the two-column selection so that adding ``tags`` writes to an
        # independent frame rather than to a slice of ``rows``.
        summary = rows[[t, 'other_average_subcpt']].copy()
        summary['tags'] = [t] * summary.shape[0]
        summary.columns = ['TSG_subcpt', 'other_average_subcpt', 'tags']
        per_tissue.append(summary)
    if not per_tissue:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_tissue, axis=0)
# Pipeline: long -> wide table, split into stable / dynamic genes, then score
# the dynamic genes and export the result.
df_wide = longTowide(df)
# print(df_wide.shape[0])
df_stable, df_dynamic = stable_dynamic(df_wide)
# Optional export of the dynamic set (left disabled):
# df_dynamic.to_csv("G:/Billfish/J668_multip_tissue_3D-genome/HiC/compartment/CALDER2/tissues/J668_dynamic_cpt.bed", sep="\t", index=False)
df_statistics = sta_subcpt(df_dynamic)
tsg_values_bed = "G:/Billfish/J668_multip_tissue_3D-genome/HiC/compartment/CALDER2/tissues/J668_TSG_subcpt_values.bed"
df_statistics.to_csv(tsg_values_bed, sep="\t", index=False)

['cotyledon', 'ovules', 'fiber_5DPA', 'fiber_10DPA', 'fiber_20DPA', 'hypocotyl', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'ovules', 'fiber_5DPA', 'fiber_10DPA', 'fiber_20DPA', 'hypocotyl', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'cotyledon', 'fiber_5DPA', 'fiber_10DPA', 'fiber_20DPA', 'hypocotyl', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'cotyledon', 'ovules', 'fiber_10DPA', 'fiber_20DPA', 'hypocotyl', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'cotyledon', 'ovules', 'fiber_5DPA', 'fiber_20DPA', 'hypocotyl', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'cotyledon', 'ovules', 'fiber_5DPA', 'fiber_10DPA', 'hypocotyl', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'cotyledon', 'ovules', 'fiber_5DPA', 'fiber_10DPA', 'fiber_20DPA', 'leaf', 'petals', 'radicle', 'root', 'stem', 'stigma']
['anthers', 'cotyledon', 'ovules', 'fiber_5DPA', 'fiber_10DPA'