In [228]:
import argparse
import os
import re
import time

import pandas
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Version tag of this script; it is printed in the "[Version]" line of the run banner.
__version__ = "V2.0(Editor) 2023-07-14"

In [102]:
# Preset parameters - debug values for running the notebook interactively.
# These names are the same ones the command-line parsing cell assigns.
# gene_biotype_list: the gene_biotype values to include in the table and the plot.
gene_biotype_list = ["protein_coding", "non_protein_coding", "un_classified"]
# absolute_path: when False, file_path is prefixed to the three file names below.
absolute_path = False
# NOTE(review): machine-specific absolute path; it only resolves on the author's machine.
file_path = "F:/OneDrive/Master/Project/trans/data/"
input_filename = "0002_TSS_TES.tsv"
output_check_df_filename = "0003_check_df.tsv"
output_plot_filename = "0003_bar_plot_biotype_disease.png"

In [None]:
# Preset parameters - read from the command line.
# Every option is optional; the destination attribute is derived from the flag name.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--gene_biotype_list",
    nargs='*',
    default=["protein_coding", "non_protein_coding", "un_classified"],
    help='=["protein_coding", "non_protein_coding", "un_classified"]',
)
parser.add_argument(
    "--absolute_path",
    action="store_true",
    help="use absolute file path",
)
parser.add_argument(
    "--file_path",
    type=str,
    default="./",
    help="=./,\t the path of data directory",
)
parser.add_argument(
    "--input_filename",
    type=str,
    default="0002_TSS_TES.tsv",
    help="=\"0002_TSS_TES.tsv\",\t the output file name of 0002.py",
)
parser.add_argument(
    "--output_check_df_filename",
    type=str,
    default="0003_check_df.tsv",
    help="=0003_check_df.tsv,\t the output df for checking",
)
parser.add_argument(
    "--output_plot_filename",
    type=str,
    default="0003_bar_plot_biotype_disease.png",
    help="0003_bar_plot_biotype_disease.png,\t the output plot",
)

# Copy the parsed values into the module-level names used by the following cells.
args = parser.parse_args()
gene_biotype_list, absolute_path, file_path = (
    args.gene_biotype_list,
    args.absolute_path,
    args.file_path,
)
input_filename, output_check_df_filename, output_plot_filename = (
    args.input_filename,
    args.output_check_df_filename,
    args.output_plot_filename,
)

In [None]:
# Print the run banner and the parameters in use.
# __file__ is not defined when this code runs in a notebook / interactive session,
# so fall back to a placeholder instead of raising NameError.
# The "[Paraments]" label spelling is kept as-is because it is part of the emitted log text.
print('\n')
print("[Script]{}".format(globals().get("__file__", "<interactive>")))
print("[Version]{}".format(__version__))
# strftime without a time tuple formats the current local time.
print("[Date]{}".format(time.strftime("%Y-%m-%d %H:%M:%S")))
print("[Paraments]gene_biotype_list: {}".format(gene_biotype_list))
print("[Paraments]input_filename: {}".format(input_filename))
print("[Paraments]output_check_df_filename: {}".format(output_check_df_filename))
print("[Paraments]output_plot_filename: {}".format(output_plot_filename))
print('\n')

In [103]:
# Unless absolute paths were requested, resolve the three file names inside file_path.
# os.path.join inserts the path separator when file_path has no trailing one
# (plain concatenation turned "data" + "x.tsv" into "datax.tsv"); when file_path
# already ends with a separator the result is the same as before.
# NOTE: this rebinds the file-name variables, so the cell must run only once per session.
if not absolute_path:
    input_filename = os.path.join(file_path, input_filename)
    output_check_df_filename = os.path.join(file_path, output_check_df_filename)
    output_plot_filename = os.path.join(file_path, output_plot_filename)

In [471]:
def old_make_bar_plot(sample_df, gene_biotype, disease, tab_level=0):
    """
    Legacy plot builder: one stacked bar figure for a single (disease, gene_biotype) pair.

    input:
        sample_df, pandas.DataFrame, must provide the columns "gene_biotype", "disease",
                   "APA/ATSS" and "APA/TSS"
        gene_biotype, value compared against sample_df["gene_biotype"]
        disease, value compared against sample_df["disease"]
        tab_level, unused
    output:
        dict, {"check_df": the 4-row frame that was plotted,
               "<disease>_<gene_biotype>": the plotly figure}
    """
    plot_title = "{} {}".format(disease, gene_biotype)

    # Narrow the table to the requested gene_biotype, then to the requested disease
    by_biotype = sample_df.loc[sample_df["gene_biotype"] == gene_biotype, :].copy()
    by_disease = by_biotype.loc[by_biotype["disease"] == disease, :].set_index("disease")
    apa_in_atss = by_disease.at[disease, "APA/ATSS"]
    apa_in_tss = by_disease.at[disease, "APA/TSS"]

    # Frame used for plotting: one row per (5' type, 3' type) combination
    plot_df = pandas.DataFrame(index=range(0, 4),
                               columns=["gene_biotype", "disease", "5' type", "3' type", "Gene proportion"])
    plot_df["gene_biotype"] = gene_biotype
    plot_df["disease"] = disease

    # The "Single PAS" share is the complement of the APA share within the same 5' type
    row_values = (
        ("ATSS", "APA", apa_in_atss),
        ("ATSS", "Single PAS", 1 - apa_in_atss),
        ("Single TSS", "APA", apa_in_tss),
        ("Single TSS", "Single PAS", 1 - apa_in_tss),
    )
    for row_number, (five_prime, three_prime, proportion) in enumerate(row_values):
        plot_df.at[row_number, "5' type"] = five_prime
        plot_df.at[row_number, "3' type"] = three_prime
        plot_df.at[row_number, "Gene proportion"] = proportion

    # One bar trace per 3' type, stacked on top of each other
    fig = go.Figure()
    by_three_prime = plot_df.groupby("3' type")
    for three_prime_name in by_three_prime.groups:
        trace_rows = by_three_prime.get_group(three_prime_name)
        fig.add_trace(go.Bar(x=trace_rows["5' type"],
                             y=trace_rows["Gene proportion"],
                             name=three_prime_name,
                             width=0.5))
    fig.update_layout(title_text=plot_title,
                      yaxis_title="Gene proportion",
                      barmode="stack")

    return {"check_df": plot_df, "{}_{}".format(disease, gene_biotype): fig}


def get_check_df(df, gene_biotype_list, disease_list):
    """
    Build the long-format table that make_bar_plot draws.

    input:
        df, pandas.DataFrame, must provide the columns "gene_biotype", "disease",
            "APA/ATSS" and "APA/TSS"
        gene_biotype_list, list, [<gene_biotype1>, <gene_biotype2>, ...]
        disease_list, list, [<disease1>, <disease2>, ...]
    change:
        For every combination of 5' type ("ATSS", "Single TSS"), 3' type ("APA",
        "Single PAS"), gene_biotype and disease, emit one row holding
        ["gene_biotype", "disease", "5' type", "color", "Gene proportion"].
        The APA proportion is read from "APA/ATSS" (5' type "ATSS") or "APA/TSS"
        (5' type "Single TSS"); the "Single PAS" proportion is 1 minus that value.
        A (gene_biotype, disease) pair with no row in df is skipped.
    output:
        check_df, pandas.DataFrame, ready for making plot; rows are ordered by
        5' type, 3' type, gene_biotype, disease and indexed 0..n-1;
        "Gene proportion" is float
    raise:
        ValueError, when more than one row of df matches a (gene_biotype, disease) pair
    """
    columns = ["gene_biotype", "disease", "5' type", "color", "Gene proportion"]
    # Which column of df carries the APA proportion for each 5' type
    apa_column = {"ATSS": "APA/ATSS", "Single TSS": "APA/TSS"}

    # Collect plain records and build the frame once instead of concatenating in the loop
    records = []
    for type5 in ["ATSS", "Single TSS"]:
        for c in ["APA", "Single PAS"]:
            for biotype in gene_biotype_list:
                for disease in disease_list:
                    selected = (df["gene_biotype"] == biotype) & (df["disease"] == disease)
                    matched = df.loc[selected, apa_column[type5]]
                    if matched.empty:
                        # nothing recorded for this pair, so there is no bar to draw
                        continue
                    if len(matched) > 1:
                        raise ValueError(
                            "expected one row for gene_biotype={!r}, disease={!r}, found {}".format(
                                biotype, disease, len(matched)))
                    # take the scalar, not the one-element Series
                    apa_proportion = float(matched.iloc[0])
                    value = apa_proportion if c == "APA" else 1 - apa_proportion
                    records.append([biotype, disease, type5, c, value])

    check_df = pandas.DataFrame(records, columns=columns)
    check_df["Gene proportion"] = check_df["Gene proportion"].astype(float)

    return check_df


def make_bar_plot(check_df, width=800, height=1000):
    """
    Draw the faceted stacked bar plot.

    input:
        check_df, pandas.DataFrame, must provide the columns "5' type", "Gene proportion",
                  "color", "gene_biotype" and "disease"
        width, int, the width of plot
        height, int, the height of plot
    change:
        stacked bars of "Gene proportion" over "5' type", colored by "color",
        one facet row per gene_biotype and one facet column per disease
    output:
        fig
    """
    def _keep_text_after_equals(annotation):
        # Keep only the part of the annotation text after the last "="
        # (facet labels are expected to look like "<name>=<value>").
        return annotation.update(text=annotation.text.split("=")[-1])

    figure = px.bar(
        check_df,
        x="5' type",
        y="Gene proportion",
        color="color",
        barmode="stack",
        facet_row="gene_biotype",
        facet_col="disease",
    )
    figure.update_layout(height=height, width=width)
    figure.for_each_annotation(_keep_text_after_equals)

    return figure

In [480]:
# Scratch 2x3 DataFrame of string values left over from debugging;
# nothing else in the visible code references `debug`.
debug = pandas.DataFrame([['1','2','3'],['4','5','6']], index=['a','b'],columns=['a1','b2','c3'])

In [472]:
if __name__ == "__main__":
    # load file; the first column of the table becomes the index
    df = pandas.read_csv(input_filename, sep='\t', index_col=0)

    # only retain gene info: rows whose index label ends with "_gene".
    # The explicit copy keeps the derived columns below on an independent frame.
    gene_suffix = "_gene"
    gene_index = df.index.astype(str).str.endswith(gene_suffix)
    df = df.loc[gene_index, :].copy()

    # format df: the disease name is the index label without the "_gene" suffix
    df["disease"] = [str(label)[:-len(gene_suffix)] for label in df.index]

    # ready for analysis: share of APA genes within each 5' type
    df["APA/ATSS"] = df["ATSS_APA"] / (df["ATSS_APA"] + df["ATSS_PAS"])
    df["APA/TSS"] = df["TSS_APA"] / (df["TSS_APA"] + df["TSS_PAS"])

    # get disease info; sorted so the row order of check_df and the facet order
    # of the plot are the same on every run (a bare set has no stable order)
    disease_list = sorted(df["disease"].unique())

    # get data frame
    check_df = get_check_df(df=df, gene_biotype_list=gene_biotype_list, disease_list=disease_list)

    # make plot
    fig = make_bar_plot(check_df=check_df, width=800, height=600)

    # save data
    check_df.to_csv(output_check_df_filename, sep='\t', index=False)
    fig.write_image(output_plot_filename)

In [326]:
# Collect the figures and the check_df data.
# NOTE(review): legacy code disabled by wrapping it in a bare string. The keyword arguments
# it passes to make_bar_plot (sample_df, gene_biotype, disease) match old_make_bar_plot,
# not the current make_bar_plot signature.
"""plot_info = {}
check_df = pandas.DataFrame()
for gene_biotype in gene_biotype_list:
    for disease in disease_list:
        result_info = make_bar_plot(sample_df=df, gene_biotype=gene_biotype, disease=disease)
        plot_info["{}_{}".format(disease, gene_biotype)] = result_info.get("{}_{}".format(disease, gene_biotype))
        check_df = pandas.concat([check_df, result_info.get("check_df")], axis=0, ignore_index=True)"""