In [17]:
import argparse
import os
import re
import time

import pandas
import plotly.express as px
import plotly.graph_objects as go

In [18]:
# Version tag of this script; it is printed in the run header further down.
__version__ = "V3.0(Editor) 2023-07-15"

In [19]:
# Preset parameters for interactive / debug runs.
# The argparse cell that follows assigns the same names again when it is run.
# NOTE(review): `file_path` is a machine-specific absolute path.
file_path = "F:/OneDrive/Master/Project/trans/data/"
absolute_path = False

input_filename = "0002_TSS_TES.tsv"
output_check_df_filename = "0003_check_df.tsv"
output_plot_path = "0003_plot/"

gene_biotype_list = [
    "protein_coding",
    "non_protein_coding",
    "un_classified",
]

In [None]:
# Preset parameters (script mode): every option is optional and has a default.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--gene_biotype_list",
    nargs='*',
    default=["protein_coding", "non_protein_coding", "un_classified"],
    help='=["protein_coding", "non_protein_coding", "un_classified"]',
)
parser.add_argument(
    "--absolute_path",
    action="store_true",
    help="use absolute file path",
)
parser.add_argument(
    "--file_path",
    type=str,
    default="./",
    help="=./,\t the path of data directory",
)
parser.add_argument(
    "--input_filename",
    type=str,
    default="0002_TSS_TES.tsv",
    help="=\"0002_TSS_TES.tsv\",\t the output file name of 0002.py",
)
parser.add_argument(
    "--output_check_df_filename",
    type=str,
    default="0003_check_df.tsv",
    help="=0003_check_df.tsv,\t the output df for checking",
)
parser.add_argument(
    "--output_plot_path",
    type=str,
    default="0003_plot/",
    help="0003_plot/,\t the output plot",
)

# Parse the process command line and expose each option as a module-level name.
args = parser.parse_args()
gene_biotype_list = args.gene_biotype_list
absolute_path = args.absolute_path
file_path = args.file_path
input_filename = args.input_filename
output_check_df_filename = args.output_check_df_filename
output_plot_path = args.output_plot_path

In [None]:
# Print the run header: script name, version, timestamp and parameters.
# `__file__` is not defined when this code runs inside a notebook kernel, so
# fall back to a placeholder instead of failing with NameError.
_script_name = globals().get("__file__", "<interactive>")
print('\n')
print("[Script]{}".format(_script_name))
print("[Version]{}".format(__version__))
# time.strftime() without a time tuple formats the current local time.
print("[Date]{}".format(time.strftime("%Y-%m-%d %H:%M:%S")))
print("[Paraments]gene_biotype_list: {}".format(gene_biotype_list))
print("[Paraments]input_filename: {}".format(input_filename))
print("[Paraments]output_check_df_filename: {}".format(output_check_df_filename))
print("[Paraments]output_plot_path: {}".format(output_plot_path))
print('\n')

In [20]:
# Unless the caller declared the file names to be absolute, resolve them
# against the data directory. os.path.join inserts the separator when
# `file_path` has no trailing slash; plain string concatenation would glue the
# directory name and the file name together in that case.
if not absolute_path:
    input_filename = os.path.join(file_path, input_filename)
    output_check_df_filename = os.path.join(file_path, output_check_df_filename)
    output_plot_path = os.path.join(file_path, output_plot_path)

In [21]:
def old_make_bar_plot(sample_df, gene_biotype, cell_line, tab_level=0):
    """
    Legacy stacked bar plot for one (cell_line, gene_biotype) pair.

    NOTE(review): another function with the same name is defined later in this
    file; once that definition runs it replaces this one.

    input:
        sample_df, pandas.DataFrame, must contain the columns "gene_biotype",
            "cell_line", "APA/ATSS" and "APA/TSS"
        gene_biotype, str, value of the "gene_biotype" column to select
        cell_line, str, value of the "cell_line" column to select
        tab_level, int, unused
    output:
        dict, {"check_df": df,
               "{}_{}".format(cell_line, gene_biotype): fig}
        where df holds the four plotted values in the columns
        ["gene_biotype", "cell_line", "5' type", "3' type", "Gene proportion"]
    """
    title = "{} {}".format(cell_line, gene_biotype)

    # Extract the row for the requested gene_biotype and cell_line.
    selected = sample_df.loc[sample_df["gene_biotype"] == gene_biotype, :]
    selected = selected.loc[selected["cell_line"] == cell_line, :]
    selected = selected.set_index("cell_line")
    apa_in_atss = selected.at[cell_line, "APA/ATSS"]
    apa_in_tss = selected.at[cell_line, "APA/TSS"]

    # Build the plotting frame. The values are written to the same column
    # names that the grouping and the traces below read.
    df = pandas.DataFrame({
        "gene_biotype": gene_biotype,
        "cell_line": cell_line,
        "5' type": ["ATSS", "ATSS", "Single TSS", "Single TSS"],
        "3' type": ["APA", "Single PAS", "APA", "Single PAS"],
        "Gene proportion": [apa_in_atss, 1 - apa_in_atss,
                            apa_in_tss, 1 - apa_in_tss],
    })

    # One trace per 3' type, stacked over the two 5' types.
    fig = go.Figure()
    for type3, group in df.groupby("3' type"):
        fig.add_trace(go.Bar(x=group["5' type"], y=group["Gene proportion"],
                             name=type3, width=0.5))
    fig.update_layout(title_text=title,
                      yaxis_title="Gene proportion",
                      barmode="stack")

    return {"check_df": df, "{}_{}".format(cell_line, gene_biotype): fig}


def get_check_df(df, gene_biotype_list, cell_line_list, tab_level=0):
    """
    Build the long-format table that the bar plots are drawn from.

    input:
        df, pandas.DataFrame, the output file of 0002.py; must contain the
            columns "gene_biotype", "cell_line", "APA/ATSS" and "APA/TSS"
        gene_biotype_list, list, [<gene_biotype1>, <gene_biotype2>, ...]
        cell_line_list, list, [<cell_line1>, <cell_line2>, ...]
        tab_level, int, unused
    change:
        for every combination of 5' type ("ATSS", "Single TSS"), 3' type
        ("APA", "Single PAS"), gene_biotype and cell_line, emit one row
        ["gene_biotype", "cell_line", "type5", "color", "Gene_proportion"].
        The "APA" proportion is read from "APA/ATSS" (type5 == "ATSS") or
        "APA/TSS" (type5 == "Single TSS"); "Single PAS" is 1 minus that value.
    output:
        check_df, pandas.DataFrame, ready for making plot
    raise:
        ValueError, when a (gene_biotype, cell_line) pair does not match
            exactly one row of df
    """
    columns = ["gene_biotype", "cell_line", "type5", "color", "Gene_proportion"]
    # Source column holding the APA fraction for each 5' type.
    ratio_column = {"ATSS": "APA/ATSS", "Single TSS": "APA/TSS"}

    # Collect plain records and build the frame once; concatenating inside the
    # loop is quadratic and stores whole Series objects in the cells.
    records = []
    for type5 in ["ATSS", "Single TSS"]:
        for c in ["APA", "Single PAS"]:
            for biotype in gene_biotype_list:
                for cell_line in cell_line_list:
                    mask = (df["gene_biotype"] == biotype) & (df["cell_line"] == cell_line)
                    matched = df.loc[mask, ratio_column[type5]]
                    if len(matched) != 1:
                        raise ValueError(
                            "expected exactly one row for gene_biotype={!r}, "
                            "cell_line={!r}, found {}".format(biotype, cell_line, len(matched)))
                    apa_fraction = float(matched.iloc[0])
                    value = apa_fraction if c == "APA" else 1 - apa_fraction
                    records.append([biotype, cell_line, type5, c, value])

    check_df = pandas.DataFrame(records, columns=columns)
    # Keep the column numeric even when no record was produced.
    check_df["Gene_proportion"] = check_df["Gene_proportion"].astype(float)

    return check_df


def old_make_bar_plot(check_df, width=800, height=1000):
    """
    Legacy placeholder for a faceted bar plot.

    NOTE(review): this definition reuses the name of an earlier function in
    this file and replaces it. The plotting call is disabled (see the comment
    in the body), so `check_df` is accepted but not drawn and the returned
    figure carries no traces.

    input:
        check_df, pandas.DataFrame, currently unused
        width, int, the width of plot
        height, int, the height of plot
    change:
        create an empty figure with the requested size
    output:
        fig
    """
    # Disabled plotly-express draft, kept for reference. It refers to the
    # columns "5' type" and "Gene proportion":
    #   fig = px.bar(check_df,
    #                x="5' type",
    #                y="Gene proportion",
    #                color="color",
    #                barmode="stack",
    #                facet_row="gene_biotype",
    #                facet_col="cell_line")
    fig = go.Figure()

    fig.update_layout(height=height,
                      width=width)

    # Keep only the text after the last "=" in each annotation.
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

    return fig


def make_bar_plot(check_df, gene_biotype, cell_line, tab_level=0):
    """
    Draw the stacked APA / Single PAS bar plot for one gene_biotype and
    cell_line.

    input:
        check_df, pandas.DataFrame, must contain the columns "gene_biotype",
            "cell_line", "type5", "color" and "Gene_proportion"
        gene_biotype, str, value of the "gene_biotype" column to plot
        cell_line, str, value of the "cell_line" column to plot
        tab_level, int, unused
    change:
        make bar plot
    output:
        fig
    raise:
        ValueError, when a (type5, color) combination does not match exactly
            one row for the requested gene_biotype and cell_line
    """
    pair_mask = (check_df["gene_biotype"] == gene_biotype) & (check_df["cell_line"] == cell_line)
    pair_df = check_df.loc[pair_mask, :]

    def _proportion(type5, color):
        # Return the single Gene_proportion value of one bar segment as a float.
        # Taking the element explicitly avoids float(Series), which pandas
        # deprecates.
        selected = pair_df.loc[(pair_df["type5"] == type5) & (pair_df["color"] == color),
                               "Gene_proportion"]
        if len(selected) != 1:
            raise ValueError(
                "expected exactly one row for gene_biotype={!r}, cell_line={!r}, "
                "type5={!r}, color={!r}, found {}".format(
                    gene_biotype, cell_line, type5, color, len(selected)))
        return float(selected.iloc[0])

    TSS_PAS = _proportion("Single TSS", "Single PAS")
    TSS_APA = _proportion("Single TSS", "APA")
    ATSS_PAS = _proportion("ATSS", "Single PAS")
    ATSS_APA = _proportion("ATSS", "APA")

    fig = go.Figure()

    # The APA trace is added first and the Single PAS trace second.
    fig.add_trace(go.Bar(x=["ATSS", "Single TSS"],
                        y=[ATSS_APA, TSS_APA],
                        name="APA",
                        width=0.5,
                        marker={"color": "purple",
                                "line": {"color": "black",
                                         "width": 0.5},
                                }
                        ))
    fig.add_trace(go.Bar(x=["ATSS", "Single TSS"],
                        y=[ATSS_PAS, TSS_PAS],
                        name="Single<br>PAS",
                        width=0.5,
                        marker={"color": "grey",
                                "line": {"color": "black",
                                         "width": 0.5},
                                }
                        ))

    layout = {"width": 400, "height": 300,
              "margin": {'l':0, 'r':0, 't':0, 'b':0},
              "barmode": "stack",
              "font": {"family": "Arial",
                       "color": "black",
                       "size": 10},
              "xaxis": {"linecolor": "black",
                        "range": [-0.5,1.5],
                        },
              # Axis title text and font are given through `title`, which
              # replaces the deprecated `titlefont` attribute.
              "yaxis": {"title": {"text": "Gene proportion",
                                  "font": {"size": 10}},
                        "dtick": 0.5,
                        "showgrid": False,
                        "showline": True,
                        "linecolor": "black",
                        "tickfont": {"size": 10}
                        },
              "plot_bgcolor": "white",
              "bargap": 0, "bargroupgap": 0,
              }
    fig.update_layout(layout)

    return fig

In [105]:
if __name__ == "__main__":
    # load file
    df = pandas.read_csv(input_filename, sep='\t', index_col=0)

    # Only retain gene info: rows whose index label ends with "_gene".
    # Copy the selection so the column assignments below act on an
    # independent frame rather than on a slice of the loaded one.
    gene_suffix = "_gene"
    is_gene_row = [label.endswith(gene_suffix) for label in df.index]
    df = df.loc[is_gene_row, :].copy()

    # format df: the cell line is the index label without the "_gene" suffix
    df["cell_line"] = [label[:-len(gene_suffix)] for label in df.index]

    # ready for analysis
    df["APA/ATSS"] = df["ATSS_APA"] / (df["ATSS_APA"] + df["ATSS_PAS"])
    df["APA/TSS"] = df["TSS_APA"] / (df["TSS_APA"] + df["TSS_PAS"])

    # get cell_line info (sorted so the processing order is reproducible)
    cell_line_list = sorted(set(df["cell_line"]))

    # get data frame
    # NOTE(review): check_df is not written to output_check_df_filename
    # anywhere in this block.
    check_df = get_check_df(df=df, gene_biotype_list=gene_biotype_list, cell_line_list=cell_line_list)

    # Image file names are built by prefixing output_plot_path, so make sure
    # the directory part of that prefix exists before writing.
    plot_dir = os.path.dirname(output_plot_path)
    if plot_dir:
        os.makedirs(plot_dir, exist_ok=True)

    # make plot: one image per (gene_biotype, cell_line) pair
    for gene_biotype in sorted(set(check_df["gene_biotype"])):
        for cell_line in sorted(set(check_df["cell_line"])):
            fig = make_bar_plot(check_df=check_df, gene_biotype=gene_biotype, cell_line=cell_line, tab_level=0)
            fig.write_image("{}0003_{}_{}.svg".format(output_plot_path, cell_line, gene_biotype))

    # update
    # New feature: for each cell line, sum the counts over all of its rows
    # (labelled gene_biotype "all") and plot the pooled proportions.
    # TODO: integrate this into the functions above.
    count_columns = ["TSS_PAS", "TSS_APA", "ATSS_PAS", "ATSS_APA"]
    for cell_line in cell_line_list:
        totals = df.loc[df["cell_line"] == cell_line, count_columns].sum()
        pooled_row = totals.to_dict()
        pooled_row["gene_biotype"] = "all"
        pooled_row["cell_line"] = cell_line
        pooled_df = pandas.DataFrame([pooled_row])
        pooled_df["APA/ATSS"] = pooled_df["ATSS_APA"] / (pooled_df["ATSS_APA"] + pooled_df["ATSS_PAS"])
        pooled_df["APA/TSS"] = pooled_df["TSS_APA"] / (pooled_df["TSS_APA"] + pooled_df["TSS_PAS"])
        pooled_check_df = get_check_df(pooled_df, ["all"], [cell_line])
        print(pooled_check_df)
        fig = make_bar_plot(check_df=pooled_check_df, gene_biotype="all", cell_line=cell_line)
        fig.write_image("{}0003_{}_{}.svg".format(output_plot_path, cell_line, "all"))

In [326]:
# Collect the figures and the check_df data.
# NOTE(review): the statement below is only a string literal, so none of this
# code runs. It calls make_bar_plot with a `sample_df` keyword and reads a
# dict result, which does not match the make_bar_plot defined in this file
# (that one takes `check_df` and returns a figure).
"""plot_info = {}
check_df = pandas.DataFrame()
for gene_biotype in gene_biotype_list:
    for cell_line in cell_line_list:
        result_info = make_bar_plot(sample_df=df, gene_biotype=gene_biotype, cell_line=cell_line)
        plot_info["{}_{}".format(cell_line, gene_biotype)] = result_info.get("{}_{}".format(cell_line, gene_biotype))
        check_df = pandas.concat([check_df, result_info.get("check_df")], axis=0, ignore_index=True)"""