In [1]:
import argparse
import os
import re
import time

import pandas
import plotly.express as px
import plotly.graph_objects as go

import def_function

In [2]:
# Script version tag; it is printed in the "[Version]" line of the run header.
__version__ = "V3.3(Editor) 2023-08-07"

In [3]:
# Preset parameters (debug): hard-coded values for interactive runs.
# They use the same variable names that the argparse cell fills in from the
# command line.
gene_biotype_list = ["protein_coding", "non_protein_coding", "un_classified"]
# False -> file_path is prepended to input_filename and output_plot_path below.
absolute_path = False
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
file_path = "F:/OneDrive/Master/Project/trans/data/"
input_filename = "0002_TSS_TES.tsv"
output_plot_path = "0003_plot/"

In [None]:
# 前置参数
parser = argparse.ArgumentParser()
parser.add_argument("--gene_biotype_list", dest="gene_biotype_list", required=False, nargs='*', default=["protein_coding", "non_protein_coding", "un_classified"], help='=["protein_coding", "non_protein_coding", "un_classified"]')
parser.add_argument("--absolute_path", dest="absolute_path", required=False, action="store_true", help="use absolute file path")
parser.add_argument("--file_path", dest="file_path", required=False, type=str, default="./", help="=./,\t the path of data directory")
parser.add_argument("--input_filename", dest="input_filename", required=False, type=str, default="0002_TSS_TES.tsv", help="=\"0002_TSS_TES.tsv\",\t the output file name of 0002.py")
parser.add_argument("--output_plot_path", dest="output_plot_path", required=False, type=str, default="0003_plot/", help="0003_plot/,\t the output plot")

args = parser.parse_args()
gene_biotype_list = args.gene_biotype_list
absolute_path = args.absolute_path
file_path = args.file_path
input_filename = args.input_filename
output_plot_path = args.output_plot_path

In [None]:
# Print the run header: script, version, timestamp and the main parameters.
print('\n')
_header_fields = (
    ("[Script]{}", __file__),
    ("[Version]{}", __version__),
    ("[Date]{}", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))),
    ("[Paraments]gene_biotype_list: {}", gene_biotype_list),
    ("[Paraments]input_filename: {}", input_filename),
    ("[Paraments]output_plot_path: {}", output_plot_path),
)
for _template, _value in _header_fields:
    print(_template.format(_value))
print('\n')

In [4]:
# Complete the paths: unless absolute paths were requested, the input file and
# the output plot prefix are both located under file_path.
# os.path.join adds the path separator only when file_path does not already
# end with one, so "data" and "data/" both resolve to "data/<name>".
if absolute_path is False:
    input_filename = os.path.join(file_path, input_filename)
    output_plot_path = os.path.join(file_path, output_plot_path)

In [5]:
@def_function.log
def integrateGeneBiotype(df, tab_level=0):
    """
    Append one summary row per cell line to ``df``.

    For every distinct ``cellLine`` value, the four count columns
    ("TSS-PAS", "TSS-APA", "ATSS-PAS", "ATSS-APA") are summed over all rows
    of that cell line. The sums are stored in a new row whose
    ``geneOrTranscript`` is "gene" and whose ``geneBiotype`` is "all".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column "cellLine" and the four count columns.
    tab_level : int
        Not used in this body; presumably consumed by the
        ``def_function.log`` decorator - TODO confirm.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the summary rows (ordered by cell
        line), with a renumbered index. When ``df`` contains no cell line
        at all, ``df`` itself is returned unchanged.
    """
    count_columns = ["TSS-PAS", "TSS-APA", "ATSS-PAS", "ATSS-APA"]

    # One aggregation for all cell lines instead of growing the frame with a
    # concat per group, which copies the whole frame on every iteration.
    summary = df.groupby("cellLine")[count_columns].sum().reset_index()
    if summary.empty:
        return df

    summary["geneOrTranscript"] = "gene"
    summary["geneBiotype"] = "all"
    summary = summary[["cellLine", "geneOrTranscript", "geneBiotype"] + count_columns]

    # Append the summary rows after the existing data.
    return pandas.concat([df, summary], axis=0, ignore_index=True)


def make_bar_plot(TSS_APA, ATSS_APA):
    """
    Build a stacked two-bar figure of APA versus single-PAS proportions.

    The figure has one bar labelled "ATSS" and one labelled "Single TSS".
    Each bar stacks the given APA proportion (purple) with its complement
    ``1 - proportion`` (grey, labelled "Single PAS").

    Parameters
    ----------
    TSS_APA : value accepted by ``float()``
        APA proportion drawn in the "Single TSS" bar.
    ATSS_APA : value accepted by ``float()``
        APA proportion drawn in the "ATSS" bar.
        Both are presumably fractions in [0, 1] - the complement is computed
        as ``1 - value`` without any range check.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure; nothing is written to disk here.
    """
    TSS_APA = float(TSS_APA)
    ATSS_APA = float(ATSS_APA)
    TSS_PAS = 1 - TSS_APA
    ATSS_PAS = 1 - ATSS_APA

    categories = ["ATSS", "Single TSS"]
    bar_outline = {"color": "black", "width": 0.5}
    axis_font = {"size": 12, "family": "Arial", "color": "black"}

    # Prepare the base canvas
    fig = go.Figure()

    # Prepare the traces: the APA share first, its complement stacked on top
    fig = fig.add_trace(go.Bar(x=categories,
                               y=[ATSS_APA, TSS_APA],
                               name="APA",
                               width=0.5,
                               marker={"color": "purple",
                                       "line": dict(bar_outline)}
                               ))
    fig = fig.add_trace(go.Bar(x=categories,
                               y=[ATSS_PAS, TSS_PAS],
                               name="Single<br>PAS",
                               width=0.5,
                               marker={"color": "grey",
                                       "line": dict(bar_outline)}
                               ))

    # Configure the canvas
    layout = {"width": 400, "height": 300,
              "margin": {'l': 0, 'r': 0, 't': 0, 'b': 0},
              "barmode": "stack",
              "font": {"family": "Arial",
                       "color": "black",
                       "size": 12},
              "xaxis": {"linecolor": "black",
                        "range": [-0.5, 1.5],
                        "tickfont": dict(axis_font)},
              # The axis title is given as title.text / title.font; the older
              # "titlefont" shortcut is deprecated and not accepted by newer
              # plotly releases.
              "yaxis": {"title": {"text": "Gene proportion",
                                  "font": dict(axis_font)},
                        "dtick": 0.5,
                        "showgrid": False,
                        "showline": True,
                        "linecolor": "black",
                        "tickfont": dict(axis_font)},
              "plot_bgcolor": "white",
              "bargap": 0, "bargroupgap": 0,
              }
    fig = fig.update_layout(layout)

    return fig



In [6]:
if __name__ == "__main__":
    # load file
    df = pandas.read_csv(input_filename, sep='\t')

    # only retain gene info
    df = df.loc[df["geneOrTranscript"]=="gene", :]

    # integrate gene biotype (adds one geneBiotype == "all" row per cell line)
    df = integrateGeneBiotype(df=df)

    # ready for analysis: share of APA genes among ATSS genes and among
    # single-TSS genes, respectively
    df["APA/ATSS"] = df["ATSS-APA"] / (df["ATSS-APA"] + df["ATSS-PAS"])
    df["APA/TSS"] = df["TSS-APA"] / (df["TSS-APA"] + df["TSS-PAS"])

    # get cell_line info (not used further in this section)
    cellLineList = list(set(df["cellLine"]))

    # make sure the directory part of the output prefix exists, otherwise
    # writing the first image fails
    output_dir = os.path.dirname(output_plot_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # make plot: one figure per existing (cellLine, geneBiotype) row.
    # Iterating over the rows passes plain scalars to make_bar_plot and skips
    # cellLine/geneBiotype combinations that do not occur in the data, which
    # previously produced an empty selection that could not be converted to float.
    for cellLine, geneBiotype, apa_tss, apa_atss in zip(df["cellLine"], df["geneBiotype"],
                                                        df["APA/TSS"], df["APA/ATSS"]):
        fig = make_bar_plot(TSS_APA=apa_tss, ATSS_APA=apa_atss)
        fig.write_image("{}0003_{}_{}.svg".format(output_plot_path, cellLine, geneBiotype))

    print("[{}]All blocks finished.".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))))



[Function]integrateGeneBiotype start.
	[Time]2023-08-17 10:39:49
	[Paraments]df: <...>
[2023-08-17 10:39:49]integrateGeneBiotype finished.
[2023-08-17 10:39:51]All blocks finished.


---

调试

调试

---