In [1]:
import pandas
import time
import argparse
import plotly.graph_objects as go
import numpy
import math
import def_function
from def_class import *
import pickle

In [2]:
# Version string of this script; it is printed in the "[Version]" line of the run log.
__version__ = "V2.0(Editor) 2023-08-18"

In [55]:
# Preset parameters - debug
# Hard-coded values for interactive/debug runs. The argparse section that follows
# in this file assigns the same names again from the command line.
infinity_mode = "del"  # Strategy for transcripts expressed in only one cell line: del -- do not show them; endPoint -- set their value to max+1 / min-1
cutoff_value = 1.5  # significance threshold applied to the log2FC values
cell_line_1 = "A673"
cell_line_2 = "Caco-2"
gene_name_list = []  # empty: no selection by gene name
gene_id_list = []  # empty: no selection by gene id
absolute_path = False  # False: file_path is prepended to the three filenames below
file_path = "F:/OneDrive/Master/Project/trans/data/"  # NOTE(review): machine-specific absolute path
input_df_filename = "0004_relative_expression.tsv"
input_pkl_filename = "0002_total_info.pkl"
output_plot_filename = "0006_transcript_expression.svg"

In [None]:
# Preset parameters - command line
# Every option below is copied into a module-level variable of the same name so
# that the rest of the script does not depend on the argparse namespace.
parser = argparse.ArgumentParser()
# `choices` rejects an unsupported mode at parse time instead of deep inside make_plot.
parser.add_argument("--infinity_mode", dest="infinity_mode", required=False, type=str, default="del", choices=["del", "endPoint"], help="=del, ['del', 'endPoint'], the mode to work with the transcript which only expressing in one cellLine")
parser.add_argument("--cutoff_value", dest="cutoff_value", required=False, type=float, default=1.5, help="=1.5, the cutoff value of the significant log2FC for transcript")
# Required options need no default: argparse exits before a default could be used.
parser.add_argument("--cell_line_1", dest="cell_line_1", required=True, type=str, help="the cell line 1")
parser.add_argument("--cell_line_2", dest="cell_line_2", required=True, type=str, help="the cell line 2")
parser.add_argument("--gene_name_list", dest="gene_name_list", required=False, nargs="*", type=str, default=[], help="the gene will be showed")
parser.add_argument("--gene_id_list", dest="gene_id_list", required=False, nargs="*", type=str, default=[], help="the gene will be showed")
parser.add_argument("--absolute_path", dest="absolute_path", required=False, action="store_true", help="use the absolute path")
parser.add_argument("--file_path", dest="file_path", required=False, type=str, default="", help="the path of data directory")
# The two option strings below used to end with a stray space ("--input_df_filename "),
# so they only matched through argparse's prefix abbreviation.
parser.add_argument("--input_df_filename", dest="input_df_filename", required=False, type=str, default="0004_relative_expression.tsv", help="=0004_relative_expression.tsv, the output file of 0005.py")
parser.add_argument("--input_pkl_filename", dest="input_pkl_filename", required=False, type=str, default="0002_total_info.pkl", help="=0002_total_info.pkl, the pickle file of 0002.py")
parser.add_argument("--output_plot_filename", dest="output_plot_filename", required=False, type=str, default="0006_transcript_expression.svg", help="the output filename of this script")

args = parser.parse_args()
infinity_mode = args.infinity_mode
cutoff_value = args.cutoff_value
cell_line_1 = args.cell_line_1
cell_line_2 = args.cell_line_2
gene_name_list = args.gene_name_list
gene_id_list = args.gene_id_list
absolute_path = args.absolute_path
file_path = args.file_path
input_df_filename = args.input_df_filename
input_pkl_filename = args.input_pkl_filename
output_plot_filename = args.output_plot_filename

In [56]:
# Complete the paths
# Unless absolute_path is set, the three filenames are taken relative to the
# data directory file_path.
if not absolute_path:
    # Insert a separator only when file_path is non-empty and does not already
    # end with one; otherwise "dir" + "file.tsv" would become "dirfile.tsv".
    path_prefix = file_path
    if path_prefix and not path_prefix.endswith(("/", "\\")):
        path_prefix = path_prefix + "/"
    input_df_filename = "{}{}".format(path_prefix, input_df_filename)
    input_pkl_filename = "{}{}".format(path_prefix, input_pkl_filename)
    output_plot_filename = "{}{}".format(path_prefix, output_plot_filename)

In [None]:
# Print the run parameters
# Writes a header with date, script path, versions and the effective parameters
# to stdout. __file__ is only defined when this code runs as a script.
print("\n")
print("[Date]{}".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))))
print("[Script]{}".format(__file__))
print("[Version]{}".format(__version__))
print("[Version]def_function {}".format(def_function.__version__))
print("[Version]def_class: {}".format(class_version))
print("[Parament]infinity_mode: {}".format(infinity_mode))
print("[Parament]cutoff_value: {}".format(cutoff_value))
print("[Parament]cell_line_1: {}".format(cell_line_1))
print("[Parament]cell_line_2: {}".format(cell_line_2))
print("[Parament]gene_name_list: {}".format(gene_name_list))
print("[Parament]gene_id_list: {}".format(gene_id_list))
print("[Parament]input_df_filename: {}".format(input_df_filename))
# The pickle input was missing from the log although it is a path parameter too.
print("[Parament]input_pkl_filename: {}".format(input_pkl_filename))
print("[Parament]output_plot_filename: {}".format(output_plot_filename))
print("\n")

In [48]:
# Plotting function
@def_function.log
def make_plot(df, cell_line_1, cell_line_2, infinity_mode, cutoff_value, gene_name_list=None, gene_id_list=None, tab_level=0):
    """
    Build a scatter plot of per-transcript log2FC values, one row per gene name.

    input:
        df, pandas.DataFrame, must contain the columns
            "log2FC_{cell_line_1}@{cell_line_2}", "gene_id" and "gene_name";
            "transcript_id" is read only when an invalid log2FC has to be reported.
            The caller's frame is not modified (the function works on a copy).
        cell_line_1, str, first cell line (numerator in the axis title)
        cell_line_2, str, second cell line (denominator in the axis title)
        infinity_mode, str, how to treat transcripts whose log2FC is +/-inf:
            "del" or "endPoint"
        cutoff_value, float, significance threshold for log2FC
        gene_name_list, list, names of the genes to show; lower priority than gene_id_list
        gene_id_list, list, ids of the genes to show; higher priority than gene_name_list
        tab_level, not used in this body; presumably consumed by the
            def_function.log decorator -- TODO confirm
    change:
        1. Determine the x-axis range from the finite log2FC values.
        2. Assign a color to every transcript from its log2FC and cutoff_value.
        3. Handle transcripts with infinite log2FC:
            3.1 infinity_mode == "endPoint": +inf becomes x-axis max + 1,
                -inf becomes x-axis min - 1
            3.2 infinity_mode == "del": such transcripts are not shown
        4. Select the genes to show (by id, by name, or the genes with the most
           transcripts when neither list is given) and draw the figure.
    output:
        fig, plotly.graph_objects.Figure
    raise:
        ValueError, if infinity_mode is unsupported, if no finite log2FC exists,
            or if a log2FC value is NaN
    """
    # Validate the mode up front; previously an unsupported mode was only
    # detected when the data happened to contain an infinite value.
    if infinity_mode not in ("del", "endPoint"):
        raise ValueError("[Error]Unsupported infinity_mode: {}".format(infinity_mode))
    gene_name_list = [] if gene_name_list is None else gene_name_list
    gene_id_list = [] if gene_id_list is None else gene_id_list
    # Number of genes shown when the caller does not select any gene.
    default_gene_count = 33

    # Work on a copy so that adding/overwriting columns never touches the
    # caller's frame (and never triggers SettingWithCopyWarning on a slice).
    df = df.copy()
    log2fc_column = "log2FC_{}@{}".format(cell_line_1, cell_line_2)
    log2fc = df[log2fc_column]


    # Determine the displayed x-axis range from finite values only.
    # Series.min()/max() are used instead of the built-ins, whose result
    # depends on element order as soon as a NaN is present.
    finite_log2fc = log2fc[numpy.isfinite(log2fc)]
    if finite_log2fc.empty:
        raise ValueError("[Error]no finite log2FC value in column {}".format(log2fc_column))
    xaxis_min = math.floor(finite_log2fc.min())
    xaxis_max = math.ceil(finite_log2fc.max())


    # Classify every log2FC by cutoff_value and pick the matching color.
    color_list = []
    for position, value in enumerate(log2fc.to_list()):
        if numpy.isinf(value) and infinity_mode == "del":
            # Marked with None so the row is dropped below.
            color_list.append(None)
        elif value >= cutoff_value:
            color_list.append("indianred")
        elif value <= -cutoff_value:
            color_list.append("lightblue")
        elif -cutoff_value < value < cutoff_value:
            color_list.append("grey")
        else:
            # Only reachable for NaN. Look the transcript up by position:
            # the index labels need not be 0..n-1 after the caller's filtering.
            raise ValueError("[Error]the log2FC of {} is {}".format(df["transcript_id"].iloc[position], value))
    df["color"] = color_list

    if infinity_mode == "del":
        df = df.dropna(axis=0, how="any", subset=["color"])
    else:
        # endPoint: every finite value lies inside [xaxis_min, xaxis_max], so
        # clipping moves only +inf to max+1 and -inf to min-1.
        df[log2fc_column] = log2fc.clip(lower=xaxis_min - 1, upper=xaxis_max + 1)


    # Select the genes to show. Genes can be given by gene_id or by gene_name;
    # gene_id has priority and the two kinds are never mixed.
    if len(gene_id_list) > 0:
        temp_df = df.loc[df["gene_id"].isin(gene_id_list), :]
    elif len(gene_name_list) > 0:
        temp_df = df.loc[df["gene_name"].isin(gene_name_list), :]
    else:
        # Default: the genes with the most transcripts (value_counts sorts by
        # count, descending).
        top_gene_ids = df["gene_id"].value_counts().head(default_gene_count).index
        temp_df = df.loc[df["gene_id"].isin(top_gene_ids), :]
    # Sort for a tidier layout.
    temp_df = temp_df.sort_values(by="gene_name", ascending=False)


    # Draw the figure
    fig = go.Figure()
    fig_trace = go.Scatter(x=temp_df[log2fc_column],
                        y=temp_df["gene_name"],
                        mode="markers",
                        marker={"color": temp_df["color"]})
    fig = fig.add_trace(fig_trace)
    # Figure layout
    layout = {"width": 250, "height": 500,
            "title": {"text": "5'-3' isoform expression",
                            "font": {"family": "Arial",
                                    "color": "black",
                                    "size": 12,},
                            "x": 0.6},
            "margin": {'l':15, 'r':15, 't':25, 'b':15},
            "paper_bgcolor": "white",
            "plot_bgcolor":"white",
            "xaxis": {"title": {"text": "log2FC({}/{})".format(cell_line_1, cell_line_2),
                                "font": {"family": "Arial",
                                        "color": "black",
                                        "size": 12,},
                                "standoff": 0.5,
                                },
                        # One unit of padding on each side leaves room for the endPoint markers.
                        "range": [xaxis_min-1, xaxis_max+1],
                        "showline": True,
                        "linecolor": "black",
                        "showticklabels": True,
                        "ticklen": 3,
                        "tickcolor": "black",
                        "ticks": "outside",
                        },
            "yaxis": {"showline": True,
                        "linecolor": "black",
                        "showticklabels": True,
                        "range": [-1, len(set(temp_df["gene_name"]))],
                        "tickfont": {"family": "Arial",
                                    "color": "black",
                                    "size": 10},
                        "dtick":1,
                        "ticks": "outside",
                        "ticklen": 3,
                        "tickcolor": "black",
                        }
            }
    fig = fig.update_layout(layout)

    return fig

In [61]:
if __name__ == "__main__":
    # load_data
    df = pandas.read_csv(input_df_filename, sep='\t')
    # NOTE(security): pickle.load can execute arbitrary code; only load pickle
    # files that were produced by this pipeline.
    with open(input_pkl_filename, 'rb') as file:
        total = pickle.load(file)

    # filter columns: keep the ids, the log2FC of the chosen pair and the
    # per-cell-line columns
    log2fc_column = "log2FC_{}@{}".format(cell_line_1, cell_line_2)
    new_columns = ["gene_id", "transcript_id", log2fc_column,
                   "{}".format(cell_line_1), "{}".format(cell_line_2)]
    df = df[new_columns]

    # filter gene_id: drop rows whose gene_id starts with "ENCLB".
    # .copy() makes the result an independent frame, so adding a column below
    # does not raise SettingWithCopyWarning.
    is_enclb = df["gene_id"].str.startswith("ENCLB")
    df = df.loc[~is_enclb, :].copy()

    # add gene_name, looked up in the unpickled object's gene_dict
    df["gene_name"] = df["gene_id"].map(lambda x: total.gene_dict[x].gene_name)

    # make plot
    fig = make_plot(df=df, cell_line_1=cell_line_1, cell_line_2=cell_line_2,
                    infinity_mode=infinity_mode, cutoff_value=cutoff_value,
                    gene_name_list=gene_name_list, gene_id_list=gene_id_list)

    # output plot
    fig.write_image(output_plot_filename)

    # log
    print("[{}]All blocks finished.".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))))



[Function]make_plot start.
	[Time]2023-08-18 10:32:58
	[Paraments]df: <...>
	[Paraments]cell_line_1: A673
	[Paraments]cell_line_2: Caco-2
	[Paraments]infinity_mode: del
	[Paraments]cutoff_value: <...>
	[Paraments]gene_name_list: []
	[Paraments]gene_id_list: []
[2023-08-18 10:32:58]make_plot finished.
[2023-08-18 10:32:59]All blocks finished.


---

Start

End

---

In [None]:
"""
1.读取含有每个转录本在不同细胞系中差异表达的表格
2.根据读取的数据，整理出一个df，一列保存每个转录本的log2FC，一列保存转录本所映射的基因
    2.1存在部分转录本只在一种细胞系中表达(log2FC为inf/-inf)，对于这种转录本，根据infinity_mode处理：endPoint--将其log2FC定为最大值+1 / 最小值-1；del--不展示
3.绘图
"""