In [1]:
import argparse
import time
from def_class import *
import def_function
import pandas
import numpy
import math
import pickle
import plotly.graph_objects as go
import re

In [2]:
__version__ = "V1.0(Editor) 2023-08-17"

In [16]:
# 前置参数-debug
cellLine1 = "A673"
cellLine2 = "Caco-2"
geneId = None
geneName = None
cutoffValue = 1.5
absolute_path = False
file_path = "F:/OneDrive/Master/Project/trans/data/"
input_pickle_filename = "0002_total_info.pkl"
input_transcript_expression_filename = "0004_relative_expression.tsv"
output_plot_path = "0005_plot/"

In [17]:
# 前置参数
parser = argparse.ArgumentParser()
parser.add_argument("--cellLine1", dest="cellLine1", required=True, type=str, default="", help="the cellLine 1")
parser.add_argument("--cellLine2", dest="cellLine2", required=True, type=str, default="", help="the cellLine 2")
parser.add_argument("--geneId", dest="geneId", required=False, type=str, default=None, help="=None, the gene need to be displayed")
parser.add_argument("--geneName", dest="geneName", required=False, type=str, default=None, help="=None, the gene need to be displayed")
parser.add_argument("--cutoffValue", dest="cutoffValue", required=False, type=float, default=1.5, help="=1.5, the cutoff value of log2FC about the significant expression")
parser.add_argument("--absolute_path", dest="absolute_path", action="store_true", help="use the absolute path")
parser.add_argument("--file_path", dest="file_path", required=False, type=str, default="", help="the path of data file")
parser.add_argument("--input_pickle_filename", dest="input_pickle_filename", required=False, type=str, default="0002_total_info.pkl", help="=0002_total_info.pkl, the output file of 0002.py")
parser.add_argument("--input_transcript_expression_filename", dest="input_transcript_expression_filename", required=False, type=str, default="0004_relative_expression.tsv", help="=0004_relative_expression.tsv, the output file of 0004.py")
parser.add_argument("--output_plot_path", dest="output_plot_path", required=False, type=str, default="0005_plot/", help="=0005_plot/, the output path of the plot")

args = parser.parse_args()
cellLine1 = args.cellLine1
cellLine2 = args.cellLine2
geneId = args.geneId
geneName = args.geneName
cutoffValue = args.cutoffValue
absolute_path = args.absolute_path
file_path = args.file_path
input_pickle_filename = args.input_pickle_filename
input_transcript_expression_filename = args.input_transcript_expression_filename
output_plot_path = args.output_plot_path

usage: ipykernel_launcher.py [-h] --cellLine1 CELLLINE1 --cellLine2 CELLLINE2
                             [--geneId GENEID] [--geneName GENENAME]
                             [--cutoffValue CUTOFFVALUE] [--absolute_path]
                             [--file_path FILE_PATH]
                             [--input_pickle_filename INPUT_PICKLE_FILENAME]
                             [--input_transcript_expression_filename INPUT_TRANSCRIPT_EXPRESSION_FILENAME]
                             [--output_plot_path OUTPUT_PLOT_PATH]
ipykernel_launcher.py: error: the following arguments are required: --cellLine1, --cellLine2


SystemExit: 2


To exit: use 'exit', 'quit', or Ctrl-D.



In [None]:
# 补全路径
if absolute_path is False:
    input_pickle_filename = "{}{}".format(file_path, input_pickle_filename)
    input_transcript_expression_filename = "{}{}".format(file_path, input_transcript_expression_filename)
    output_plot_path = "{}{}".format(file_path, output_plot_path)

In [None]:
# 打印参数
print('\n')
print("[Date]{}".format(time.strftime("%Y-%m-%d, %H:%M:%S", time.localtime(time.time()))))
print("[Script]{}".format(__file__))
print("[Version]{}".format(__version__))
print("[Version]def_function: {}".format(def_function.__version__))
print("[Version]def_class: {}".format(class_version))
print("[Parament]cellLine1: {}".format(cellLine1))
print("[Parament]cellLine2: {}".format(cellLine2))
print("[Parament]geneId: {}".format(geneId))
print("[Parament]geneName: {}".format(geneName))
print("[Parament]cutoffValue: {}".format(cutoffValue))
print("[Parament]input_pickle_filename: {}".format(input_pickle_filename))
print("[Parament]input_transcript_expression_filename: {}".format(input_transcript_expression_filename))
print("[Parament]output_plot_path: {}".format(output_plot_path))
print('\n')

In [None]:
# 隐藏函数
def select_gene_to_show(df, cell_line_1, cell_line_2, cutoff_value=1, reset_value=0.1, tab_level=0):
    """
    input:
        df, pandas.DataFrame, expression_df
        cell_line_1, str, ...
        cell_line_2, str, ...
        cutoff_value, int or float, gene中必须含有在两种组织类型中特异性表达的transcript, 该阈值即为衡量特异性表达的阈值
        reset_value, float, 若log2FC为numpy.nan, 则将nan修改为该值
    change:
        挑选可以展示的gene
        选择这样的gene, 这个gene存在一些transcript分别在两种组织类型中特异性表达
    output:
        selected_gene, list, 建议展示的gene
    """
    expression_df = df
    cellline1 = cell_line_1
    cellline2 = cell_line_2
    cutoff_value = cutoff_value
    reset_value = 0.1

    selected_gene = []  # [[<gene_id>, <the sum of abs(log2FC) of the gene], ...]

    expression_df_group = expression_df.groupby("gene_id")
    for gene_id in expression_df_group.groups.keys():
        temp = expression_df_group.get_group(gene_id)
        temp = temp["log2FC_{}@{}".format(cellline1, cellline2)].to_list()
        temp = [reset_value if numpy.isnan(i) else i for i in temp]
        if any([i>cutoff_value for i in temp]) and any([i<-cutoff_value for i in temp]):
            temp = [abs(i) for i in temp]
            temp = sum(temp)
            selected_gene.append([gene_id, temp])
        else:
            continue
    selected_gene = sorted(selected_gene, key=lambda x: x[1], reverse=True)
    #selected_gene = selected_gene[0][0]
    selected_gene = [i[0] for i in selected_gene]

    return selected_gene


# 隐藏函数
def exonRange_to_dict(exonRange, tab_level=0):
    """
    input:
        exonRange, dict, {"<exon_start>-<exon_end>": <relative expression>, ...}
    change:
        能够将exon_expression_in_cellline1中的exon_range转换为数字
        将每一个exon_range拆分为range(<exon_start>, <exon_end>+1, by=1), 存储到df中, 对应位置的值即为含有该位点的exon的expression的加和
    output:
        exon_site_expression, dict, {<position in sequence>: <expression>, ...}
    """
    exonRange_dict = exonRange

    exon_site_expression = {}  # {<int=position>: <float=expression>, ...}
    for exon_range, exon_expression in exonRange_dict.items():
        exon_range = [int(i) for i in exon_range.split('-')]
        exon_start = exon_range[0]
        exon_end = exon_range[1]
        
        # add position value
        for position in range(exon_start, exon_end+1, 1):
            temp = exon_site_expression.get(position, 0)
            temp = temp + exon_expression
            exon_site_expression[position] = temp
    return exon_site_expression


def classify_significant_transcript(df, geneId, cutoffValue, cellLine1, cellLine2, tab_level=0):
    """
    input:
        df, pandas.DataFrame, expression_df
        geneId, str, 要判断的gene_id
        cutoffValue, float, 判断transcript的显著性时用到的log2FC阈值
        cellLine1, str, ...
        cellLine2, str, ...
    change:
        判断指定geneId的transcript是否于cellLine中显著表达
        根据log2FC对transcript进行分类
            1. 声明一个dict, 以存储在cellLine1或cellLine2中特异性表达以及非特异性表达的transcriptId
                {"<cellLine1>": [<transcriptId>, ...], "<cellLine2>": [<transcriptId>, ...], "other": [<transcriptId>, ...]}
            2. 根据log2FC对transcript的特异性表达进行判断
                若log2FC>=cutoff_value, 则于cellLine1中特异性表达, 若log2FC<=-cutoff_value, 则于cellLine2中特异性表达
                若-cutoff_value<log2FC<cutoff_value, 则无特异性表达
                !!!若log2FC is numpy.nan, 则判断cellLine1的相对表达值是否为0, 若不为0则于cellLine1特异性表达, 否则再判断cellLine2的相对表达值是否为0, 若不为0则于cellLine2特异性表达
    output:
        significant_info, dict, {<cellLine1>: [<transcript_id>, ...],
                                 <cellLine2>: [<transcript_id>, ...],
                                 "other": [<transcript_id>, ...]}
    """
    df = df
    geneId = geneId
    cutoffValue = cutoffValue
    cellLine1 = cellLine1
    cellLine2 = cellLine2

    temp_df = df.loc[df["gene_id"]==geneId, :]
    significant_info = {cellLine1: [], cellLine2: [], "other": []}

    for i in temp_df.index:
        transcript_id = temp_df.at[i, "transcript_id"]
        log2FC = temp_df.at[i, "log2FC_{}@{}".format(cellLine1, cellLine2)]

        if log2FC >= cutoffValue:
            significant_info[cellLine1].append(transcript_id)
        elif log2FC <= -cutoffValue:
            significant_info[cellLine2].append(transcript_id)
        else:
            significant_info["other"].append(transcript_id)
    
    return significant_info


In [None]:
# 功能函数
def getGeneId(total, geneId, geneName, df, cellLine1, cellLine2):
    """
    input:
        total, Total Object
        geneId, str/None
        geneName, str/None
    change:
        根据给定的geneId或geneName, 返回geneId
        若geneId与geneName都为None, 则根据规则挑一个geneId出来
    output:
        geneId, str
    """
    total = total
    geneId = geneId
    geneName = geneName
    df = df
    cellLine1 = cellLine1
    cellLine2 = cellLine2

    if geneId is None:
        if geneName is None: 
            selected_gene_id = select_gene_to_show(df=df,
                                                   cell_line_1=cellLine1,
                                                   cell_line_2=cellLine2,
                                                   cutoff_value=1.5,
                                                   reset_value=0.1)
            geneId = selected_gene_id[0]
        else:
            for geneId, geneObject in total.gene_dict.items():
                if geneObject.gene_name == geneName:
                    return geneId
            raise ValueError("[Error]The geneName is not existed in the data.")
    else:
        geneId = geneId

    return geneId


def getExonExpression(total, geneId, cellLine):
    """
    input:
        total, Total Object
        geneId, str
        cellLine, str
    change:
        获取在指定gene的指定cellLine中存在表达的exon的范围以及表达量
    output:
        dict, {"<geneId>": {"<exon1_start>-<exon1_end>": <value>,
                            "<exon2_start>-<exon2_end>": <value>, ...}}
    """
    total = total
    geneId = geneId
    cellLine = cellLine

    exonExpressionInfo = total.gene_dict[geneId].exonExpression

    tempDict = {}
    for exon, exonDict in exonExpressionInfo.items():
        cellLineExpression = exonDict["cellLineExpression"].get(cellLine, 0)
        if cellLineExpression != 0:
            tempDict[exon] = cellLineExpression

    return {geneId: tempDict}


In [None]:
# 绘图函数
def make_gene_model_plot(exon_combination, gene_id, gene_name, gene_strand, gene_start, gene_end, tab_level=0):
    exon_combination = exon_combination
    gene_id = gene_id
    gene_name = gene_name
    gene_strand = gene_strand
    gene_start = gene_start
    gene_end = gene_end

    result = {}  # {"fig": <fig>, "xaxis_min": <xaxis_min>, "xaxis_max": <xaxis_max>}

    # 获取指定gene的所有exon的start, end, length信息
    exon_df = pandas.DataFrame(columns=["exon_start", "exon_end", "exon_length"])
    exon_start = []
    exon_end = []
    exon_length = []
    for exon_range in exon_combination[gene_id]:
        exon_range = [int(i) for i in exon_range.split('-')]
        exon_start.append(exon_range[0])
        exon_end.append(exon_range[1])
        exon_length.append(exon_range[1]-exon_range[0])
    exon_df["exon_start"] = exon_start
    exon_df["exon_end"] = exon_end
    exon_df["exon_length"] = exon_length

    # 根据gene的strand, 计算exon相对于gene start的位置
    if gene_strand =='-':
        exon_df["exon_start"] = gene_end - exon_df["exon_end"]
        exon_df["exon_end"] = exon_df["exon_start"] + exon_df["exon_length"]
    else:
        exon_df["exon_start"] = exon_df["exon_start"] - gene_start
        exon_df["exon_end"] = exon_df["exon_start"] + exon_df["exon_length"]

    # 整理exon_df的格式
    # 该exon_df存储了指定gene的每一个exon相对于该gene的起始起点的位置信息(已考虑strand)
    exon_df = exon_df.sort_values(by="exon_end")
    exon_df = exon_df.sort_values(by="exon_start")
    exon_df = exon_df.reset_index(drop=True)

    # 获取x轴的范围
    xaxis_min = min(exon_df["exon_start"])
    xaxis_max = max(exon_df["exon_end"])
    result["xaxis_min"] = xaxis_min
    result["xaxis_max"] = xaxis_max

    # 将exon的range转换为柱状图的坐标
    fig_x = []
    for i in exon_df.index:
        temp = list(range(exon_df.at[i, "exon_start"], exon_df.at[i, "exon_end"]+1))
        fig_x = fig_x + temp
    fig_x = list(set(fig_x))

    # 根据exon_df绘制结构图Barplot
    fig = go.Figure()
    fig_trace = go.Bar(x=fig_x,
                    y=[10]*len(fig_x),
                    width=1,
                    marker={"color": "black",
                            "line": {"color": "black",
                                        "width": 0}
                            })
    fig = fig.add_shape(type="line",
                        x0=xaxis_min, y0=5, x1=xaxis_max, y1=5,
                        line={"color": "black",
                            "width": 1})
    fig = fig.add_trace(fig_trace)
    layout = {"width": 700, "height": 50,
              "title": {"text": "{} gene model".format(gene_name),
                        "font": {"family": "Arial",
                                 "color": "black",
                                 "size": 12,},
                        "x": 0.1},
            "margin": {'l':15, 'r':15, 't':25, 'b':15}, 
            "xaxis": {"showline": False,
                        "showticklabels": False,
                        "range": [xaxis_min, xaxis_max],
                        "showgrid": False,},
            "yaxis": {"showline": False,
                        "showticklabels": False,
                        "showgrid": False,
                        },
            "paper_bgcolor": "white",
            "plot_bgcolor":"white",
            }
    fig = fig.update_layout(layout)
    result["fig"] = fig

    return result


def make_exon_plot(exon_cellline_1, exon_cellline_2, gene_id, title_list= [], gene_strand='+', gene_start=0, gene_end=0, xaxis_min=0, xaxis_max=0 ,tab_level=0):
    """
    input:
        gene_id, str, ...
        exon_cellline_1, dict, 在cell line 1中每个基因的每个exon的相对表达值
        exon_cellline_2, dict, 在cell line 2中每个基因的每个exon的相对表达值
        title_list, list, [<the name of cell line 1>, <the name of cell line 2>]
        gene_strand, str, 指定gene的+/-链
        gene_start, int, 指定gene的start
        gene_end, int, 指定gene的end
        xaxis_min, int, the min value of xaxis
        xaxix_max, int, the max value of xaxis
    """
    gene_id = gene_id
    exon_cellline_1 = exon_cellline_1
    exon_cellline_2 = exon_cellline_2
    title_list = title_list
    gene_strand = gene_strand
    gene_start = gene_start
    gene_end = gene_end
    xaxis_min = xaxis_min
    xaxis_max = xaxis_max

    # 准备绘图所用的数据
    fig1_xy = exonRange_to_dict(exon_cellline_1[gene_id], tab_level=tab_level+1)
    fig1_x = []
    fig1_y = []
    # 根据链的+/-修改position为相对于gene第一个exon起始位点的位置
    fig1_position = {}
    if gene_strand == "-":
        for position, expression in fig1_xy.items():
            fig1_position[gene_end-position] = expression
    else:
        for position, expression in fig1_xy.items():
            fig1_position[position-gene_start] = expression
    for position, expression in fig1_position.items():
        fig1_x.append(position)
        fig1_y.append(expression)


    fig2_xy = exonRange_to_dict(exon_cellline_2[gene_id], tab_level=tab_level+1)
    fig2_x = []
    fig2_y = []
    # 根据链的+/-修改position为相对于gene第一个exon起始位点的位置
    fig2_position = {}
    if gene_strand == "-":
        for position, expression in fig2_xy.items():
            fig2_position[gene_end-position] = expression
    else:
        for position, expression in fig2_xy.items():
            fig2_position[position-gene_start] = expression
    for position, expression in fig2_position.items():
        fig2_x.append(position)
        fig2_y.append(expression)

    # 设定y轴的最大范围
    y_max = max(fig1_y+fig2_y)

    # 开始绘图
    fig1 = go.Figure()
    fig1_trace = go.Bar(x=fig1_x,
                        y=fig1_y,
                        width=1,
                        marker={"color": "indianred",
                                "line": {"width": 0}})
    fig2 = go.Figure()
    fig2_trace = go.Bar(x=fig2_x,
                        y=fig2_y,
                        width=1,
                        marker={"color": "lightblue",
                                "line": {"width": 0}})

    layout = {"width": 700, "height": 100,
              "title": {"text": "cell_line",  # 在后面会更改, 这一行不必修改
                        "font": {"family": "Arial",
                                "color": "black",
                                "size": 12},
                        'x': 0.1,},
              "margin": {'l':15, 'r':15, 't':25, 'b':15}, 
              "barmode": "stack",
              "font": {"family": "Arial",
                       "color": "black",
                       "size": 10},
              "xaxis": {"title": {"text": "",
                                  "font": {"family": "Arial",
                                        "color": "black",
                                        "size": 12},
                                "standoff": 0.1,
                                },
                        "showline": True,
                        "linecolor": "black",
                        "linewidth": 1,
                        "showticklabels": False,
                        "range": [xaxis_min, xaxis_max]},
            "yaxis": {"title": {"text": "",
                                "font": {"family": "Arial",
                                         "color": "black",
                                         "size": 12},
                                "standoff": 0,},
                        "showline": False,
                        "range": [0, y_max],
                        "showgrid": False,
                        "showticklabels": False,
                        },
            "plot_bgcolor": "white",
            "bargap": 0, "bargroupgap": 0,}

    fig1 = fig1.update_layout(layout)
    fig2 = fig2.update_layout(layout)

    fig1 = fig1.update_layout({"title": {"text": title_list[0]}})
    fig2 = fig2.update_layout({"title": {"text": title_list[1]}})

    fig1 = fig1.add_trace(fig1_trace)
    fig2 = fig2.add_trace(fig2_trace)

    return [fig1, fig2]


def make_transcript_plot(df, gene_id, cutoff_value, cell_line_1, cell_line_2, gene_strand, gene_start, gene_end, xaxis_min, xaxis_max, tab_level=0):
    df = df
    gene_id = gene_id
    cutoff_value = cutoff_value
    cell_line_1 = cell_line_1
    cell_line_2 = cell_line_2
    gene_strand = gene_strand
    gene_start = gene_start
    gene_end = gene_end
    xaxis_min = xaxis_min
    xaxis_max = xaxis_max

    # 将transcript分为三种类型: 于cellline1特异性表达，于cellline2特异性表达，非特异性表达
    significant_info = classify_significant_transcript(df=df, geneId=gene_id, cutoffValue=cutoff_value,
                                                       cellLine1=cell_line_1, cellLine2=cell_line_2,
                                                       tab_level=tab_level+1)
    # 设定三种类型transcript的颜色
    color_dict = {cell_line_1: "indianred", cell_line_2: "lightblue", "other": "grey"}

    # 获取指定gene的全部transcript的信息
    temp_df = df.loc[df["gene_id"]==gene_id, :].copy()

    # 补充transcript的start及end信息
    temp_df["transcript_start"] = temp_df["transcript_id"].map(lambda x: total.gene_dict[gene_id].transcript_dict[x]["range"][0])
    temp_df["transcript_end"] = temp_df["transcript_id"].map(lambda x: total.gene_dict[gene_id].transcript_dict[x]["range"][1])

    num=1
    fig = go.Figure()
    # 绘制在cell_line特异性表达以及非特异性表达的trace
    for name in [cell_line_2, "other", cell_line_1]:
        significant_id_list = significant_info[name]
        temp_df_name = temp_df.loc[temp_df["transcript_id"].isin(significant_id_list), :].copy()

        # 根据strand的方向，修改transcript的start与end为相对于gene的起点的位置
        if gene_strand == '-':
            temp_df_name["base"] = gene_end - temp_df_name["transcript_end"]
        else:
            temp_df_name["base"] = temp_df_name["transcript_start"] - gene_start
        temp_df_name["length"] = temp_df_name["transcript_end"] - temp_df_name["transcript_start"]
        # 整理transcript的排序
        temp_df_name = temp_df_name.sort_values(by="length", ascending=False)
        temp_df_name = temp_df_name.sort_values(by="base", ascending=False)

        fig_x = [i for i in range(num, num+temp_df_name.shape[0])]
        num = num + temp_df_name.shape[0]
        fig_trace = go.Bar(x=temp_df_name["length"],
                        y=fig_x,
                        base=temp_df_name["base"],
                        name=name,
                        width=1,
                        orientation='h',
                        marker={"color": color_dict[name]})
        fig = fig.add_trace(fig_trace)

    layout = {"width":700, "height":200,
                "title": {"text": "5'-3' isoforms count",
                            "font": {"family": "Arial",
                                "size": 12,
                                "color": "black"},
                            'x': 0.1},
                "margin": {'l':15, 'r':15, 't':25, 'b':15}, 
                "xaxis": {"showline": False,
                            "showticklabels": False,
                            "range": [xaxis_min, xaxis_max],
                            "showgrid": False,},
                "yaxis": {"showline": False,
                            "showticklabels": False,
                            "showgrid": False,
                            },
                "showlegend": False,
                "paper_bgcolor": "white",
                "plot_bgcolor": "white",
                }
    fig = fig.update_layout(layout)

    return fig


In [None]:
if __name__ == "__main__":
    # load data
    with open(input_pickle_filename, 'rb') as file:
        total = pickle.load(file)
    expression_df = pandas.read_csv(input_transcript_expression_filename, sep='\t', index_col=None)
    
    # get exon combination for each gene
    exon_combination = total.get_exon_combination()

    # 获取展示用的gene_id
    geneId = getGeneId(total=total, geneId=geneId, geneName=geneName,
                       df=expression_df, cellLine1=cellLine1, cellLine2=cellLine2)
    geneName = total.gene_dict[geneId].gene_name
    print("\t[Reseult]selected gene id is: {}, gene name is: {}".format(geneId, geneName))

    # 获取展示用的gene的strand, start, end信息
    geneStrand = total.gene_dict[geneId].strand
    geneStart = total.gene_dict[geneId].start
    geneEnd = total.gene_dict[geneId].end

    # make gene model plot
    gene_model = make_gene_model_plot(exon_combination=exon_combination,
                                      gene_id=geneId, gene_name=geneName, gene_strand=geneStrand,
                                      gene_start=geneStart, gene_end=geneEnd, tab_level=0)
    xaxis_min = gene_model["xaxis_min"]
    xaxis_max = gene_model["xaxis_max"]
    fig0 = gene_model["fig"]

    # 获取指定gene在cellLine1及cellLine2中的exon的表达信息
    exon_expression_in_cellline1 = getExonExpression(total, geneId, cellLine1)
    exon_expression_in_cellline2 = getExonExpression(total, geneId, cellLine2)
    # make exon plot
    [fig1, fig2] = make_exon_plot(exon_cellline_1=exon_expression_in_cellline1,
                                  exon_cellline_2=exon_expression_in_cellline2,
                                  title_list= [cellLine1, cellLine2],
                                  gene_id=geneId,
                                  gene_strand=geneStrand, gene_start=geneStart, gene_end=geneEnd,
                                  xaxis_min=xaxis_min, xaxis_max=xaxis_max)
    # make transcript plot
    fig3 = make_transcript_plot(df=expression_df, gene_id=geneId, cutoff_value=cutoffValue,
                                cell_line_1=cellLine1, cell_line_2=cellLine2,
                                gene_strand=geneStrand, gene_start=geneStart, gene_end=geneEnd,
                                xaxis_min=xaxis_min, xaxis_max=xaxis_max)

    # output
    fig0.write_image("{}0005_{}_gene_model.svg".format(output_plot_path, geneName))
    fig1.write_image("{}0005_{}_{}.svg".format(output_plot_path, geneName, cellLine1))
    fig2.write_image("{}0005_{}_{}.svg".format(output_plot_path, geneName, cellLine2))
    fig3.write_image("{}0005_{}_transcript.svg".format(output_plot_path, geneName))

    print("[{}]All blocks finished.".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))))

	[Reseult]selected gene id is: ENCLB767XEKG000058938, gene name isENCLB767XEKG000058938
[2023-08-17 20:24:32]All blocks finished.


---

start

end

---

In [None]:
"""
- 运行该脚本前需指定需要分析的两种组织类型
- 该脚本分析Total对象中每一个gene分别在两种组织类型中的exon的相对表达量
    - 先统计每个gene的exon
        - 以dict形式返回gene所含的exon, {"<gene_id>": ["<exon1_start>-<exon1_end>", ...], ...}
    - 再根据0004.tsv中的组织相对表达量, 分别计算exon在两种类型组织中的相对表达量
        - 声明一个dict, {"<gene_id>": {"<cell_line_1>": {"<exon1_start>-<exon1_end>": <relative expression>,
                                                        "<exon2_start>-<exon2_end>": <relative expression>, ...},
                                      "<cell_line_2>": {"<exon1_start>-<exon1_end>": <relative expression>,
                                                        "<exon2_start>-<exon2_end>": <relative expression>, ...}
                                      }
                        }
        - 写一个函数
            - 声明一个dict, {"<gene_id>": {"<exon1_start>-<exon1_end>": <value>,
                                          "<exon2_start>-<exon2_end>": <value>, ...
                                          }
                            }
            - 该函数可确定在一种组织类型中存在表达的transcript, 筛选得到这些transcript的信息
            - 根据Total.gene_dict, 按照transcript对应的gene_id及transcript_id检索该transcript的exon组成
                - 更新该函数中声明的dict, 对应的exon的value = 原value值 + 该transcript的相对表达值
        - 在一种组织类型中, 根据transcript的相对表达量，计算exon的相对表达量
            - 建立一个dict存储该样本中不同gene的exon的相对表达量, {<gene_id>: {"<exon-start>,<exon-end>": <相对表达量>, ...}, ...}
            - 在Gene对象中，已记录了该gene的transcript的exon, 对transcript的每一个exon, 取上一步建立的dict中的exon, exon相对表达量=exon相对表达量+transcript的相对表达量
            - 最终, dict中存储了一种组织类型中不同gene的不同exon的相对表达量
    - 以df保存结果
    - 可以以柱状图形式展示exon的相对表达量

1. Gene新增方法, 整理该gene的全部exon, 并以dict形式返回{<gene_id_1>: [[<exon_1_start>, <exon_1_end>],
                                                                   [<exon_2_start>, <exon_2_end>], ...]
                                                                  ],
                                                     ...}
2. 根据0004.xlsx及0001.tsv文件, 对每一个gene的transcript的特异性表达进行归类，判断transcript富集于两种组织中的哪一种
    2.1 根据cell_line_1及cell_line_2及log2FC的正负号进行判断, log2FC>0则表示富集于cell_line_1, log2FC<0则表示富集于cell_line_2
3. 对两种cell_line的每一个gene，统计gene中exon的使用次数
    3.1 对于cell_line_1的gene, 取在cell_line_1中对应gene中表达的对应transcript。在Gene对象中，已存储了该transcript所对应的exon, 所以 该exon的相对表达量=该exon的相对表达量+该transcript在该组织中的的相对表达量
    3.2 可以以柱状图形式呈现一个gene中所有exon的相对表达量
"""