In [1]:
import pandas
import argparse
import time
import def_function
from def_class import *
import pickle

In [2]:
# Version tag of this script; it is echoed in the "[Version]" line of the parameter report.
__version__ = "V4.1(Editor) 2023-08-07"

In [3]:
# Preset parameters (debug): hard-coded values for the same names that the
# command-line cell assigns from argparse.
# NOTE(review): file_path is an absolute path on one local machine; adjust it before
# running this cell elsewhere.
ATSS_APA_cutoff = 3  # cutoff on the number of transcripts of an ATSS-APA gene
absolute_path = False  # False: the file names below get file_path prepended
file_path = "F:/OneDrive/Master/Project/trans/data/"
input_sample_info_filename = "0000_sample_info.tsv"
input_pkl_filename = "0001_total_info.pickle"
output_df_filename = "0002_TSS_TES.tsv"
output_check_df_filename = "0002_result_check.tsv"
output_pkl_filename = "0002_total_info.pkl"

In [None]:
# Preset parameters: read every setting from the command line.
# Each destination name is derived by argparse from the long option string, and all
# options are optional, so neither is spelled out explicitly.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--ATSS_APA_cutoff",
    type=int,
    default=3,
    help="=3,\t the cutoff value of isoform of ATSS-APA gene",
)
parser.add_argument(
    "--absolute_path",
    action="store_true",
    help="use absolute path",
)
parser.add_argument(
    "--file_path",
    type=str,
    default="",
    help="the path of data directory",
)
parser.add_argument(
    "--input_sample_info_filename",
    type=str,
    default="0000_sample_info.tsv",
    help="=0000_sample_info.tsv,\t the file of metadata",
)
parser.add_argument(
    "--input_pkl_filename",
    type=str,
    default="0001_total_info.pickle",
    help="=0001_total_info.pickle,\t the output file of 0001.py",
)
parser.add_argument(
    "--output_df_filename",
    type=str,
    default="0002_TSS_TES.tsv",
    help="=0002_TSS_TES.tsv,\t the output file of this script",
)
parser.add_argument(
    "--output_check_df_filename",
    type=str,
    default="0002_result_check.tsv",
    help="=0002_result_check.tsv,\t the output file of this script",
)
parser.add_argument(
    "--output_pkl_filename",
    type=str,
    default="0002_total_info.pkl",
    help="=0002_total_info.pkl,\t the output file of this file",
)

# Copy the parsed values into the module-level names used by the following cells.
args = parser.parse_args()
ATSS_APA_cutoff = args.ATSS_APA_cutoff
absolute_path = args.absolute_path
file_path = args.file_path
input_sample_info_filename = args.input_sample_info_filename
input_pkl_filename = args.input_pkl_filename
output_df_filename = args.output_df_filename
output_check_df_filename = args.output_check_df_filename
output_pkl_filename = args.output_pkl_filename

In [4]:
# Complete the paths: prepend the data directory to every file name unless the
# file names were given as absolute paths.
if absolute_path is False:
    # Plain concatenation glues the directory name onto the file name when
    # file_path has no trailing separator ("data" + "x.tsv" -> "datax.tsv"), so add
    # one when it is missing. An empty file_path is left empty, which keeps the file
    # names relative to the working directory exactly as before.
    data_dir = file_path
    if data_dir and not data_dir.endswith(("/", "\\")):
        data_dir += "/"
    input_sample_info_filename = "{}{}".format(data_dir, input_sample_info_filename)
    input_pkl_filename = "{}{}".format(data_dir, input_pkl_filename)
    output_df_filename = "{}{}".format(data_dir, output_df_filename)
    output_check_df_filename = "{}{}".format(data_dir, output_check_df_filename)
    output_pkl_filename = "{}{}".format(data_dir, output_pkl_filename)

In [None]:
# Print the run parameters
print('\n')
print("[Date]{}".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))))
# __file__ is only defined when the code runs as a script; inside a notebook kernel
# the bare name raises NameError, so fall back to a placeholder there.
print("[script]{}".format(globals().get("__file__", "<interactive>")))
print("[Version]{}".format(__version__))
print("[Version]def_function: {}".format(def_function.__version__))
# class_version is brought into scope by the star import from def_class.
print("[Version]def_class: {}".format(class_version))
print("[Parament]ATSS_APA_cutoff: {}".format(ATSS_APA_cutoff))
print("[Parament]input_sample_info_filename: {}".format(input_sample_info_filename))
print("[Parament]input_pkl_filename: {}".format(input_pkl_filename))
print("[Parament]output_df_filename: {}".format(output_df_filename))
print("[Parament]output_check_df_filename: {}".format(output_check_df_filename))
print("[Parament]output_pkl_filename: {}".format(output_pkl_filename))
print('\n')

In [None]:
"""
存在一个问题：
    在统计ATSS-APA类型基因时，需要isoform类型>3
    但原0002.py在统计ATSS-APA类型基因时，实际筛选条件为isoform类型>2
    因此需要重构0002.py

思路：
    1.重构基因分类方法，应该向gene对象添加相应的基因类型属性，而非根据df进行分类
    2.确定原0002.py都做了什么，尽量维持输出数据格式一致
"""

In [5]:
@def_function.log
def loadSampleInfo(filename):
    """
    Read the sample metadata table and map every cell line to its samples.

    input:
        filename, str, location of a tab-separated file that has the columns
            "cell_line" and "GEO_accession"
    change:
        read the metadata and group the GEO accessions by cell line
    output:
        dict, {<cellLine1>: [<GSM1>, <GSM2>, ...],
               <cellLine2>: ...}
    """
    metadata = pandas.read_csv(filename, sep='\t')

    # Iterating a groupby yields (group key, sub-frame) pairs, one per cell line.
    return {
        cell_line: members["GEO_accession"].to_list()
        for cell_line, members in metadata.groupby(by="cell_line")
    }


def getTranscriptRangeExpressedInCellline(geneObject, sampleList, tab_level=0):
    """
    Collect the TSS/TES of the transcripts of a gene that are expressed in a cell line.

    input:
        geneObject, Gene object, read through its ``strand`` and ``transcript_dict``
            attributes; every transcript is accessed like a dict with the keys
            "countsExpression" and "range"
        sampleList, list, names of the samples that belong to the cell line
        tab_level, int, not used by this function
    change:
        a transcript counts as expressed when its "countsExpression" value is
        non-zero for at least one sample of sampleList (a missing sample counts as 0)
    output:
        dict, {"TSS": <set of start sites>, "TES": <set of end sites>,
               "num": <number of expressed transcripts>}
    """
    expressed_total = 0
    start_sites = set()
    end_sites = set()

    for transcript in geneObject.transcript_dict.values():
        counts = transcript["countsExpression"]
        if not any(counts.get(sample, 0) != 0 for sample in sampleList):
            # Not expressed in any sample of this cell line.
            continue

        expressed_total += 1
        span = transcript.get("range")
        if geneObject.strand == '-':
            # On the minus strand the second coordinate is taken as the start site
            # and the first one as the end site.
            start_sites.add(span[1])
            end_sites.add(span[0])
        else:
            start_sites.add(span[0])
            end_sites.add(span[1])

    return {"TSS": start_sites, "TES": end_sites, "num": expressed_total}


def get_check_df(total, sampleInfo, cellLine, tab_level=0):
    """
    Build the per-gene check table for one cell line, or for all genes.

    input:
        total, Total, read through its ``gene_dict`` attribute ({gene_id: gene object})
        sampleInfo, dict-like, ``sampleInfo.get(cellLine)`` must give the sample
            names of the cell line
        cellLine, str, a cell line name, or "all"
        tab_level, int, not used by this function
    change:
        build a DataFrame indexed by gene_id with the columns
        ["strand", "gene_biotype", "start_counts", "end_counts",
         "transcript_counts", "cell_line", "gene_classified"].
        For "all", every gene is reported using the module-level TSS_TES_dict and the
        gene's own gene_classified attribute. Otherwise only genes with at least one
        transcript expressed in the cell line are reported, and they are classified
        from the expressed transcripts alone.
    output:
        check_df, pandas.DataFrame
    """
    column_names = ["strand", "gene_biotype", "start_counts", "end_counts",
                    "transcript_counts", "cell_line", "gene_classified"]
    rows = {}

    if cellLine == "all":
        # NOTE(review): TSS_TES_dict is not a parameter; it is looked up as a global
        # and therefore has to exist before this branch runs.
        for gene_id, gene_object in total.gene_dict.items():
            boundaries = TSS_TES_dict.get(gene_id)
            rows[gene_id] = [
                gene_object.strand,
                gene_object.gene_biotype,
                len(boundaries.get("range_start")),
                len(boundaries.get("range_end")),
                len(gene_object.transcript_dict),
                "all",
                gene_object.gene_classified,
            ]
    else:
        samples = sampleInfo.get(cellLine)
        for geneId, geneObject in total.gene_dict.items():
            expressed = getTranscriptRangeExpressedInCellline(geneObject=geneObject,
                                                              sampleList=samples)
            if expressed["num"] == 0:
                # No transcript of this gene is expressed in the cell line.
                continue

            startNum = len(expressed["TSS"])
            endNum = len(expressed["TES"])
            # A single site gives "TSS" / "PAS"; any other count gives "ATSS" / "APA".
            start_label = "TSS" if startNum == 1 else "ATSS"
            end_label = "PAS" if endNum == 1 else "APA"
            rows[geneId] = [
                geneObject.strand,
                geneObject.gene_biotype,
                startNum,
                endNum,
                expressed["num"],
                cellLine,
                "{}-{}".format(start_label, end_label),
            ]

    return pandas.DataFrame.from_dict(rows, orient="index", columns=column_names)


@def_function.log
def count_TSS_TES(check_df, ATSS_APA_cutoff, tab_level=0):
    """
    Count genes and transcripts per cell line, gene biotype and gene class.

    input:
        check_df, pandas.DataFrame, must provide the columns "cell_line",
            "gene_biotype", "gene_classified" and "transcript_counts"
        ATSS_APA_cutoff, int, the cutoff value of isoform of ATSS-APA gene: ATSS-APA
            genes with fewer transcripts than this are left out of the statistics
        tab_level, int, not used by this function
    change:
        for every cell line and gene biotype, count the genes of each of the four
        gene classes and sum their transcript_counts. A class that does not occur in
        a group is reported as 0 instead of raising KeyError.
    output:
        df, pandas.DataFrame, columns=["cellLine", "geneOrTranscript", "geneBiotype",
            "TSS-PAS", "TSS-APA", "ATSS-PAS", "ATSS-APA"], sorted in descending order
            by the first three columns
    """
    class_columns = ["TSS-PAS", "TSS-APA", "ATSS-PAS", "ATSS-APA"]
    kept_biotypes = ("protein_coding", "un_classified")
    rows = []

    for cellLine, cell_df in check_df.groupby("cell_line"):
        # Drop ATSS-APA genes that have fewer than ATSS_APA_cutoff transcripts.
        too_few_isoforms = (
            (cell_df["gene_classified"] == "ATSS-APA")
            & (cell_df["transcript_counts"] < ATSS_APA_cutoff)
        )
        # Reduce gene_biotype to three values: protein_coding, un_classified and
        # non_protein_coding (everything else). assign() works on a new frame, so the
        # caller's check_df is never modified.
        cell_df = cell_df.loc[~too_few_isoforms].assign(
            gene_biotype=lambda frame: frame["gene_biotype"].map(
                lambda x: x if x in kept_biotypes else "non_protein_coding"
            )
        )

        for biotype, biotype_df in cell_df.groupby("gene_biotype"):
            # reindex() guarantees one entry per gene class, filling absent ones with 0.
            gene_counts = (
                biotype_df["gene_classified"]
                .value_counts()
                .reindex(class_columns, fill_value=0)
            )
            rows.append([cellLine, "gene", biotype, *gene_counts.tolist()])

            # Number of transcripts carried by the genes of each class.
            transcript_sums = (
                biotype_df.groupby("gene_classified")["transcript_counts"]
                .sum()
                .reindex(class_columns, fill_value=0)
            )
            rows.append([cellLine, "transcript", biotype, *transcript_sums.tolist()])

    # Arrange the result table.
    df = pandas.DataFrame(rows, columns=["cellLine", "geneOrTranscript", "geneBiotype", "TSS-PAS", "TSS-APA", "ATSS-PAS", "ATSS-APA"])
    df = df.sort_values(["cellLine", "geneOrTranscript", "geneBiotype"], ignore_index=True, ascending=False)

    return df


In [6]:
if __name__ == "__main__":
    # Load data.
    # NOTE(review): pickle.load can execute arbitrary code contained in the file;
    # only feed it pickles that come from a trusted source.
    with open(input_pkl_filename, 'rb') as file:
        total = pickle.load(file)
    sampleInfo = loadSampleInfo(filename=input_sample_info_filename)

    # Add the gene_classified attribute (TSS-PAS, TSS-APA, ATSS-PAS, ATSS-APA) to
    # every gene. The result must stay in the module-level name TSS_TES_dict because
    # get_check_df reads it as a global.
    TSS_TES_dict = total.add_gene_classified()

    # Build check_df: the "all" table first, then one table per cell line. The
    # tables are collected in a list and concatenated once instead of growing a
    # DataFrame inside the loop.
    check_frames = [
        get_check_df(total=total, sampleInfo=sampleInfo, cellLine=cellLine)
        .reset_index()
        .rename(columns={"index": "gene_id"})
        for cellLine in ["all", *sampleInfo.keys()]
    ]
    check_df = pandas.concat(check_frames, axis=0, ignore_index=True)

    # Summary table with the gene and transcript counts.
    df = count_TSS_TES(check_df=check_df, ATSS_APA_cutoff=ATSS_APA_cutoff)

    # Save data.
    df.to_csv(output_df_filename, sep='\t', index=None)
    check_df.to_csv(output_check_df_filename, sep='\t', index=None)
    with open(output_pkl_filename, 'wb') as file:
        pickle.dump(total, file)

    print("[{}]All blocks finished.".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))))



[Function]loadSampleInfo start.
	[Time]2023-08-17 10:39:11
	[Paraments]filename: F:/OneDrive/Master/Project/trans/data/0000_sample_info.tsv
[2023-08-17 10:39:11]loadSampleInfo finished.


[Function]count_TSS_TES start.
	[Time]2023-08-17 10:39:12
	[Paraments]check_df: <...>
	[Paraments]ATSS_APA_cutoff: 3
[2023-08-17 10:39:12]count_TSS_TES finished.
[2023-08-17 10:39:21]All blocks finished.


---

调试

调试

---