In [3]:
import argparse
import functools
import os
import re
import time

import pandas

In [4]:
# Version tag of this script; it is printed in the "[Version]" line of the start-up banner.
__version__ = "V3.0(Editor) 2023-07-14"

In [5]:
# Pre-set parameters (debug): fixed values for the same six settings that the
# command-line parser cell defines, for use when running inside the notebook.
# NOTE(review): file_path is an absolute path on one specific machine — adjust
# it before running this notebook anywhere else.
absolute_path = False
file_path = "F:/OneDrive/Master/Project/trans/data/"
input_sample_info_filename = "0000_sample_info.tsv"
input_df_filename = "0001_total_info.tsv"
output_df_filename = "0002_TSS_TES.tsv"
output_check_df_filename = "0002_result_check.tsv"

In [10]:
# Pre-set parameters: read the run configuration from the command line.
# Every option is optional; the defaults point at files in the current directory.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--absolute_path",
    dest="absolute_path",
    required=False,
    action="store_true",
    help="use absolute file path",
)
parser.add_argument(
    "--file_path",
    dest="file_path",
    required=False,
    type=str,
    default="./",
    help="=\"./\",\t the path of data directory, the end of this parament should be '/'",
)
parser.add_argument(
    "--input_sample_info_filename",
    dest="input_sample_info_filename",
    required=False,
    type=str,
    default="0000_sample_info.tsv",
    help="=\"0000_sample_info.tsv\",\t the sample info file",
)
parser.add_argument(
    "--input_df_filename",
    dest="input_df_filename",
    required=False,
    type=str,
    default="0001_total_info.tsv",
    help="\"0001_total_info.tsv\",\t the output tsv file of 0001.py",
)
parser.add_argument(
    "--output_df_filename",
    dest="output_df_filename",
    required=False,
    type=str,
    default="0002_TSS_TES.tsv",
    help="\"0002_TSS_TES.tsv\",\t the output filename of this script",
)
parser.add_argument(
    "--output_check_df_filename",
    dest="output_check_df_filename",
    required=False,
    type=str,
    default="0002_result_check.tsv",
    help="\"0002_result_check.tsv\",\t the file for checking result",
)

args = parser.parse_args()
# Copy the parsed options into the module-level names used by the later cells.
(absolute_path,
 file_path,
 input_sample_info_filename,
 input_df_filename,
 output_df_filename,
 output_check_df_filename) = (args.absolute_path,
                              args.file_path,
                              args.input_sample_info_filename,
                              args.input_df_filename,
                              args.output_df_filename,
                              args.output_check_df_filename)

SyntaxError: invalid syntax (2254374255.py, line 3)

In [6]:
# Complete the paths: unless the file names were given as absolute paths,
# resolve each of them against the data directory.
# os.path.join adds a separator only when file_path does not already end with
# one, so a directory passed without the trailing '/' no longer yields a
# broken path such as "./data0000_sample_info.tsv".
if not absolute_path:
    input_sample_info_filename = os.path.join(file_path, input_sample_info_filename)
    input_df_filename = os.path.join(file_path, input_df_filename)
    output_df_filename = os.path.join(file_path, output_df_filename)
    output_check_df_filename = os.path.join(file_path, output_check_df_filename)

In [None]:
# Print the parameters of this run.
# __file__ only exists when the code runs as a script; in a notebook kernel it
# is undefined, so fall back to a placeholder instead of raising NameError.
script_name = globals().get("__file__", "<interactive>")

print('\n')
print("[Script]{}".format(script_name))
print("[Version]{}".format(__version__))
# time.strftime without a time tuple formats the current local time.
print("[Date]{}".format(time.strftime("%Y-%m-%d %H:%M:%S")))
print("[Paraments]input_sample_info_filename: {}".format(input_sample_info_filename))
print("[Paraments]input_df_filename: {}".format(input_df_filename))
print("[Paraments]output_df_filename: {}".format(output_df_filename))
print("[Paraments]output_check_df_filename: {}".format(output_check_df_filename))
print('\n')

In [29]:
# Function
def log(function):
    """
    Decorator that prints a start / finish trace around every call.

    input:
        function, callable, the function to wrap
    change:
        before the call: prints the function name, the current local time and
        every keyword argument (values are shown only for int, str, bool and
        list; anything else is printed as "<...>");
        after the call: prints a time-stamped "finished" line.
        The keyword argument "tab_level" (default 0) sets the indentation of
        the trace; it is only seen when passed by keyword.
    output:
        wrapper, callable, same call signature and return value as function;
        its __name__ and __doc__ are those of the wrapped function
    """
    name = function.__name__
    time_format = "%Y-%m-%d %H:%M:%S"

    # functools.wraps keeps the wrapped function's name and docstring, which
    # would otherwise be replaced by those of "wrapper".
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        tab = kwargs.get("tab_level", 0)
        indent = '\t' * tab
        detail_indent = '\t' * (tab + 1)

        # Top-level calls are separated from earlier output by blank lines.
        if tab == 0:
            print('\n')

        print("{}[Function]{} start.".format(indent, name))
        print("{}[Time]{}".format(detail_indent, time.strftime(time_format)))
        for key, value in kwargs.items():
            # Exact type match on purpose: large objects such as DataFrames
            # (and any subclass of the listed types) are abbreviated.
            if type(value) in (int, str, bool, list):
                print("{}[Paraments]{}: {}".format(detail_indent, key, value))
            else:
                print("{}[Paraments]{}: <...>".format(detail_indent, key))

        result = function(*args, **kwargs)
        print("{}[{}]{} finished.".format(indent, time.strftime(time_format), name))
        return result
    return wrapper

@log
def get_disease_sample_info(df, disease, sample_disease_list, tab_level=0):
    """
    Count genes and transcripts per start/end usage class for one disease.

    input:
        df, pandas.DataFrame, table of all samples; must contain the columns
            chr, strand, gene_id, gene_name, gene_biotype, transcript_id,
            transcript_name, transcript_start, transcript_end and one column
            per sample named in sample_disease_list
        disease, str, disease label
        sample_disease_list, list, names of the sample columns that belong to
            this disease
        tab_level, int, indentation level read by the log decorator; not used
            in the function body
    change:
        keeps the rows whose value is neither 0 nor missing in at least one of
        the given sample columns, then for every gene_id counts the distinct
        start positions, distinct end positions and number of rows.
        On the '-' strand transcript_end is counted as the start and
        transcript_start as the end.
        Genes are split into four classes:
            TSS_PAS   one start,       one end
            TSS_APA   one start,       several ends
            ATSS_PAS  several starts,  one end
            ATSS_APA  several starts,  several ends
    output:
        dict, {"result": {disease: {"<class>@gene": n, "<class>@transcript": n, ...}},
               "df": per-gene DataFrame indexed by gene_id with the columns
                     strand, gene_biotype, start_counts, end_counts,
                     transcript_counts, disease}
        When no row passes the expression filter all counts are 0 and "df" is
        empty.
    raises:
        KeyError, if the rows of one gene are not all on '+' or all on '-'
    """
    annotation_columns = ["chr", "strand", "gene_id", "gene_name", "gene_biotype",
                          "transcript_id", "transcript_name",
                          "transcript_start", "transcript_end"]
    info_columns = ["gene_id", "strand", "gene_biotype",
                    "start_counts", "end_counts", "transcript_counts"]
    count_columns = ["start_counts", "end_counts", "transcript_counts"]

    # Keep only the transcripts expressed in at least one sample of this disease.
    # A value counts as "expressed" when it is neither 0 nor missing; this is a
    # vectorised replacement for the deprecated DataFrame.applymap call.
    disease_df = df[annotation_columns + sample_disease_list]
    sample_values = disease_df[sample_disease_list]
    is_expressed = (sample_values.ne(0) & sample_values.notna()).any(axis=1)
    disease_df = disease_df.loc[is_expressed.to_numpy()]

    # Build one record per gene.
    records = []
    for gene_id, gene_df in disease_df.groupby("gene_id"):
        strands = set(gene_df["strand"].to_list())
        if strands == {'+'}:
            # Plus strand: coordinates run in transcription direction.
            start_column, end_column = "transcript_start", "transcript_end"
        elif strands == {'-'}:
            # Minus strand: the transcription start is the larger coordinate.
            start_column, end_column = "transcript_end", "transcript_start"
        else:
            raise KeyError("[错误]strand: {}".format(list(strands)))

        records.append([gene_id,
                        next(iter(strands)),
                        # positional access: safe even if index labels repeat
                        gene_df["gene_biotype"].iloc[0],
                        len(set(gene_df[start_column].to_list())),
                        len(set(gene_df[end_column].to_list())),
                        len(gene_df)])

    # Naming the columns up front keeps the frame well-formed when there are
    # no records at all (set_index would otherwise fail on a column-less frame).
    disease_sample_info = pandas.DataFrame(records, columns=info_columns)
    disease_sample_info = disease_sample_info.astype({column: "int64" for column in count_columns})
    disease_sample_info = disease_sample_info.set_index("gene_id")
    disease_sample_info["disease"] = disease

    # Classify every gene by whether it uses one or several starts / ends.
    single_start = disease_sample_info["start_counts"] == 1
    multi_start = disease_sample_info["start_counts"] > 1
    single_end = disease_sample_info["end_counts"] == 1
    multi_end = disease_sample_info["end_counts"] > 1
    class_masks = {"TSS_PAS": single_start & single_end,
                   "TSS_APA": single_start & multi_end,
                   "ATSS_PAS": multi_start & single_end,
                   "ATSS_APA": multi_start & multi_end}

    counts = {}
    for class_name, mask in class_masks.items():
        counts["{}@gene".format(class_name)] = int(mask.sum())
        counts["{}@transcript".format(class_name)] = disease_sample_info.loc[mask, "transcript_counts"].sum()

    return {"result": {disease: counts}, "df": disease_sample_info}


@log
def get_biotype_sample_info(sample_df, disease_dict, biotype, sample_info, tab_level=0):
    """
    Summarise start/end usage classes for one gene biotype, overall and per disease.

    input:
        sample_df, pandas.DataFrame, the .tsv output of 0001.py; must contain
            the columns gene_biotype and transcript_biotype in addition to
            those required by get_disease_sample_info
        disease_dict, dict, {<disease1>: [<sample1>], <disease2>: [<sample2>, <sample3>], ...}
        biotype, str, one of ["un_classified", "protein_coding", "non_protein_coding"]
        sample_info, pandas.DataFrame, sample sheet; its GEO_accession column
            lists the samples used for the "all" summary
        tab_level, int, the parameter for log format, there is no need to change it
    change:
        keeps the rows of sample_df whose gene_biotype equals biotype, then
        runs get_disease_sample_info once for all samples (label "all") and
        once per disease in disease_dict
    output:
        {"result_df": df, "check_df": df_check}
        result_df has the rows all_gene, all_transcript, <disease>_gene,
        <disease>_transcript and the columns gene_biotype, TSS_PAS, TSS_APA,
        ATSS_PAS, ATSS_APA; the gene_biotype column is left empty here.
        check_df is the row-wise concatenation of the per-gene tables returned
        by get_disease_sample_info.
    """
    biotype_df = sample_df.loc[sample_df["gene_biotype"] == biotype, :]
    biotype_df = biotype_df.drop(labels=["transcript_biotype"], axis=1)

    # Prepare the (still empty) summary table.
    df_index = ["all_gene", "all_transcript"] + [
        "{}_{}".format(disease, level)
        for disease in disease_dict
        for level in ("gene", "transcript")
    ]
    df = pandas.DataFrame(index=df_index,
                          columns=["gene_biotype", "TSS_PAS", "TSS_APA", "ATSS_PAS", "ATSS_APA"])

    # Collect the statistics: first over all samples, then disease by disease.
    sample_groups = [("all", sample_info["GEO_accession"].to_list())]
    sample_groups.extend(disease_dict.items())

    df_info = {}
    check_frames = []
    for disease, disease_sample_list in sample_groups:
        disease_info = get_disease_sample_info(df=biotype_df,
                                               disease=disease,
                                               sample_disease_list=disease_sample_list,
                                               tab_level=tab_level+1)
        df_info.update(disease_info["result"])
        # The returned table already carries its "disease" column.
        check_frames.append(disease_info["df"])
    # Concatenate once instead of growing a frame inside the loop.
    df_check = pandas.concat(check_frames, axis=0)

    # Copy the counts into the summary table; keys look like "<class>@<gene|transcript>".
    for disease, disease_counts in df_info.items():
        for key, counts in disease_counts.items():
            gene_type, g_or_t = key.split("@")
            df.at["{}_{}".format(disease, g_or_t), gene_type] = counts

    return {"result_df": df, "check_df": df_check}


In [30]:
# Read the data: the sample sheet (spaces in disease names become underscores)
# and the per-transcript table, whose first column is used as the index.
sample_info = pandas.read_csv(input_sample_info_filename, sep='\t').assign(
    disease=lambda frame: frame["disease"].map(lambda label: label.replace(' ', '_'))
)
sample_df = pandas.read_csv(input_df_filename, sep='\t', index_col=0)

In [31]:
# Map every disease to the list of its samples:
# {<disease_1>: [<sample_1>], <disease_2>: [<sample_2>, <sample_3>, ...], ...}
sample_info_group = sample_info.groupby("disease")
disease_dict = {
    disease: sample_info_group.get_group(disease)["GEO_accession"].to_list()
    for disease in sample_info_group.groups
}

In [32]:
# Collapse gene_biotype into three classes: un_classified, protein_coding and
# non_protein_coding (everything that is not one of the first two).
sample_df["gene_biotype"] = sample_df["gene_biotype"].where(
    sample_df["gene_biotype"].isin(["protein_coding", "un_classified"]),
    "non_protein_coding",
)

In [33]:
# Run the summary once per gene biotype and stack the resulting tables.
# The per-biotype frames are gathered in lists and concatenated once at the
# end, rather than repeatedly concatenating onto an initially empty DataFrame.
check_frames = []
result_frames = []
for gene_biotype in ["protein_coding", "non_protein_coding", "un_classified"]:
    biotype_info = get_biotype_sample_info(sample_df=sample_df,
                                           disease_dict=disease_dict,
                                           biotype=gene_biotype,
                                           sample_info=sample_info,
                                           tab_level=0)
    # Tag every row with the biotype it was computed for.
    check_frames.append(biotype_info["check_df"].assign(gene_biotype=gene_biotype))
    result_frames.append(biotype_info["result_df"].assign(gene_biotype=gene_biotype))

check_df = pandas.concat(check_frames, axis=0)
result_df = pandas.concat(result_frames, axis=0)



[Function]get_biotype_sample_info start.
	[Time]2023-07-14 10:27:19
	[Paraments]sample_df: <...>
	[Paraments]disease_dict: <...>
	[Paraments]biotype: protein_coding
	[Paraments]sample_info: <...>
	[Function]get_disease_sample_info start.
		[Time]2023-07-14 10:27:19
		[Paraments]df: <...>
		[Paraments]disease: all
		[Paraments]sample_disease_list: ['GSM6783527', 'GSM6782551']
		[Paraments]tab_level: 1
	[2023-07-14 10:27:21]get_disease_sample_info finished.
	[Function]get_disease_sample_info start.
		[Time]2023-07-14 10:27:21
		[Paraments]df: <...>
		[Paraments]disease: Ewing_sarcoma
		[Paraments]sample_disease_list: ['GSM6783527']
		[Paraments]tab_level: 1
	[2023-07-14 10:27:23]get_disease_sample_info finished.
	[Function]get_disease_sample_info start.
		[Time]2023-07-14 10:27:23
		[Paraments]df: <...>
		[Paraments]disease: colorectal_adenocarcinoma
		[Paraments]sample_disease_list: ['GSM6782551']
		[Paraments]tab_level: 1
	[2023-07-14 10:27:25]get_disease_sample_info finished.
[2023-

In [34]:
# Save the data: summary table and per-gene check table, both tab-separated.
result_df.to_csv(path_or_buf=output_df_filename, sep="\t")
check_df.to_csv(path_or_buf=output_check_df_filename, sep="\t")

In [35]:
# Display the path the check table was written to (shown as the cell output).
output_check_df_filename

'F:/OneDrive/Master/Project/trans/data/0002_result_check.tsv'

In [None]:
# Output end: time-stamped completion message.
# time.strftime without a time tuple formats the current local time.
print("[{}]All blocks finished.".format(time.strftime("%Y-%m-%d %H:%M:%S")))