In [1]:
import argparse
import gzip
import pickle
import time

import pandas

# The wildcard import stays ahead of the imports below so that nothing it
# re-exports can shadow the explicitly imported modules.
from Class import *
import function
import numpy
import tqdm

In [2]:
# Version tag of this notebook; none of the visible cells reads it.
__version__ = "V0.0(Editor) 2023-10-18"

In [3]:
# Run parameters (debug defaults)
COUNTSCUTOFF = 2  # not referenced in the visible cells -- TODO confirm where it is consumed
ABSOLUTEPATH = False  # False: the names below are relative and get FILEPATH prepended by the path-completion cell
FILEPATH = "F:/OneDrive/Master/Project/trans/data/"  # NOTE(review): machine-specific absolute path
RAWDATAPATH = "raw_data/"
INPUTTOTAL = "10001_total.pkl"  # pickle file that is loaded into `total`
INPUTREFANNOTATION = "Homo_sapiens.GRCh38.109.chr.gtf.gz"  # gzip-compressed reference annotation
OUTPUTTOTAL = "10002_total.pkl"  # output file name; not written in the visible cells

In [4]:
# Complete the paths: relative names are resolved against FILEPATH.
# RAWDATAPATH must be completed before INPUTREFANNOTATION, which is built from it.
if not ABSOLUTEPATH:
    INPUTTOTAL = f"{FILEPATH}{INPUTTOTAL}"
    OUTPUTTOTAL = f"{FILEPATH}{OUTPUTTOTAL}"
    RAWDATAPATH = f"{FILEPATH}{RAWDATAPATH}"
    INPUTREFANNOTATION = f"{RAWDATAPATH}annotation/{INPUTREFANNOTATION}"

In [5]:
# Shared constants
EXONRANGE = 5  # half-width (in positions) of the window used when matching and merging exon ends
RANGETSS = 50  # not referenced in the visible cells -- TODO confirm usage
RANGETES = 150  # not referenced in the visible cells -- TODO confirm usage

In [58]:
# Top-level helper function
def findId(index, chr, location, rangeMax, orientation, allowType, total=None, transcriptGeneMap=None):
    '''
    Collect the ids indexed close to location, on one side of it.

    input:
        index, dict, {<chr>: {<location>: {<location>: <id>, ...}, ...}, ...}
               (for allowType "transcript" the innermost values are lists of ids)
        chr, str
        location, int
        rangeMax, int
        orientation, str, '-' walks towards smaller locations; any other value walks towards larger ones
        allowType, str, "gene" or "transcript" or "exon"
        total, Total Object or None; when None every expression is reported as None
        transcriptGeneMap, dict, {<transcriptId>: <geneId>, ...}, required for allowType "transcript"
    change:
        Walks away from location one position at a time (location itself is never inspected).
        Every indexed position that is hit restarts the rangeMax window from that position, so a
        chain of positions whose neighbouring distances are all <= rangeMax is collected as a whole.
        The walk is iterative: each position is inspected once, which yields the same ids as the
        former recursive search without its repeated re-exploration and recursion-depth limit.
    return:
        list, [(<id>, <mean expression over all cell lines of total> or None), ...],
        without duplicates and in no particular order
    raise:
        KeyError, chr is not a key of index (only when rangeMax > 0)
        ValueError, a transcript position was hit but transcriptGeneMap is None
    '''
    if rangeMax <= 0:
        # Nothing lies within an empty window.
        return []

    step = -1 if orientation == '-' else 1
    chrIndex = index[chr]

    def meanExpression(entity):
        # Mean over every cell line known to total; a cell line without a value counts as 0.
        return numpy.mean([entity.cellLineExpression.get(cellLine, 0) for cellLine in total.celllineInfo.keys()])

    found = set()  # {(<id>, <expression>), ...}
    position = location
    remaining = rangeMax  # positions that may still be inspected before the walk gives up
    while remaining > 0:
        position += step
        remaining -= 1
        hits = chrIndex.get(position, None)
        if hits is None:
            # Nothing indexed here, look at the next position.
            continue
        # An indexed position was hit: restart the window from here.
        remaining = rangeMax
        if allowType == "exon":
            # The values of hits are single ids (str), not lists.
            for hitId in hits.values():
                if total is not None:
                    expression = meanExpression(total.exonDict[hitId])
                else:
                    expression = None
                found.add((hitId, expression))
        elif allowType == "gene":
            # The values of hits are single ids (str), not lists.
            for hitId in hits.values():
                if total is not None:
                    expression = meanExpression(total.geneDict[hitId])
                else:
                    expression = None
                found.add((hitId, expression))
        else:
            # The values of hits are lists of ids, not single ids.
            if transcriptGeneMap is None:
                raise ValueError("findId(): need transcriptGeneMap")
            for idList in hits.values():
                for hitId in idList:
                    expression = None
                    if total is not None:
                        geneId = transcriptGeneMap.get(hitId, None)
                        if geneId is not None:
                            expression = meanExpression(total.geneDict[geneId].transcriptDict[hitId])
                    found.add((hitId, expression))

    return list(found)
    

In [103]:
# Top-level helper function
def loadRefAnnotation(filename, allowType):
    '''
    Read the records of one feature type from a reference annotation file and index them by position.

    input:
        filename, str, path of the gzip-compressed reference annotation file
        allowType, str, "transcript" or "gene" or "exon"
    change:
        Every line is parsed with function.pickRefAnnotation(); lines for which it returns None are skipped.
        The record id is the first element returned by function.strSplit() for the
        transcript_id / gene_id / exon_id of the record.
        Chromosome names get the prefix "chr"; "MT" becomes "chrM".
        On the '+' strand TSS, TES = start, end; otherwise TSS, TES = end, start.
        A transcript or gene id that occurs again prints a warning and replaces the earlier record.
        An exon id that occurs again only adds its (geneId, transcriptId) pair to "geneTranscriptIdSet".
        Warning: a missing gene_name / transcript_name is stored as the string "None".
    return:
        dict, {"positionRef": {"TSS": {<chr>: {<TSS>: {<TES>: <ids>, ...}, ...}, ...},
                               "TES": {<chr>: {<TES>: {<TSS>: <ids>, ...}, ...}, ...}},
               "dataRef": {<id>: {<key>: <value>, ...}, ...}}
        <ids> is a list [<transcriptId>, ...] for "transcript". For "gene" and "exon" it is a
        single id: the first record found at that TSS/TES pair, later ones are not indexed.
    raise:
        ValueError, allowType is not one of the three supported values (checked before the file is opened)
    '''
    if allowType not in ("transcript", "gene", "exon"):
        raise ValueError("the allowType should be 'transcript', 'gene' or 'exon', but it is {}".format(allowType))

    # function.pickRefAnnotation() receives the type wrapped in a list, as it always did.
    typeFilter = [allowType]

    positionRef = {"TSS": {}, "TES": {}}
    dataRef = {}

    # Read the records
    with gzip.open(filename, 'rt') as file:
        for rawLine in file:
            record = function.pickRefAnnotation(info=rawLine, allowType=typeFilter)
            if record is None:
                continue

            # Type-specific fields
            if allowType == "transcript":
                recordId = function.strSplit(record["transcript_id"])[0]  # drop the version suffix
                entry = {"geneId": record["gene_id"],
                         "geneVersion": record["gene_version"],
                         "geneName": record.get("gene_name", "None"),
                         "geneBiotype": record["gene_biotype"],
                         "transcriptId": recordId,
                         "transcriptVersion": record["transcript_version"],
                         "transcriptName": record.get("transcript_name", "None"),
                         "transcriptBiotype": record["transcript_biotype"]}
            elif allowType == "gene":
                recordId = function.strSplit(record["gene_id"])[0]
                entry = {"geneId": recordId,
                         "geneName": record.get("gene_name", "None"),
                         "geneVersion": record["gene_version"],
                         "geneBiotype": record["gene_biotype"]}
            else:
                # Only the id part is needed; the version comes from exon_version.
                recordId = function.strSplit(record["exon_id"])[0]
                entry = {"geneId": record["gene_id"],
                         "transcriptId": record["transcript_id"],
                         "exonId": recordId,
                         "exonVersion": record["exon_version"]}

            # Fields shared by all three types
            strand = record["strand"]
            chrName = "chrM" if record["chr"] == "MT" else "chr{}".format(record["chr"])
            start = int(record["start"])
            end = int(record["end"])
            if strand == '+':
                TSS, TES = start, end
            else:
                TSS, TES = end, start
            entry.update({"strand": strand,
                          "chr": chrName,
                          "start": start,
                          "end": end,
                          "TSS": TSS,
                          "TES": TES})

            # Store the record in dataRef
            if allowType == "exon":
                pair = (entry["geneId"], entry["transcriptId"])
                if recordId in dataRef:
                    dataRef[recordId]["geneTranscriptIdSet"].add(pair)
                else:
                    entry["geneTranscriptIdSet"] = {pair}
                    dataRef[recordId] = entry
            else:
                if recordId in dataRef:
                    print("[Warning]loadRefAnnotation(): {}Id {} have been existed".format(allowType, recordId))
                dataRef[recordId] = entry

    # Build the position index in both directions (TSS -> TES and TES -> TSS)
    for recordId, entry in dataRef.items():
        chrName = entry["chr"]
        TSS = entry["TSS"]
        TES = entry["TES"]
        for siteType, outerKey, innerKey in (("TSS", TSS, TES), ("TES", TES, TSS)):
            slot = positionRef[siteType].setdefault(chrName, {}).setdefault(outerKey, {})
            if allowType == "transcript":
                # Transcripts sharing both ends are all kept.
                slot.setdefault(innerKey, []).append(recordId)
            else:
                # Genes / exons: the first id at a TSS/TES pair wins, later ones are ignored.
                slot.setdefault(innerKey, recordId)

    return {"positionRef": positionRef, "dataRef": dataRef}
        

In [None]:
# Load the pickled input object into `total`
# NOTE(review): pickle.load can execute arbitrary code contained in the file; only load trusted input.
with open(INPUTTOTAL, "rb") as file:
    total = pickle.load(file)

In [None]:
# Update exonDict and exonIndex inside total
# (refresh() and reIndex() are defined outside this file; the description follows the original author's note)
total.refresh()
total.reIndex()

In [None]:
# Declare the bookkeeping variables
# exonCorrected: one dict per correction category, {<category>: {<existing exon id>: <corrected exon id>}, ...};
# the "TSS-TES" category stores {"TSS": <exon id or None>, "TES": <exon id or None>} as the value instead.
exonCorrected = {"mapped": {}, "TSS-TES": {}, "TSS": {}, "TES": {}, "integrate TSS": {}, "integrate TES": {}}  # {"": {<existed exon id>: <corrected exon id>}, ...}
readyCorList = []  # exon ids queued for correction

In [None]:
# Correct exon TSS/TES against the exons of the reference annotation: preparation
# Read the exon records of the reference annotation and build the position index
refAnnotation = loadRefAnnotation(filename=INPUTREFANNOTATION, allowType="exon")
exonRef = refAnnotation["dataRef"]
exonIndex = refAnnotation["positionRef"]

# Snapshot the ids of every exon currently in total.exonDict.
# The list is rebuilt instead of appended to, so re-running this cell cannot queue an exon twice.
readyCorList = list(total.exonDict.keys())

# Count exons and transcripts before the correction
infoDict = {}
infoDict["total exon num before exon correct"] = len(total.exonDict)
infoDict["novel exon num before exon correct"] = sum(
    1 for exonObject in total.exonDict.values() if exonObject.status == "NOVEL")
infoDict["total transcript num before exon correct"] = sum(
    1
    for geneObject in total.geneDict.values()
    for _ in geneObject.transcriptDict.keys())
infoDict["novel transcript num before exon correct"] = sum(
    1
    for geneObject in total.geneDict.values()
    for transcriptObject in geneObject.transcriptDict.values()
    if transcriptObject.status == "NOVEL")

# Correct the TSS and TES of every queued exon against the exon records of the reference annotation.
# Per exon: collect reference exons whose ends lie within EXONRANGE, then
#   - map onto the closest reference exon when both ends match one reference exon,
#   - otherwise (non-KNOWN exons only) snap the TSS and/or TES to the closest single-end match.
for exonId in tqdm.tqdm(readyCorList, desc="Exon", leave=True):
    exonObject = total.exonDict[exonId]
    chr = exonObject.chr
    TSS = exonObject.TSS
    TES = exonObject.TES

    # Candidate list for this exon
    exonList = []  # reference exons this exon can be mapped to: [(<exonId>, <dif>, <startMarker>, <endMarker>), ...]

    # Prepare the correction of the exon's TSS and TES
    # Fetch every reference exon whose TSS lies in [TSS-EXONRANGE, TSS+EXONRANGE]
    # NOTE(review): exonIndex["TSS"][chr] raises KeyError when chr is absent from the reference index -- confirm this cannot happen
    temp = [exonIndex["TSS"][chr].get(i, None) for i in range(TSS-EXONRANGE, TSS+EXONRANGE+1)]
    # Drop the empty entries
    while None in temp:
        temp.remove(None)
    while {} in temp:
        temp.remove({})
        pass
    for d in temp:
        for tempTES, tempExonId in d.items():
            if abs(tempTES-TES) <= EXONRANGE:
                # The TES also lies within EXONRANGE: this reference exon matches on both ends
                num = abs(TSS-exonRef[tempExonId]["TSS"]) + abs(TES-tempTES)
                exonList.append((tempExonId, num, True, True))
            else:
                # Only the TSS can be mapped to tempExon
                num = abs(TSS-exonRef[tempExonId]["TSS"])
                exonList.append((tempExonId, num, True, False))

    # Fetch every reference exon whose TES lies in [TES-EXONRANGE, TES+EXONRANGE]
    temp = [exonIndex["TES"][chr].get(i, None) for i in range(TES-EXONRANGE, TES+EXONRANGE+1)]
    # Drop the empty entries
    while None in temp:
        temp.remove(None)
    while {} in temp:
        temp.remove({})
    for d in temp:
        for tempTSS, tempExonId in d.items():
            # Keep only exons whose TSS is more than EXONRANGE away; they match on the TES only
            # (exons matching on both ends were already collected above)
            if abs(tempTSS-TSS) > EXONRANGE:
                # Only the TES can be mapped to tempExon
                num = abs(TES-exonRef[tempExonId]["TES"])
                exonList.append((tempExonId, num, False, True))

    # Exons without any candidate are not analysed
    if len(exonList) == 0:
        continue

    # Correction
    mode = False  # whether a best (both-end) mapping exists
    for (newExonId, num, startMarker, endMarker) in exonList:
        if startMarker == endMarker == True:
            mode = True
            break
    # Start mapping
    if mode is True:
        # The exon matches a single reference exon on both ends
        # Pick the reference exon with the smallest total base difference
        temp = []
        for (newExonId, num, TSSMarker, TESMarker) in exonList:
            if TSSMarker == TESMarker == True:
                temp.append((newExonId, num))
        temp = sorted(temp, key=lambda x: x[1])
        (newExonId, num) = temp[0]
        # The bare string below is disabled code (an early exit for num == 0) kept by the author; it has no effect.
        '''if num == 0:
            # 表明不必进行start, end的修改
            continue'''
        # Check whether the exon already carries the reference id
        if exonId == newExonId:
            # Same id
            # Overwrite status, exonVersion, start, TSS, end, TES with the reference values
            total.exonDict[exonId].status = "KNOWN"
            total.exonDict[exonId].exonVersion = exonRef[newExonId]["exonVersion"]
            total.exonDict[exonId].start = exonRef[newExonId]["start"]
            total.exonDict[exonId].end = exonRef[newExonId]["end"]
            total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
            total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
        else:
            # Different id
            # Check whether the reference exon id is already recorded
            if newExonId in total.exonDict.keys():
                # Already recorded: merge the two exons
                # (_exonMerge is defined outside this file; presumably exonId2 is merged into exonId1 -- TODO confirm)
                total._exonMerge(exonId1=newExonId, exonId2=exonId)
            else:
                # Not recorded yet
                # Add the reference exon to exonDict
                total.exonDict[newExonId] = Exon(status="KNOWN",
                                                 exonId=exonRef[newExonId]["exonId"],
                                                 exonVersion=exonRef[newExonId]["exonVersion"],
                                                 chr=exonRef[newExonId]["chr"],
                                                 strand=exonRef[newExonId]["strand"],
                                                 start=exonRef[newExonId]["start"],
                                                 end=exonRef[newExonId]["end"],
                                                 TSS=exonRef[newExonId]["TSS"],
                                                 TES=exonRef[newExonId]["TES"])
                # Merge the original exon with the reference exon
                total._exonMerge(exonId1=newExonId, exonId2=exonId)
            # Record the mapping {<exonId>: <newExonId>} in exonCorrected["mapped"]
            exonCorrected["mapped"][exonId] = newExonId
    else:
        # The exon does not match any single reference exon on both ends
        # An exon whose status is KNOWN is not mapped any further
        if total.exonDict[exonId].status == "KNOWN":
            continue
        # Set the status of the exon to CORRECTED
        total.exonDict[exonId].status = "CORRECTED"
        # Check whether the two ends can be mapped to two different reference exons
        marker=False  # which ends have a candidate: 1 - start only, 2 - end only, 3 - both start and end
        for (newExonId, num, startMarker, endMarker) in exonList:
            if startMarker is True:
                marker = 3 if marker==2 else 1
            else:
                marker = 3 if marker==1 else 2
            if marker == 3:
                break
        if marker == 3:
            # Both ends have a candidate
            # Pick the candidate with the smallest base difference on the TSS side
            temp = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if startMarker is True]
            temp = sorted(temp, key=lambda x: x[1])
            (newExonId, num, startMarker, endMarker) = temp[0]
            # Append the suffix -TSS-TES to the status
            total.exonDict[exonId].status = total.exonDict[exonId].status + "-TSS-TES"
            # NOTE(review): the mapping is recorded only when num != 0, unlike the single-end branches below
            # which record it regardless -- confirm this is intended
            if num != 0:
                # The TSS has to be changed
                # Move the TSS (start on '+', end otherwise)
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                else:
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                # Record the mapping in exonCorrected["TSS-TES"]
                exonCorrected["TSS-TES"][exonId] = {"TSS": newExonId, "TES": None}
            # Pick the candidate with the smallest base difference on the TES side
            temp = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if endMarker is True]
            temp = sorted(temp, key=lambda x: x[1])
            (newExonId, num, startMarker, endMarker) = temp[0]
            if num != 0:
                # The TES has to be changed
                # Move the TES (end on '+', start otherwise)
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                else:
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                # Record the mapping in exonCorrected["TSS-TES"]
                if exonId not in exonCorrected["TSS-TES"].keys():
                    exonCorrected["TSS-TES"][exonId] = {"TSS": None, "TES": newExonId}
                else:
                    exonCorrected["TSS-TES"][exonId]["TES"] = newExonId
        else:
            # Only one end can be mapped
            # Keep the candidates with the smallest base difference and use the first of them
            exonList = sorted(exonList, key=lambda x: x[1])
            tempNum = exonList[0][1]
            exonList = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if num == tempNum]
            (newExonId, num, startMarker, endMarker) = exonList[0]
            # Determine which end can be mapped
            if startMarker is True:
                # The TSS side can be mapped
                # Append the suffix -TSS to the status
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-TSS"
                # Record the mapping {<exonId>: <newExonId>} in exonCorrected["TSS"]
                exonCorrected["TSS"][exonId] = newExonId
                if num == 0:
                    # The TSS already agrees, nothing to change
                    continue
                # Move the TSS (start on '+', end otherwise)
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                else:
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
            else:
                # The TES side can be mapped
                # Append the suffix -TES to the status
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-TES"
                # Record the mapping {<exonId>: <newExonId>} in exonCorrected["TES"]
                exonCorrected["TES"][exonId] = newExonId
                if num == 0:
                    # The TES already agrees, nothing to change
                    continue
                # Move the TES (end on '+', start otherwise)
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                else:
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]

In [None]:
# Update exonDict and exonIndex inside total after the correction
# (refresh() and reIndex() are defined outside this file; the description follows the original author's note)
total.refresh()
total.reIndex()

In [None]:
# Count exons and transcripts after the correction
infoDict.update({
    "exon mapped": len(exonCorrected["mapped"]),
    "exon TSS-TES": len(exonCorrected["TSS-TES"]),
    "exon TSS": len(exonCorrected["TSS"]),
    "exon TES": len(exonCorrected["TES"]),
    "total exon num after exon correct": len(total.exonDict),
})
# Exons still flagged as NOVEL
num = sum(1 for exonObject in total.exonDict.values() if exonObject.status == "NOVEL")
infoDict["novel exon num after exon correct"] = num
# All transcripts over all genes
num = sum(1 for geneObject in total.geneDict.values() for _ in geneObject.transcriptDict.keys())
infoDict["total transcript num after exon correct"] = num
# Transcripts still flagged as NOVEL
num = sum(
    1
    for geneObject in total.geneDict.values()
    for transcriptObject in geneObject.transcriptDict.values()
    if transcriptObject.status == "NOVEL"
)
infoDict["novel transcript num after exon correct"] = num

In [None]:
# Compute the relative expression of exons, transcripts and genes
# (both methods are defined outside this file; the description follows the original author's note)
total.computeRelativeExpression()
total.computeCellLineExpression()

In [None]:
# Initialise the counters for the integration (merge) records
infoDict["integrate TSS"] = 0
infoDict["integrate TES"] = 0

In [None]:
# 合并相邻的exon, 即合并相差EXONRANGE范围内的NOVEL及未校正端的exon
if True:
    # 建立index
    indexTSS = total.exonIndexBuild(siteType="TSS")  # {<chr>: {<TSS>: {<TES>: <exonId>, ...}, ...}, ...}
    indexTES = total.exonIndexBuild(siteType="TES")  # {<chr>: {<TES>: {<TSS>: <exonId>, ...}, ...}, ...}

    # 处理TSS
    # 声明set变量exonSet
    exonSet = set()  # {<exonId>, ...}
    # 筛选符合校正条件的exon
    for exonId, exonObject in total.exonDict.items():
        if exonObject.status == "KNOWN":
            continue
        elif "-TSS" in exonObject.status:
            continue
        else:
            exonSet.add(exonId)
    # 只要exonSet不为空集, 就随机提取exonSet中的一个exonId, 寻找其TSS上下游各EXONRANGE范围内的exon
    while len(exonSet) != 0:
        exonId = exonSet.pop()
        exonObject = total.exonDict[exonId]
        chr = exonObject.chr
        TSS = exonObject.TSS
        # 声明list变量temp, [(<exonId>, <expression>), ...]
        temp = []  # [(<exonId>, <expression>), ...]
        # 寻找所有与该exon的start相同的exon
        for exonId in indexTSS[chr][TSS].values():
            temp = temp + [(exonId, numpy.mean([total.exonDict[exonId].cellLineExpression.get(cellLine, 0) for cellLine in total.celllineInfo.keys()]))]
        # 递减寻找EXONRANGE范围内的exon
        temp = temp + findId(total=total, index=indexTSS, chr=chr, location=TSS, rangeMax=EXONRANGE, orientation='-', allowType="exon")
        # 递增寻找EXONRANGE范围内的exon
        temp = temp + findId(total=total, index=indexTSS, chr=chr, location=TSS, rangeMax=EXONRANGE, orientation='+', allowType="exon")
        # 去重
        temp = list(set(temp))
        # 判断是否寻找到了其他的exon
        if len(temp)==1:
            # 未寻找到其他的exon, 认为该exon的start无法修改, 跳过该exon
            exonSet.discard(temp[0][0])
            continue
        else:
            # 已寻找到其他的exon
            pass
        # 判断在temp中是否存在已校正端
        corSet = set()  # {(exonId, TSS), ...}
        nonCorSet = set()  # {(exonId, TSS), ...}
        # 分离其中的未校正端与已校正端
        for (exonId, expression) in temp:
            status = total.exonDict[exonId].status
            TSS = total.exonDict[exonId].TSS
            if status=="KNOWN" or "-TSS" in status:
                # TSS端可信
                corSet.add((exonId, TSS))
            else:
                # TSS端不可信
                nonCorSet.add((exonId, TSS))
        if len(corSet) != 0:
            # 存在已校正端
            # 对每一个未校正端选择其距离最近的已校正端作为参考位点进行校正
            for (exonId, TSS) in nonCorSet:
                # 寻找最近的已校正端
                diffTSS = [(exonIdTemp, tempTSS, abs(TSS-tempTSS)) for (exonIdTemp, tempTSS) in corSet]
                diffTSS = sorted(diffTSS, key=lambda x: x[2])
                targetExonId = diffTSS[0][0]
                targetTSS = diffTSS[0][1]
                # 修改TSS, start/end位置
                total.exonDict[exonId].TSS = targetTSS
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].start = targetTSS
                else:
                    total.exonDict[exonId].end = targetTSS
                # 改变status
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-TSS"
                # 删除exonSet中该exonId
                exonSet.discard(exonId)
                # 记录这一次校正
                exonCorrected["integrate TSS"][exonId] = targetExonId
            # 结束此次循环
            continue
        else:
            # 不存在已校正端, 进行下一步
            pass
        # 以expression最大的exon为参照, 修改其他exon的TSS
        exonIdTarget = None
        expressionMax = 0
        # 寻找expression最大的exon
        for (exonId, expression) in temp:
            if expression >= expressionMax:
                expressionMax = expression
                exonIdTarget = exonId
            else:
                continue
        # 修改其他的exon
        for (exonId, expression) in temp:
            # 修改其他exon的TSS, start/end
            total.exonDict[exonId].TSS = total.exonDict[exonIdTarget].TSS
            if total.exonDict[exonId].strand == '+':
                total.exonDict[exonId].start = total.exonDict[exonIdTarget].start
            else:
                total.exonDict[exonId].end = total.exonDict[exonIdTarget].end
            # 修改status
            # 不应该修改status, 因为无已校正TSS作为参考, 所以TSS即使进行了合并也并不是可靠的
            #total.exonDict[exonId].status = total.exonDict[exonId].status + "-TSS"
            # 删除exonSet中的这些exon
            exonSet.discard(exonId)
            # 记录这一次校正
            exonCorrected["integrate TSS"][exonId] = exonIdTarget

    # 处理TES
    # 声明set变量exonSet
    exonSet = set()  # {<exonId>, ...}
    # 筛选符合校正条件的exon
    for exonId, exonObject in total.exonDict.items():
        if exonObject.status == "KNOWN":
            continue
        elif "-TES" in exonObject.status:
            continue
        else:
            exonSet.add(exonId)
    # 只要exonSet不为空集, 就随机提取exonSet中的一个exonId, 寻找其TES上下游各EXONRANGE范围内的exon
    while len(exonSet) != 0:
        exonId = exonSet.pop()
        exonObject = total.exonDict[exonId]
        chr = exonObject.chr
        TES = exonObject.TES
        # 声明list变量temp, [(<exonId>, <expression>), ...]
        temp = []  # [(<exonId>, <expression>), ...]
        # 寻找所有与该exon的TES相同的exon
        for exonId in indexTES[chr][TES].values():
            temp = temp + [(exonId, numpy.mean([total.exonDict[exonId].cellLineExpression.get(cellLine, 0) for cellLine in total.celllineInfo.keys()]))]
        # 递减寻找EXONRANGE范围内的exon
        temp = temp + findId(total=total, index=indexTES, chr=chr, location=TES, rangeMax=EXONRANGE, orientation='-', allowType="exon")
        # 递增寻找EXONRANGE范围内的exon
        temp = temp + findId(total=total, index=indexTES, chr=chr, location=TES, rangeMax=EXONRANGE, orientation='+', allowType="exon")
        # 去重
        temp = list(set(temp))
        # 判断是否寻找到了其他的exon
        if len(temp)==1:
            # 未寻找到其他的exon, 认为该exon的end无法修改, 跳过该exon
            exonSet.discard(temp[0][0])
            continue
        else:
            # 已寻找到其他的exon, 继续
            pass
        # 判断在temp中是否存在已校正端
        corSet = set()  # {(exonId, TES), ...}
        nonCorSet = set()  # {(exonId, TES), ...}
        # 分离其中的未校正端与已校正端
        for (exonId, expression) in temp:
            status = total.exonDict[exonId].status
            TES = total.exonDict[exonId].TES
            if status=="KNOWN" or "-TES" in status:
                # TES端可信
                corSet.add((exonId, TES))
            else:
                # TES端不可信
                nonCorSet.add((exonId, TES))
        if len(corSet) != 0:
            # 存在已校正端
            # 对每一个未校正端选择其距离最近的已校正端作为参考位点进行校正
            for (exonId, TES) in nonCorSet:
                # 寻找最近的已校正端
                diffTES = [(exonIdTemp, tempTES, abs(TES-tempTES)) for (exonIdTemp, tempTES) in corSet]
                diffTES = sorted(diffTES, key=lambda x: x[2])
                targetExonId = diffTES[0][0]
                targetTES = diffTES[0][1]
                # 修改TES, start/end位置
                total.exonDict[exonId].TES = targetTES
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].end = targetTES
                else:
                    total.exonDict[exonId].start =targetTES
                # 改变status
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-TES"
                # 删除exonSet中该exonId
                exonSet.discard(exonId)
                # 记录这一次校正
                exonCorrected["integrate TES"][exonId] = targetExonId
            # 结束此次循环
            continue
        else:
            # 不存在已校正端, 进行下一步
            pass
        # 以expression最大的exon为参照, 修改其他exon的TES
        exonIdTarget = None
        expressionMax = 0
        # 寻找expression最大的exon
        for (exonId, expression) in temp:
            if expression >= expressionMax:
                expressionMax = expression
                exonIdTarget = exonId
        # 修改其他的exon
        for (exonId, expression) in temp:
            # 修改其他exon的TES, start/end
            total.exonDict[exonId].TES = total.exonDict[exonIdTarget].TES
            if total.exonDict[exonId].strand == '+':
                total.exonDict[exonId].end = total.exonDict[exonIdTarget].end
            else:
                total.exonDict[exonId].start = total.exonDict[exonIdTarget].start
            # 修改status
            # 不应该修改status, 因为无已校正TSS作为参考, 所以TSS即使进行了合并也并不是可靠的
            #total.exonDict[exonId].status = total.exonDict[exonId].status + "-TES"
            # 删除exonSet中的这些exon
            exonSet.discard(exonId)
            # 记录这一次校正
            exonCorrected["integrate TES"][exonId] = exonIdTarget

# Count the exons whose TSS or TES was merged during the integration step.
infoDict["exon integrate TSS"] = len(exonCorrected["integrate TSS"])
# The TES counter must read the TES record, not the TSS one.
infoDict["exon integrate TES"] = len(exonCorrected["integrate TES"])

In [None]:
# Refresh the exon mappings, merge duplicated genes / transcripts / exons and
# recompute expression (translated from the original note; these methods are
# defined on the Total object outside this cell).
total.refresh()
total.reIndex()

# Recompute the relative expression of exons, transcripts and genes.
total.computeRelativeExpression()
total.computeCellLineExpression()

# Normalise the exon status labels: an exon whose end(s) were fixed against a
# trusted end is reported as CORRECTED, and the suffixes are always written in
# TSS-then-TES order. Any status not listed here is left unchanged.
statusNormalised = {
    "CORRECTED-TES-TSS": "CORRECTED-TSS-TES",
    "NOVEL-TES-TSS": "CORRECTED-TSS-TES",
    # "-TES" is appended to the existing status, so a NOVEL exon that already
    # carries "-TSS" ends up with this label; it must be normalised as well.
    "NOVEL-TSS-TES": "CORRECTED-TSS-TES",
    "NOVEL-TSS": "CORRECTED-TSS",
    "NOVEL-TES": "CORRECTED-TES",
}
for exonObject in total.exonDict.values():
    status = exonObject.status
    if status in statusNormalised:
        exonObject.status = statusNormalised[status]

# Record the exon and transcript counts after the exon integration step.
infoDict["total exon num after exon integrate"] = len(total.exonDict)
# Exons whose status is still exactly "NOVEL".
num = sum(1 for exonObject in total.exonDict.values() if exonObject.status == "NOVEL")
infoDict["novel exon num after exon integrate"] = num
# Every transcript held by every gene.
num = sum(
    1
    for geneObject in total.geneDict.values()
    for _ in geneObject.transcriptDict.keys()
)
infoDict["total transcript num after exon integrate"] = num
# Transcripts whose status is still exactly "NOVEL".
num = sum(
    1
    for geneObject in total.geneDict.values()
    for transcriptObject in geneObject.transcriptDict.values()
    if transcriptObject.status == "NOVEL"
)
infoDict["novel transcript num after exon integrate"] = num

In [None]:
# Display the recorded changes in exon and transcript counts.
infoDict

In [None]:
'''
之前的exon校正方法效果较差, 在经过校正后仍存在大量的NOVEL类型的exon, 现在更改了exon的校正及合并方法
校正方法: 
    1. 对于每个exon, 寻找基因组注释中该exon的start与end上下游各5bp范围内的所有exon, 共有四种映射类型
        1. 双端单映射, 该exon的start及end端都可映射到基因组注释中的同一个exon
        2. 双端双映射, 该exon的start及end端可分别映射到基因组注释中的exon
        3. 单端单映射, 该exon仅start端可映射到基因组注释中的exon
        4. 单端单映射, 该exon仅end端可映射到基因组注释中的exon
    2. 筛选参考exon
        优先级: 双端单映射 > 双端双映射 > 单端单映射
        1. 在这些参考exon的集合中, 优先筛选出双端单映射的exon, 并进一步筛选出相差碱基数最少的exon作为最终的参考exon
        2. 其次才筛选双端双映射的exon, 并筛选出相差碱基数最少的exon作为最终的参考exon
        3. 最后才筛选单端单映射的exon作为参考exon
    3. 校正exon
        根据参考exon的信息对当前exon的start及end进行修改
合并方法:
    1. 对于已记录的exon, 分别筛选出start, end未经过校正的exon
    2. 以合并start端为例
    3. 寻找其中每个exon的start上下游各5bp范围内的所有exon, 当在该范围内寻找到exon时再以该exon为中心继续寻找其附近的exon, 最终形成一个集合, 这个集合中所有exon的start上下游5bp范围内不再有其他exon
    4. 若这个集合中存在已校正的start端, 则将所有未校正的exon的start修改为距离其最近的已校正的start
    5. 若这个集合中不存在已校正的start端, 则对每个exon取其在所有细胞系中的平均表达值, 选择表达最高的exon的start作为集合中所有exon的start
'''

In [None]:
# debug
#with open("10002_exonCorrected.pkl", "wb") as file:
#    pickle.dump(total, file)

In [197]:
# debug: reload `total` from the debug pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
# NOTE(review): this path is relative to the working directory, unlike the FILEPATH-based paths in the config cell.
with open("10002_exonCorrected.pkl", "rb") as file:
    total = pickle.load(file)

---

start

In [None]:
'''
根据参考基因组进行校正的思路
0. 处理exon
    0.0 声明变量
        exonCorrected, dict, {"mapped": {<novel exon id>: <known exon id>, ...},
                              "TSS": {<novel exon id>: <known exon id>, ...},
                              "TES": {<novel exon id>: <known exon id>, ...},
                              "TSS-TES": {<novel exon id>: <known exon id>, ...}}
        readyCorList, list, 存储status为NOVEL的exonId[<exonId>, ...]
    0.1 引用变量
        EXONRANGE, int, exon的start及end允许的偏差范围
    已弃置 0.2 根据end建立索引
    0.3 根据参考基因组中exon数据对exon的TSS及TES进行校正
        因为三代测序中出现的错误绝大部分属于碱基的插入/缺失, 所以需要对exon的TSS/TES进行校正
        读取参考基因组注释中exon的数据, 并建立index
        统计校正前exon及transcript的数量
        遍历total.exonDict中所有exon对象, 判断每个exon上下游exonRange范围内是否存在参考exon
        声明list类型变量exonList, 存储该exon所能映射的所有参考exon, [[<exonRef>, <bool>, <bool>], ...]
        准备校正exon的TSS及TES
            获取[TSS-5, TES+5]范围的所有exon
                从中筛选出[TES-5, TES+5]范围的所有exon, 认为这些exon为可双端映射的exon
                未筛选出的exon则认为是仅可映射到TSS的exon
            获取[TES-5, TES+5]范围的所有exon
                从中筛选出(-oo, TSS-5)U(TSS+5, +oo)的所有exon, 认为这些exon为仅可映射到TES的exon
            在获取的所有exon中
                优先筛选出可双端映射的exon, 并在这些exon中选择总体距离最近的exon进行映射
                其次才筛选出仅可单端映射的exon, 并在这些exon中选择离单端距离最近的exon进行映射
        校正过程
            若不存在映射, 则跳过该exon, 分析下一个exon
            若存在双端映射exon
                获取相差碱基数最少的exon作为参考exon
                判断exonId是否相同
                若exonId相同, 则修改status, exonVersion, start, TSS, end, TES
                若exonId不同, 则查询参考exonId是否已记录
                    若参考exonId已记录, 则合并两exon
                    若参考exonId未记录
                        exonDict中新增参考exon对象
                        合并原exon与参考exon
                    将这种映射关系添加到exonCorrected["mapped"]中
            若仅存在单端映射exon
                如果status为KNOWN就不再进行映射, 跳过该exon
                修改exon的status为CORRECTED
                判断exon能否双端映射到两个exon
                    能映射到两个exon
                        在exon的status中添加-TSS-TES后缀
                        获取TSS端相差碱基数最少的exon
                        修改exon的start,end,TSS的位置
                        获取TES端相差碱基数最少的exon
                        修改exon的start,end,TES的位置
                        将映射关系{<exonId>: <newExonId>}保存到exonCorrected["TSS-TES"]中
                    不能映射到两个exon, 则进行下一步
                筛选出相差碱基数最少的exon进行映射
                若仅TSS端可映射
                    在exon的status中添加-TSS后缀
                    修改exon的start,end,TSS的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["TSS"]中
                若仅TES端可映射
                    在exon的status中添加-TES后缀
                    修改exon的start,end,TES的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["TES"]中
    已弃置 因为已根据参考基因组进行了校正
        0.3 处理total中的exon数据
        遍历total.exonDict中所有.status=="NOVEL"的exon, 记录这些exon
        声明list类型变量exonList, 存储该exon所能映射的所有参考exon, [[<exonRef>, <bool>, <bool>], ...]
        准备校正exon的start及end
            获取[start-5, start+5]范围的所有exon
                从中筛选出[end-5, end+5]范围的所有exon, 认为这些exon为可双端映射的exon
                未筛选出的exon则认为是仅可映射到start的exon
            获取[end-5, end+5]范围的所有exon
                从中筛选出(-oo, start-5)U(start+5, +oo)的所有exon, 认为这些exon为仅可映射到end的exon
            在获取的所有exon中
                优先筛选出可双端映射的exon, 并在这些exon中选择总体距离最近的exon进行映射
                其次才筛选出仅可单端映射的exon, 并在这些exon中选择离单端距离最近的exon进行映射
        校正过程
            若exon可双端映射到一个exon
                获取相差碱基数最少的exon
                在total.exonExisted中添加该映射
                删除total.exonDict中该exon对象
                将映射关系{<exonId>: <newExonId>}保存到exonCorrected["mapped"]中
            若exon不可双端映射到一个exon
                修改exon的status为CORRECTED
                判断exon能否双端映射到两个exon
                    能映射到两个exon
                        在exon的status中添加-start-end后缀
                        获取start端相差碱基数最少的exon
                        修改exon的start的位置
                        获取end端相差碱基数最少的exon
                        修改exon的end的位置
                        将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start-end"]中
                    不能映射到两个exon, 则进行下一步
                筛选出相差碱基数最少的exon进行映射
                若仅start端可映射
                    在exon的status中添加-start后缀
                    修改exon的start的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start"]中
                若仅end端可映射
                    在exon的status中添加-end后缀
                    修改exon的end的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["end"]中
    0.4 更新total中的exonDict及exonIndex
        更新transcriptDict以合并可能重复的transcript(因为exon组成可能重复)
        重建Index, 合并重复的exon及transcript
    0.5 合并相邻的exon, 即合并相差EXONRANGE范围内的NOVEL的exon
        情况说明
            在合并距离较近的exon时可能会出现这么一种情况
            当EXONRANGE=5时, 200可以合并到205, 205又可以合并到210, 所以200就可以合并到210
            因此需要对这种情况进行处理, 而不是简单的将两exon根据EXONRANGE进行合并
            所以需要先形成一个簇, 然后对这个簇中所有的exon的TSS或TES进行校正
        准备记录合并记录 
        建立index
            声明indexTSS, {<chr>: {<TSS>: {<TES>: <exonId>, ...}, ...}, ...}
            声明indexTES, {<chr>: {<TES>: {<TSS>: <exonId>, ...}, ...}, ...}
            遍历total.exonDict
                过滤掉所有status==KNOWN的exon
                若status中不含-TSS, 则在indexTSS中添加该exon信息
                若status中不含-TES, 则在indexTES中添加该exon信息
                否则, continue
        遍历total.exonDict中所有status不为KNOWN及其中一端未校正的exon
            处理TSS
                声明set变量exonSet, {<exonId>, ...}
                遍历所有exon, 去除KNOWN或TSS已校正的exon, 将剩余的exon添加到exonSet
                只要exonSet不为空集, 就随机提取exonSet中的一个exonId
                    temp中添加所有与该exon的TSS相同的exon
                    寻找其TSS上下游各EXONRANGE范围内的exon
                        若寻找不到其他的exon, 就认为该exon的TSS无法修改, 跳过该exon
                        若能寻找到其他的exon, 就进行下一步
                判断在temp中是否存在已校正端
                    分离其中的未校正端与已校正端
                    若存在已校正端
                        对每一个未校正端选择其距离最近的已校正端作为参考位点进行校正
                            寻找最近的已校正端
                            修改TSS, start/end位置
                            改变status
                            删除exonSet中该exonId
                        结束此次循环
                    若不存在已校正端, 则进行下一步
                以expression最大的exon为参照, 修改其他exon的TSS
                    寻找expression最大的exon
                    修改其他exon的TSS, start/end
                    修改status
                    删除exonSet中的这些exon
                删除exonSet中的这些exon
                示例
                    权重: 相对表达量
                        假设有7个exon, 从start=100开始寻找EXONRANGE=5范围的exonStart
                        exon0   95  200
                        exon1   99  201
                        exon2   99  200
                        exon3   100 200
                        exon4   104 204
                        exon5   106 206
                        exon6   107 210
                        exon7   110 210
                    分别递增, 递减寻找EXONRANGE范围内的exon
                        当递减寻找[100-5, 100)的start时, 寻找到了99
                            temp中添加[exon1, expression]
                            temp中添加[exon2, expression]
                            此时再根据99去递减寻找[99-5, 99)范围的start, 寻找到了95
                                temp中添加[exon0, expression]
                            此时再根据95去递减寻找[95-5, 95)范围的start, 寻找到了None
                                当寻找到None时, 该寻找过程终止
                        当递增寻找(100, 100+5]的start时, 寻找到了104
                            temp中添加[exon4, expression]
                            此时再根据104去递增寻找(104, 104+5]范围的start, 寻找到了106
                                temp中添加[exon5, expression]
                            此时再根据106去递增寻找(106, 106+5]范围的start, 寻找到了107
                                temp中添加[exon6, expression]
                            ...
                            此时再根据***去递增寻找(***, ***+5]范围的start, 寻找到了None
                                当寻找到None时, 寻找过程终止
                    当两个寻找过程都停止后, 则开始对temp中的exon的start进行校正
                        筛选temp中expression最大的exon, 作为参考exon
                        其他exon的start都将修改为参考exon的start
            处理TES
                声明list变量temp, [[<end>, <expression>], ...]
                权重: 相对表达量
                分别递增, 递减寻找EXONRANGE范围内的exon, 在temp中添加[exon, expression]
                当两个寻找过程都停止后, 则开始对temp中的exon的end进行校正
                    与start处理过程相似
    0.6 在所有exon都处理完后
        更新transcriptDict以合并可能重复的transcript(因为exon组成可能重复)
        重建Index
1. 处理transcript
    1.0 声明变量
        transcriptSet, set, 用于transcriptId {<transcriptId>, ...}
        transcriptCorrected, dict, 用于存储校正过程中transcript的变化情况
        transcriptGeneMap, dict, 存储transcriptId与geneId的映射关系
        totalTranscriptIndex, dict, 准备total中transcript的index
    1.1 读取参考基因组注释数据
        获取所有transcript信息
        补充transcript中的exon信息
        获取index, {"TSS": {<chr>: {<TSS>: {<TES>: [<transcriptId>, ...],
                                                         ...},
                                                 ...},
                                         ...},
                                 "TES": {<chr>: {<TES>: {<TSS>: [<transcriptId>, ...],
                                                         ...},
                                                 ...},
                                         ...}
                                 }
    1.2 统计校正前transcript的数量
    1.3 遍历所有transcript, 将所有transcriptId存储至变量transcriptSet中
    1.4 寻找transcriptSet中可映射的transcript
        声明变量transcriptList, 存储该transcript所能映射的所有参考transcript, [[<transcriptRef>, <num>, <bool>, <bool>], ...]
        根据TSS获取其上下游rangeTSS范围内的transcript
            在这些transcript中, 筛选TES上下游在rangeTES范围内的transcript, 认为这些transcript是可双端映射的transcript
            剩余的transcript则被认为是仅可校正TSS端的transcript
        根据TES获取其上下游rangeTES范围内的transcript
            在这些transcript中, 筛选TSS上下游不在rangeTSS范围内的transcript, 认为这些transcript是仅可校正TES端的transcript
    1.5 准备映射
        判断是否存在映射, 若不存在映射则跳过该transcript, 若存在映射则进行下一步
        判断映射的类型
            判断是否仅一个映射
                若仅单个映射就直接存储为transcriptRefSet={(<transcriptId>, <num>, <bool>, <bool>, <marker>, <mapped id>)}
            优先筛选出双端映射的transcript
                最优先筛选出transcriptId相同的transcript作为参考transcript
                其次筛选出exon组成最相似的transcript作为参考transcript
                    exon组成相似程度=参考transcript与该transcript的exon的重叠数量
                    若exon组成相似程度相同, 则筛选出TSS与TES相差碱基数最少的transcript作为参考transcript
                参考transcript存储为transcriptRefSet={(<transcriptId>, <num>, True, True, <marker>, <mapped id>)}
            无双端映射transcript, 仅有单端映射transcript
                transcriptRefSet=set()  # 可能有一个元素, 也可能有两个元素
                TSS端
                    最优先筛选出transcriptId相同的transcript作为参考transcript
                    其次筛选出exon组成最相似的transcript作为参考transcript
                        exon组成相似程度=参考transcript与该transcript的exon的重叠数量
                        若exon组成相似程度相同, 则筛选出TSS与TES相差碱基数最少的transcript作为参考transcript
                    transcriptRefSet中添加(<transcriptIdRef>, <num>, True, False, <marker>, <mapped id>)
                TES端
                    最优先筛选出transcriptId相同的transcript作为参考transcript
                    其次筛选出exon组成最相似的transcript作为参考transcript
                        exon组成相似程度=参考transcript与该transcript的exon的重叠数量
                        若exon组成相似程度相同, 则筛选出TSS与TES相差碱基数最少的transcript作为参考transcript
                    transcriptRefSet中添加(<transcriptIdRef>, <num>, False, True, <marker>, <mapped id>)
        遍历transcriptRefSet, 检查参考transcript是否已记录
            检查transcriptId是否已记录
                若参考transcriptId已记录, 认为该transcript已记录
                    准备合并两transcript
                    marker = 1
                    mapped id = None
                若参考transcriptId未记录, 则进行下一步
            检查geneObject.transcriptIndex中是否已记录该transcript
                若已记录, 则认为该transcript已记录
                    准备合并两transcript
                    marker = 2
                    mapped id = 参考transcript被记录的Id
                若未记录, 则认为该transcript未记录, 进行下一步
                    marker = 3
                    mapped id = None
    1.6 开始映射
        检查transcriptRefSet的长度为2则分别映射到两transcript, 否则为映射到单个transcript
        若长度为2, 分别映射到两transcript
            检查status, 若status为KNOWN就不再映射, 否则进行下一步
            遍历transcriptRefSet
                修改transcript的TSS, start/end
                修改transcript的TES, start/end
                修改transcript的status为CORRECTED-TSS-TES
                记录该映射到transcriptCorrected["TSS-TES"]
        若长度为1, 映射到单个transcript
            若双端映射
                检查参考transcript的记录状态marker
                    若参考transcript已记录
                        合并两transcript
                        记录这一次合并到transcriptCorrected["mapped"]
                    若参考transcript未记录
                        在对应的geneRefObject中添加参考transcript对象
                        合并两transcript
                        记录这一次合并到transcriptCorrected["mapped"]
            若单端映射
                检查status, 若status为KNOWN就不再映射, 否则进行下一步
                检查参考transcript的记录状态marker
                    若仅TSS端可映射
                        修改transcript的TSS, start/end
                        修改transcript的status为CORRECTED-TSS
                        记录该映射到transcriptCorrected["TSS"]
                    若仅TES端可映射
                        修改transcript的TES, start/end
                        修改transcript的status为CORRECTED-TES
                        记录该映射到transcriptCorrected["TES"]
    1.7 重建index, 重新计算relativeExpression及cellLineExpression
2. 处理gene
3. 重新计算relativeExpression及cellLineExpression
'''

In [9]:
# Containers used by the transcript correction stage.
transcriptSet = set()  # {(<geneId>, <transcriptId>), ...}
transcriptCorrected = {
    key: {}
    for key in ("mapped", "TSS-TES", "TSS", "TES", "integrate TSS", "integrate TES")
}
# Map every transcriptId to the id of the gene that holds it.
transcriptGeneMap = {
    transcriptId: geneId
    for geneId, geneObject in total.geneDict.items()
    for transcriptId in geneObject.transcriptDict.keys()
}
transcriptGeneMapKeys = set(transcriptGeneMap)
# Index of the transcripts currently stored in total, built with siteType="TSS".
totalTranscriptIndex = total.transcriptIndexBuild(siteType="TSS")

In [38]:
# Read the transcript records from the reference genome annotation.
temp = loadRefAnnotation(allowType="transcript", filename=INPUTREFANNOTATION)
# "dataRef" is kept as the transcript information, "positionRef" as the index.
transcriptRef, transcriptIndex = temp["dataRef"], temp["positionRef"]

In [106]:
# Read the gene records from the reference genome annotation.
temp = loadRefAnnotation(allowType="gene", filename=INPUTREFANNOTATION)
# "dataRef" is kept as the gene information, "positionRef" as the index.
geneRef, geneIndex = temp["dataRef"], temp["positionRef"]

In [49]:
# Read the exon records from the reference genome annotation.
temp = loadRefAnnotation(allowType="exon", filename=INPUTREFANNOTATION)
# "dataRef" is kept as the exon information, "positionRef" as the index.
exonRef, exonIndex = temp["dataRef"], temp["positionRef"]

In [60]:
# Attach to every reference transcript the set of exon ids that belong to it.
keys = set(transcriptRef)
for exonId in exonRef:
    exonDict = exonRef[exonId]
    # Each entry pairs a gene id with a transcript id that uses this exon.
    for (geneId, transcriptId) in exonDict["geneTranscriptIdSet"]:
        # Create the transcript's "exonSet" on first use, then record the exon.
        exonSet = transcriptRef[transcriptId].setdefault("exonSet", set())
        exonSet.add(exonId)

In [81]:
# Record the transcript counts before the transcript correction step.
# Every transcript held by every gene.
num = sum(
    1
    for geneObject in total.geneDict.values()
    for _ in geneObject.transcriptDict.values()
)
infoDict["total transcript num before transcript correct"] = num
# Transcripts whose status is exactly "NOVEL".
num = sum(
    1
    for geneObject in total.geneDict.values()
    for transcriptObject in geneObject.transcriptDict.values()
    if transcriptObject.status == "NOVEL"
)
infoDict["novel transcript num before transcript correct"] = num

In [84]:
# Collect the (geneId, transcriptId) pair of every transcript whose status is "NOVEL".
transcriptSet.update(
    (geneId, transcriptId)
    for geneId, geneObject in total.geneDict.items()
    for transcriptId, transcriptObject in geneObject.transcriptDict.items()
    if transcriptObject.status == "NOVEL"
)

In [109]:
for geneId, transcriptId in tqdm.tqdm(transcriptSet, leave=True, desc="transcript"):
    # 寻找transcriptSet中可映射的transcript
    transcriptObject = total.geneDict[geneId].transcriptDict[transcriptId]
    chr = total.geneDict[geneId].chr
    TSS = transcriptObject.TSS
    TES = transcriptObject.TES
    exonSet = set(transcriptObject.exonList)
    # 存储该transcript所能映射的所有参考transcript
    transcriptList = []  # [[<transcriptRef>, <bool>, <bool>], ...]
    # 根据TSS获取其上下游rangeTSS范围内的transcript
    temp = [transcriptIndex["TSS"][chr].get(i, None) for i in range(TSS-RANGETSS, TSS+RANGETSS+1)]
    # 去除空值
    while None in temp:
        temp.remove(None)
    while {} in temp:
        temp.remove({})
        pass
    # 拆解list
    if temp:
        temp = [{tempTES: tempId} for d in temp  for tempTES, tempIdList in d.items() for tempId in tempIdList]
    for d in temp:
        for tempTES, tempTranscriptId in d.items():
            if abs(tempTES-TES) <= RANGETES:
                # 从中筛选出[TES-RANGETES, TES+RANGETES]范围的所有transcript, 认为这些transcript为可双端映射的transcript
                num = abs(TSS-transcriptRef[tempTranscriptId]["TSS"]) + abs(TES-tempTES)
                transcriptList.append((tempTranscriptId, num, True, True))
            else:
                # 仅TSS可映射到tempTranscript
                num = abs(TSS-transcriptRef[tempTranscriptId]["TSS"])
                transcriptList.append((tempTranscriptId, num, True, False))
    # 根据TES获取其上下游rangeTES范围内的transcript
    temp = [transcriptIndex["TES"][chr].get(i, None) for i in range(TES-RANGETES, TES+RANGETES+1)]
    # 去除空值
    while None in temp:
        temp.remove(None)
    while {} in temp:
        temp.remove({})
        pass
    # 拆解list
    if temp:
        temp = [{tempTES: tempId} for d in temp  for tempTES, tempIdList in d.items() for tempId in tempIdList]
    for d in temp:
        for tempTSS, tempTranscriptId in d.items():
            # 从中筛选出(-oo, TSS-RANGETSS)U(TSS+RANGETSS, +oo)的所有transcript, 认为这些transcript为仅可映射到TES的transcript
            if abs(tempTSS-TSS) > RANGETSS:
                # 仅TES可映射到tempTranscript
                num = abs(TES-transcriptRef[tempTranscriptId]["TES"])
                transcriptList.append((tempTranscriptId, num, False, True))
    # 准备映射
    # 判断是否存在映射
    if not transcriptList:
        # 不存在映射, 跳过该transcript
        continue
    # 判断映射的类型
    # 判断是否仅一个映射
    if len(transcriptList) == 1:
        # 仅单个映射仅单个映射
        (transcriptIdRef, num, markerTSS, markerTES) = transcriptList[0]
        transcriptRefSet = {(transcriptIdRef, num, markerTSS, markerTES, None, None)}
    else:
        # 优先筛选出双端映射的transcript
        doubleMapped = []
        for (transcriptIdRef, num, markerTSS, markerTES) in transcriptList:
            if markerTSS == markerTES == True:
                doubleMapped.append([transcriptIdRef, num, markerTSS, markerTES])
        # 判断是否存在双端映射的transcript
        if doubleMapped:
            # 存在双端映射的transcript
            doubleMapped = sorted(doubleMapped, key=lambda x: x[1])
            # 最优先筛选出transcriptId相同的transcript作为参考transcript
            sameTranscriptId = False
            for [transcriptIdRef, num, markerTSS, markerTES] in doubleMapped:
                if transcriptId == transcriptIdRef:
                    # 存在transcriptId相同的项
                    transcriptRefSet = {(transcriptIdRef, num, markerTSS, markerTES, None, None)}
                    sameTranscriptId = True
                    break
            if sameTranscriptId is True:
                pass
            else:
                # 不存在transcriptId相同的项
                # 其次筛选出exon组成最相似的transcript
                # exon组成相似程度=参考transcript与该transcript的exon的重叠数量
                exonCombinationValueList = [(transcriptIdRef, num, len(transcriptRef[transcriptIdRef]["exonSet"].intersection(exonSet))) for [transcriptIdRef, num, markerTSS, markerTES] in doubleMapped]
                # 若exon组成相似程度相同, 则筛选出TSS与TES相差碱基数最少的transcript
                exonCombinationValueList = sorted(exonCombinationValueList, key=lambda x: (-x[2], x[1]))
                transcriptRefSet = {(transcriptIdRef, num, markerTSS, markerTES, None, None) for [transcriptIdRef, num, markerTSS, markerTES] in doubleMapped if transcriptIdRef==exonCombinationValueList[0][0]}
        else:
            # 不存在双端映射的transcript
            transcriptRefSet=set()  # 可能有一个元素, 也可能有两个元素
            # TSS端
            transcriptTSSList = [(transcriptIdRef, num, markerTSS, markerTES) for (transcriptIdRef, num, markerTSS, markerTES) in transcriptList if markerTSS is True]
            # 最优先筛选出transcriptId相同的transcript作为参考transcript
            sameTranscriptId = False
            for (transcriptIdRef, num, markerTSS, markerTES) in transcriptTSSList:
                if transcriptId == transcriptIdRef:
                    # 存在transcriptId相同的项
                    transcriptRefSet = transcriptRefSet.union({(transcriptId, num, markerTSS, markerTES, None, None)})
                    sameTranscriptId = True
                    break
            if sameTranscriptId is True:
                pass
            else:
                # 不存在transcriptId相同的项
                # 其次筛选出exon组成最相似的transcript
                # exon组成相似程度=参考transcript与该transcript的exon的重叠数量
                exonCombinationValueList = [(transcriptIdRef, num, len(transcriptRef[transcriptIdRef]["exonSet"].intersection(exonSet))) for (transcriptIdRef, num, markerTSS, markerTES) in transcriptTSSList]
                # 若exon组成相似程度相同, 则筛选出TSS与TES相差碱基数最少的transcript
                exonCombinationValueList = sorted(exonCombinationValueList, key=lambda x: (-x[2], x[1]))
                transcriptRefSet = transcriptRefSet.union({(transcriptIdRef, num, markerTSS, markerTES, None, None) for [transcriptIdRef, num, markerTSS, markerTES] in transcriptTSSList if transcriptIdRef==exonCombinationValueList[0][0]})
            # TES端
            transcriptTESList = [(transcriptIdRef, num, markerTSS, markerTES) for (transcriptIdRef, num, markerTSS, markerTES) in transcriptList if markerTES is True]
            # 最优先筛选出transcriptId相同的transcript作为参考transcript
            sameTranscriptId = False
            for (transcriptIdRef, num, markerTSS, markerTES) in transcriptTESList:
                if transcriptId == transcriptIdRef:
                    # 存在transcriptId相同的项
                    transcriptRefSet = transcriptRefSet.union({(transcriptId, num, markerTSS, markerTES, None, None)})
                    sameTranscriptId = True
                    break
            if sameTranscriptId is True:
                pass
            else:
                # 不存在transcriptId相同的项
                # 其次筛选出exon组成最相似的transcript
                # exon组成相似程度=参考transcript与该transcript的exon的重叠数量
                exonCombinationValueList = [(transcriptIdRef, num, len(transcriptRef[transcriptIdRef]["exonSet"].intersection(exonSet))) for (transcriptIdRef, num, markerTSS, markerTES) in transcriptTESList]
                # 若exon组成相似程度相同, 则筛选出TSS与TES相差碱基数最少的transcript
                exonCombinationValueList = sorted(exonCombinationValueList, key=lambda x: (-x[2], x[1]))
                transcriptRefSet = transcriptRefSet.union({(transcriptIdRef, num, markerTSS, markerTES, None, None) for [transcriptIdRef, num, markerTSS, markerTES] in transcriptTESList if transcriptIdRef==exonCombinationValueList[0][0]})
    # 检查参考transcript是否已记录
    temp = [[transcriptIdRef, num, markerTSS, markerTES, marker, mapped] for (transcriptIdRef, num, markerTSS, markerTES, marker, mapped) in transcriptRefSet]
    for index, value in enumerate(temp):
        transcriptIdRef = value[0]
        if transcriptIdRef in transcriptGeneMapKeys:
            # 参考transcriptId已记录, 认为该transcript已记录
            temp[index][4] = 1
            temp[index][5] = None
        else:
            # 检查geneObject.transcriptIndex中是否已记录该transcript
            transcriptRefChr = transcriptRef[transcriptIdRef]["chr"]
            transcriptRefTSS = transcriptRef[transcriptIdRef]["TSS"]
            transcriptRefTES = transcriptRef[transcriptIdRef]["TES"]
            transcriptRefExonSet = transcriptRef[transcriptIdRef]["exonSet"]
            if transcriptRefTSS not in totalTranscriptIndex[transcriptRefChr].keys():
                # 未记录该transcript
                temp[index][4] = 3
                temp[index][5] = None
            else:
                if transcriptRefTES not in totalTranscriptIndex[transcriptRefChr][transcriptRefTSS].keys():
                    # 未记录该transcript
                    temp[index][4] = 3
                    temp[index][5] = None
                else:
                    markerBreak = False
                    transcriptIdListTemp = totalTranscriptIndex[transcriptRefChr][transcriptRefTSS][transcriptRefTES]
                    for transcriptIdTemp in transcriptIdListTemp:
                        transcriptTempExon = set(total.geneDict[transcriptGeneMap[transcriptIdTemp]].transcriptDict[transcriptIdTemp].exonList)
                        if transcriptTempExon == transcriptRefExonSet:
                            # 已记录该transcript
                            markerBreak = True
                            temp[index][4] = 2
                            temp[index][5] = transcriptIdTemp
                            break
                    if markerBreak is True:
                        pass
                    else:
                        # 未记录该transcript
                        temp[index][4] = 3
                        temp[index][5] = None
    transcriptRefSet = {(transcriptIdRef, num, markerTSS, markerTES, marker, mapped) for [transcriptIdRef, num, markerTSS, markerTES, marker, mapped] in temp}
    # 开始映射
    if len(transcriptRefSet) == 2:
        # 分别映射到两transcript
        if transcriptObject.status == "KNOWN":
            continue
        transcriptCorrected["TSS-TES"] = {transcriptId: {"TSS": None, "TES": None}}
        # 对transcript进行校正
        total.geneDict[geneId].transcriptDict[transcriptId].status = "CORRECTED-TSS-TES"
        for (transcriptIdRef, num, markerTSS, markerTES, marker, mapped) in transcriptRefSet:
            if markerTSS is True:
                total.geneDict[geneId].transcriptDict[transcriptId].TSS = transcriptRef[transcriptIdRef]["TSS"]
                if total.geneDict[geneId].strand == '+':
                    total.geneDict[geneId].transcriptDict[transcriptId].start = transcriptRef[transcriptIdRef]["start"]
                else:
                    total.geneDict[geneId].transcriptDict[transcriptId].end = transcriptRef[transcriptIdRef]["end"]
                transcriptCorrected["TSS-TES"][transcriptId]["TSS"] = transcriptIdRef
            elif markerTES is True:
                total.geneDict[geneId].transcriptDict[transcriptId].TES = transcriptRef[transcriptIdRef]["TES"]
                if total.geneDict[geneId].strand == '+':
                    total.geneDict[geneId].transcriptDict[transcriptId].end = transcriptRef[transcriptIdRef]["end"]
                else:
                    total.geneDict[geneId].transcriptDict[transcriptId].start = transcriptRef[transcriptIdRef]["start"]
                transcriptCorrected["TSS-TES"][transcriptId]["TES"] = transcriptIdRef
    else:
        # 映射到单个transcript
        (transcriptIdRef, num, markerTSS, markerTES, marker, mapped) = transcriptRefSet.pop()
        if markerTSS == markerTES == True:
            # 双端映射
            if marker == 1:
                # 参考transcriptId已记录
                # 查询参考transcript所属gene是否已记录
                geneIdRef = transcriptRef[transcriptIdRef]["geneId"]
                total._transcriptMerge(transcriptId1=transcriptIdRef,
                                       transcriptId2=transcriptId,
                                       geneId1=geneIdRef,
                                       geneId2=geneId)
            elif marker == 2:
                # 参考transcript已记录, id为mapped
                total._transcriptMerge(transcriptId1=mapped,
                                       transcriptId2=transcriptId,
                                       geneId1=transcriptGeneMap[mapped],
                                       geneId2=geneId)
            else:
                # 参考transcript未记录
                geneIdRef = transcriptRef[transcriptIdRef]["geneId"]
                if geneIdRef not in total.geneDict.keys():
                    # 参考transcript所属的参考gene未记录, 需要进行记录
                    total.geneDict[geneIdRef] = Gene(status="KNOWN",
                                                     chr=transcriptRef[transcriptIdRef]["chr"],
                                                     strand=transcriptRef[transcriptIdRef]["strand"],
                                                     start=geneRef[geneIdRef]["start"],
                                                     end=geneRef[geneIdRef]["end"],
                                                     TSS=geneRef[geneIdRef]["TSS"],
                                                     TES=geneRef[geneIdRef]["TES"],
                                                     geneId=geneIdRef,
                                                     geneVersion=geneRef[geneIdRef]["geneVersion"],
                                                     geneName=geneRef[geneIdRef]["geneName"],
                                                     geneBiotype=geneRef[geneIdRef]["geneBiotype"],)
                total.geneDict[geneIdRef].transcriptDict[transcriptIdRef] = Transcript(status="KNOWN",
                                                                                        transcriptId=transcriptIdRef,
                                                                                        transcriptVersion=transcriptRef[transcriptIdRef]["transcriptVersion"],
                                                                                        transcriptName=transcriptRef[transcriptIdRef]["transcriptName"],
                                                                                        transcriptBiotype=transcriptRef[transcriptIdRef]["transcriptBiotype"],
                                                                                        start=transcriptRef[transcriptIdRef]["start"],
                                                                                        end=transcriptRef[transcriptIdRef]["end"],
                                                                                        exonList=list(transcriptRef[transcriptIdRef]["exonSet"]))
                total._transcriptMerge(transcriptId1=transcriptIdRef,
                                       transcriptId2=transcriptId,
                                       geneId1=transcriptRef[transcriptIdRef]["geneId"],
                                       geneId2=geneId)
        else:
            # 单端映射
            # 若status为KNOWN就不再映射
            if transcriptObject.status == "KNOWN":
                continue
            if markerTSS is True:
                # 仅TSS端可映射
                total.geneDict[geneId].transcriptDict[transcriptId].status = "CORRECTED-TSS"
                total.geneDict[geneId].transcriptDict[transcriptId].TSS = transcriptRef[transcriptIdRef]["TSS"]
                transcriptCorrected["TSS"][transcriptId] = transcriptIdRef
                if total.geneDict[geneId].strand == '+':
                    total.geneDict[geneId].transcriptDict[transcriptId].start = transcriptRef[transcriptIdRef]["start"]
                else:
                    total.geneDict[geneId].transcriptDict[transcriptId].end = transcriptRef[transcriptIdRef]["end"]
            else:
                # 仅TES端可映射
                total.geneDict[geneId].transcriptDict[transcriptId].status = "CORRECTED-TES"
                total.geneDict[geneId].transcriptDict[transcriptId].TES = transcriptRef[transcriptIdRef]["TES"]
                if total.geneDict[geneId].strand == '+':
                    total.geneDict[geneId].transcriptDict[transcriptId].end = transcriptRef[transcriptIdRef]["end"]
                else:
                    total.geneDict[geneId].transcriptDict[transcriptId].start = transcriptRef[transcriptIdRef]["start"]
                transcriptCorrected["TES"][transcriptId] = transcriptIdRef

    #print('\n', geneId, transcriptId)
    #print('transcriptList', transcriptList)
    #print("transcriptRefSet", transcriptRefSet)

transcript:   1%|          | 1074/183842 [00:00<00:22, 8128.64it/s]


KeyError: 'ENSG00000290832'

In [119]:
# Debug inspection: dump the gene stored in total (ENSG00000174353) next to the two
# reference-annotation entries (ENSG00000174353 and ENSG00000290832).
# NOTE(review): the statements run in order, so a KeyError on a later lookup still
# leaves the earlier dumps in the cell output.
print(total.geneDict["ENSG00000174353"].dictGet())
print(geneRef["ENSG00000174353"])
print(geneRef["ENSG00000290832"])

{'geneId': 'ENSG00000174353', 'geneVersion': 17, 'geneName': 'STAG3L3', 'geneBiotype': 'transcribed_unprocessed_pseudogene', 'status': 'KNOWN', 'strand': '-', 'chr': 'chr7', 'start': 72869658, 'end': 73005915, 'TSS': 73005915, 'TES': 72869658, 'transcript': 'ENCLB240AAXT000349248, ENCLB345FIST000374847, ENCLB240AAXT000349242, ENCLB315YZPT000366823, ENCLB315YZPT000366827, ENCLB315YZPT000366833, ENCLB345FIST000374837, ENST00000448173, ENCLB345FIST000374848, ENCLB315YZPT000366824, ENCLB345FIST000374851, ENCLB315YZPT000366831, ENST00000423834, ENCLB240AAXT000349247, ENCLB240AAXT000349241, ENCLB240AAXT000349246, ENCLB767XEKT000335181, ENST00000428423', 'exon': 'ENSE00003500777, ENSE00001721222, ENSE00001797769, ENSE00001635491, GSM6782552_1131077, GSM6783527_1108042, GSM6782551_1151377, ENSE00002477385, ENSE00002499054, GSM6783528_1159418, ENSE00003710880, ENSE00003709159, ENSE00003708642, ENSE00002578620, ENSE00001798213, ENSE00001599135, ENSE00003708645, GSM6782551_1151376, GSM6782551_115

In [115]:
# Debug inspection: ask total for its record of reference transcript ENST00000423834.
total.transcriptIdGet("ENST00000423834")

{'geneId': 'ENSG00000174353', 'transcriptId': 'ENST00000423834'}

In [112]:
# Debug inspection: print the ids currently bound to these notebook-level names.
# NOTE(review): the names are not assigned in this cell; they are presumably left over
# from an earlier (interrupted) loop, so this cell only works with that kernel state.
print(transcriptIdRef)
print(transcriptId)
print(transcriptRef[transcriptIdRef]["geneId"])
print(geneId)

ENST00000423834
ENCLB240AAXT000349247
ENSG00000290832
ENSG00000174353


In [148]:
'''
有部分transcript, 在total中该transcript属于geneTotal, 但在基因组注释中属于geneRef
需要将该transcript所属的gene由geneTotal映射到geneRef
    查询geneRef是否已记录
        若geneRefId已记录
            geneRef中新增该transcript对象
            geneTotal中删除该transcript对象
        若geneRefId未记录但geneRef已记录
            获取其映射的geneRefMapped
            查询该transcript是否已存在于
        若geneRef未记录
            记录geneRef对象
            在geneRef中新增该transcript对象
            删除geneTotal中的transcript对象
去除不包含transcript的gene
重建index
保存这种错误映射的数据到tsv中
'''
# transcriptId -> geneId as currently recorded in total
totalMap = {
    recordedTranscriptId: recordedGeneId
    for recordedGeneId, recordedGene in total.geneDict.items()
    for recordedTranscriptId in recordedGene.transcriptDict
}
# transcriptId -> geneId according to the reference annotation
annotationMap = {
    refTranscriptId: refInfo["geneId"]
    for refTranscriptId, refInfo in transcriptRef.items()
}
# Keep the transcripts whose gene in total differs from the annotated gene;
# transcripts unknown to the annotation are ignored.
errorMap = {}  # {<transcriptId>: {"total": <geneId>, "ref": <geneId>}}
for recordedTranscriptId, recordedGeneId in totalMap.items():
    annotatedGeneId = annotationMap.get(recordedTranscriptId)
    if annotatedGeneId is not None and recordedGeneId != annotatedGeneId:
        errorMap[recordedTranscriptId] = {"total": recordedGeneId, "ref": annotatedGeneId}

In [200]:
# For every mismapped transcript whose annotated gene is present in total, print
# 1 if that gene already holds the transcript and 0 otherwise, followed by the ids.
for transcriptId, genePair in errorMap.items():
    geneIdTotal, geneIdRef = genePair["total"], genePair["ref"]
    if geneIdRef not in total.geneDict:
        continue
    alreadyHeld = transcriptId in total.geneDict[geneIdRef].transcriptDict
    print(int(alreadyHeld), transcriptId, geneIdTotal, geneIdRef)


0 ENST00000508856 ENSG00000248632 ENSG00000183439
0 ENST00000463441 ENSG00000127399 ENSG00000188707
0 ENST00000521369 ENSG00000254615 ENSG00000164830
0 ENST00000577661 ENSG00000254615 ENSG00000164830
0 ENST00000430721 ENSG00000239665 ENSG00000165630
0 ENST00000359450 ENSG00000221995 ENSG00000196535
0 ENST00000395693 ENSG00000130489 ENSG00000284194
0 ENST00000442171 ENSG00000228106 ENSG00000154309
0 ENST00000435733 ENSG00000226377 ENSG00000236377
0 ENST00000598924 ENSG00000269814 ENSG00000142235
0 ENST00000603308 ENSG00000271127 ENSG00000206195
0 ENST00000624499 ENSG00000279355 ENSG00000026652
0 ENST00000568479 ENSG00000260822 ENSG00000067992


In [196]:
# Correct erroneous transcript -> gene assignments listed in errorMap.
#
# For each entry the transcript object is moved from the gene it is recorded under in
# total (geneIdTotal) to the gene named by the reference annotation (geneIdRef).
# The cell is safe to re-run: transcripts that are no longer under geneIdTotal
# (already moved, or errorMap is stale) are skipped instead of raising KeyError.
#
# TODO: reference genes that are recorded in total under a different (mapped) id are
# not resolved here; an earlier draft looked them up with total._geneCheck(...).
skippedMissingRefGene = 0  # transcripts whose reference gene is absent from total
for transcriptId, d in errorMap.items():
    geneIdTotal = d["total"]
    geneIdRef = d["ref"]

    if geneIdRef not in total.geneDict:
        # The reference gene is not recorded in total, so there is nowhere to move the
        # transcript to. TODO: register the gene from geneRef[geneIdRef] (the earlier
        # draft used total.geneAdd(status="KNOWN", ...)) and then move the transcript.
        skippedMissingRefGene += 1
        continue

    sourceGene = total.geneDict.get(geneIdTotal)
    if sourceGene is None or transcriptId not in sourceGene.transcriptDict:
        # Nothing left to move (e.g. the cell was already executed once).
        continue

    targetTranscripts = total.geneDict[geneIdRef].transcriptDict
    if transcriptId in targetTranscripts:
        # The reference gene already holds a transcript with this id.
        # NOTE(review): as in the original cell, the copy under geneIdTotal is still
        # removed below, so its data is dropped without being merged - confirm intended.
        print("[Warning]{id} have been existed in {r}, failed {t} --> {r}".format(id=transcriptId, r=geneIdRef, t=geneIdTotal))
    else:
        # Attach the transcript object to the reference gene.
        targetTranscripts[transcriptId] = sourceGene.transcriptDict[transcriptId]
    # Detach the transcript from the gene it was wrongly recorded under.
    sourceGene.transcriptDict.pop(transcriptId)
    print(transcriptId)

if skippedMissingRefGene:
    # Make the unhandled case visible instead of skipping it silently.
    print("[Warning]{n} transcript(s) skipped: reference gene not recorded in total".format(n=skippedMissingRefGene))

ENST00000508856
ENST00000463441
ENST00000521369
ENST00000577661
ENST00000430721
ENST00000359450
ENST00000395693
ENST00000442171
ENST00000435733
ENST00000598924
ENST00000603308
ENST00000624499
ENST00000568479


In [188]:
# Debug inspection: show the recorded ("total") and annotated ("ref") gene ids for one transcript.
errorMap["ENST00000452055"]

{'total': 'ENSG00000226287', 'ref': 'ENSG00000291085'}

In [199]:
# Debug inspection: compare countsExpression of ENST00000508856 under the gene it is
# recorded in (ENSG00000248632) and under the annotated gene (ENSG00000183439).
# The second lookup raises KeyError while the transcript is absent from that gene.
print(total.geneDict["ENSG00000248632"].transcriptDict["ENST00000508856"].countsExpression)
print(total.geneDict["ENSG00000183439"].transcriptDict["ENST00000508856"].countsExpression)

{'GSM6783527': 2}


KeyError: 'ENST00000508856'

In [180]:
# Debug inspection: dump gene ENSG00000226287 and its transcript ENST00000452055.
# NOTE(review): the same two dumps are printed twice; the second pair looks like a
# copy-paste leftover (possibly meant to show a different gene) - confirm.
print(total.geneDict["ENSG00000226287"].dictGet())
print(total.geneDict["ENSG00000226287"].transcriptDict["ENST00000452055"].dictGet())
print(total.geneDict["ENSG00000226287"].dictGet())
print(total.geneDict["ENSG00000226287"].transcriptDict["ENST00000452055"].dictGet())

{'geneId': 'ENSG00000226287', 'geneVersion': 8, 'geneName': 'TMEM191A', 'geneBiotype': 'transcribed_unprocessed_pseudogene', 'status': 'KNOWN', 'strand': '+', 'chr': 'chr22', 'start': 20701114, 'end': 20704606, 'TSS': 20701114, 'TES': 20704606, 'transcript': 'ENCLB315YZPT000326343, ENST00000452055, ENCLB315YZPT000326345, ENCLB240AAXT000313140', 'exon': 'GSM6782551_1099831, ENSE00001641633, GSM6782551_1099833, ENSE00001645121, GSM6782552_1083956, ENSE00001788859, ENSE00003924989, ENSE00001679851, ENSE00001706912, ENSE00001712385, ENSE00001762770'}
{'transcriptId': 'ENST00000452055', 'transcriptVersion': 5, 'transcriptName': 'TMEM191A-209', 'transcriptBiotype': 'retained_intron', 'status': 'KNOWN', 'start': 20702921, 'end': 20704606, 'TSS': 20702921, 'TES': 20704606, 'exonList': 'ENSE00001641633, ENSE00001645121, ENSE00001679851, ENSE00001706912, ENSE00001712385, ENSE00001762770'}
{'geneId': 'ENSG00000226287', 'geneVersion': 8, 'geneName': 'TMEM191A', 'geneBiotype': 'transcribed_unproces

---

暂存部分

In [125]:
# Path of a single sample's transcriptome annotation GTF, used for debugging.
# NOTE(review): hard-coded absolute local path; the notebook already defines
# FILEPATH / RAWDATAPATH in its configuration cells - consider deriving it from those.
debugFile = "F:/OneDrive/Master/Project/trans/data/raw_data/GSM6782551/GSM6782551_ENCFF856FNN_transcriptome_annotations_GRCh38.gtf"

In [None]:
# Second-level function
def findIdAdvanced(index, chr, TSS, TES, rangeTSS, rangeTES, allowType):
    '''
    Intended purpose (per the original notes): look up the ids in index that lie
    within rangeTSS of TSS, or within rangeTES of TES, upstream and downstream.

    input:
        index, dict
        chr, str
        TSS, int
        TES, int
        rangeTSS, int
        rangeTES, int
        allowType, str, "gene" or "transcript"
    return:
        documented as tuple, (id, markerTSS, markerTES); the current body simply
        returns the result of a single findId call.

    NOTE(review): this function is an unfinished draft, see the inline notes.
    '''
    # NOTE(review): the index argument is overwritten with the notebook-level
    # transcriptRef, so the caller's value is ignored.
    index = index = transcriptRef
    chr = chr
    TSS = TSS
    TES = TES
    # NOTE(review): both range arguments are overwritten with the notebook-level
    # constants RANGETSS / RANGETES, so the caller's values are ignored.
    rangeTSS = rangeTSS = RANGETSS
    rangeTES = rangeTES = RANGETES

    # NOTE(review): location and rangeMax are neither parameters nor locals; this call
    # raises NameError unless globals with those names exist in the kernel. TSS, TES,
    # rangeTSS and rangeTES are never passed on, and only orientation '-' is queried.
    a = findId(index=index, chr=chr, location=location, rangeMax=rangeMax, orientation='-', allowType=allowType)
    return a

In [None]:
# Ids of every transcript in total whose status is "NOVEL"
transcriptSet = {
    novelId
    for owningGene in total.geneDict.values()
    for novelId, candidate in owningGene.transcriptDict.items()
    if candidate.status == "NOVEL"
}

In [None]:
# Correct start and end using the reference genome annotation data.
# (Draft: so far this only derives TSS/TES for each transcript.)
for geneObject in total.geneDict.values():
    # Strand is read from the gene, as everywhere else in this notebook; the
    # Transcript objects built here are created without a strand argument.
    strand = geneObject.strand
    for transcriptObject in geneObject.transcriptDict.values():
        start = transcriptObject.start
        end = transcriptObject.end
        # On the '+' strand transcription runs start -> end; otherwise end -> start.
        # Compare with ==: `is` tests object identity and is not a reliable string test.
        (TSS, TES) = (start, end) if strand == '+' else (end, start)



end

---

---

In [None]:
# Distinct status values found across all exons in total
a = {exonEntry.status for exonEntry in total.exonDict.values()}
a

In [None]:
# Count exons per status. The reversed spellings "CORRECTED-TES-TSS" and
# "NOVEL-TES-TSS" are counted under their "-TSS-TES" counterparts; any status
# outside this table is printed instead of counted.
statusBucket = {
    "KNOWN": "KNOWN",
    "NOVEL": "NOVEL",
    "CORRECTED-TSS-TES": "CORRECTED-TSS-TES",
    "CORRECTED-TES-TSS": "CORRECTED-TSS-TES",
    "CORRECTED-TSS": "CORRECTED-TSS",
    "CORRECTED-TES": "CORRECTED-TES",
    "NOVEL-TSS": "NOVEL-TSS",
    "NOVEL-TES": "NOVEL-TES",
    "NOVEL-TSS-TES": "NOVEL-TSS-TES",
    "NOVEL-TES-TSS": "NOVEL-TSS-TES",
}
debug = {}
for exonEntry in total.exonDict.values():
    bucket = statusBucket.get(exonEntry.status)
    if bucket is None:
        print(exonEntry.status)
    else:
        debug[bucket] = debug.get(bucket, 0) + 1
debug

In [None]:
# Display the current contents of the debug dict.
debug

In [None]:
import plotly.graph_objects as go

# Pie chart of the exon correction outcome; the counts are entered by hand.
trace = go.Pie(
    labels=["novel exon", "start-end corrected", "start corrected", "end corrected", "known exon"],
    values=[67087, 13543, 20126, 23812, 190845],
    marker=dict(
        colors=["#505168", "#2A7F62", "#75DDDD", "#E8E1EF", "lightgrey"],
        line=dict(color="white", width=1),
    ),
    sort=False,
    textinfo="value",
    textfont_size=12,
    insidetextorientation='auto',
    hole=0,
    pull=[0.1, 0, 0, 0, 0],  # pull the first slice out of the pie
)
fig = go.Figure(trace)

layout = dict(
    width=500,
    height=500,
    margin=dict(l=25, r=25, t=25, b=25),
    title=dict(
        text="Correction of exon<br>Range: 5",
        font=dict(family="Arial", color="black", size=16),
        x=0.4,
    ),
    # Global font, applied everywhere unless overridden
    font=dict(family="Arial", color="black", size=12),
    paper_bgcolor="white",
    plot_bgcolor="white",
)
fig = fig.update_layout(layout)
'''
将最大校正范围设定在5bp时, 在标注为novel的74504个exon中, 有3558个exon可以完全映射到已注释exon上, 有16229个exon的start附近存在已注释exon, 有10967个exon的end附近存在已注释exon。
已根据其映射情况修改了exon的start或end的位置
'''

In [None]:
#fig.show()
# Write whatever figure is currently bound to fig to a PNG in the working directory.
# NOTE(review): fig is reassigned by more than one cell, so the exported image depends
# on which plotting cell was executed last.
fig.write_image("5exonBeforeIntegrate.png")

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One plot area with two y axes (primary and secondary)
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Exons corrected on both ends, for correction ranges 1..20
trace1 = go.Scatter(
    x=list(range(1, 21)),
    y=[213, 934, 1826, 3197, 3558, 3867, 4071, 4233, 4416, 4559,
       4677, 4825, 4914, 5005, 5138, 5225, 5308, 5437, 5546, 5630],
    mode="lines+markers",
    name="start-end exon",
    line=dict(color="#2A7F62"),
)
# Exons still novel after correction, for correction ranges 1..20
trace2 = go.Scatter(
    x=list(range(1, 21)),
    y=[44356, 44197, 44197, 44197, 43750, 43665, 43603, 43549, 43549, 43476,
       43445, 43410, 43410, 43346, 43327, 43298, 43278, 43257, 43232, 43206],
    mode="lines+markers",
    name="novel exon",
    line=dict(color="#505168"),
)

# First trace on the primary y axis, second trace on the secondary y axis
fig.add_trace(trace1, secondary_y=False)
fig.add_trace(trace2, secondary_y=True)

# Figure layout
layout = dict(
    width=700,
    height=400,
    margin=dict(l=25, r=25, t=25, b=25),
    title=dict(
        text="",
        font=dict(family="Arial", color="black", size=16),
        x=0.4,
    ),
    # Global font, applied everywhere unless overridden
    font=dict(family="Arial", color="black", size=12),
    xaxis=dict(showline=True, linecolor="black", linewidth=1, showticklabels=True, dtick=2),
    yaxis=dict(showline=True, linecolor="black", linewidth=1, showticklabels=True),
    yaxis2=dict(showline=True, linecolor="black", linewidth=1, showticklabels=True),
    paper_bgcolor="white",
    plot_bgcolor="white",
)
fig = fig.update_layout(layout)
# Titles are applied in a second pass, replacing the empty title text set above
fig = fig.update_layout(
    title_text="Correction of exons in different ranges",
    xaxis_title="Range",
    yaxis_title="Counts of start-end exon",
    yaxis2_title="Counts of novel exon",
)

# Show the figure
#fig.write_image("line.png")
fig.show()
'''
之所以选择exon的最大校正范围为5bp, 是因为已经分析了在最大校正范围为1~20时的校正情况。
当范围设定在5bp时, novel exon的减少速度趋于稳定, 且在所有exon中双端可映射到已注释exon的数量的增加速度趋于稳定。
'''

不根据参考基因组注释进行校正:

校正前total exon num: 323363
校正前novel exon num: 133311
校正前total transcript num: 213387
校正前novel transcript num: 185981
mapped 6095
start-end 6503
start 22997
end 19860
校正后total exon num: 309350
校正后novel exon num: 72014
校正后total transcript num: 211068
校正后novel transcript num: 183662

根据参考基因组注释进行校正:



In [None]:
'''	novel exon	start-end exon	start exon	end exon	remained exon
1	74504	213	17626	12309	44356
2	74504	934	17311	12062	44197
3	74504	1826	16935	11713	44197
4	74504	3197	16355	11107	44197
5	74504	3558	16229	10967	43750
6	74504	3867	16119	10853	43665
7	74504	4071	16029	10801	43603
8	74504	4233	15961	10761	43549
9	74504	4416	15877	10704	43549
10	74504	4559	15802	10667	43476
11	74504	4677	15753	10629	43445
12	74504	4825	15694	10575	43410
13	74504	4914	15651	10557	43410
14	74504	5005	15621	10532	43346
15	74504	5138	15543	10496	43327
16	74504	5225	15501	10480	43298
17	74504	5308	15452	10466	43278
18	74504	5437	15395	10415	43257
19	74504	5546	15337	10389	43232
20	74504	5630	15297	10371	43206'''
