In [1]:
import pandas
import gzip
import time
import argparse
import pickle
from Class import *
import function

In [2]:
# Version tag of this notebook.
__version__ = "V0.0(Editor) 2023-10-16"

In [3]:
# Preset parameters (debug configuration)
COUNTSCUTOFF = 2
ABSOLUTEPATH = False  # when False, the path-completion cell prefixes the locations below with FILEPATH
FILEPATH = "F:/OneDrive/Master/Project/trans/data/"  # NOTE(review): machine-specific absolute path
RAWDATAPATH = "raw_data/"
INPUTTOTAL = "10001_total.pkl"  # pickle file read into `total`
INPUTREFANNOTATION = "Homo_sapiens.GRCh38.109.chr.gtf.gz"
OUTPUTTOTAL = "10002_total.pkl"

In [4]:
# Complete the paths: unless absolute paths were supplied, prefix every location with FILEPATH.
# The annotation file is looked up in the "annotation/" sub-folder of the completed RAWDATAPATH.
# NOTE: the constants are rebound from their own values, so running this cell twice in the
# same kernel prefixes them twice -- re-run the parameter cell before re-running this one.
if not ABSOLUTEPATH:
    RAWDATAPATH = "{}{}".format(FILEPATH, RAWDATAPATH)
    INPUTTOTAL = "{}{}".format(FILEPATH, INPUTTOTAL)
    INPUTREFANNOTATION = "{}annotation/{}".format(RAWDATAPATH, INPUTREFANNOTATION)
    OUTPUTTOTAL = "{}{}".format(FILEPATH, OUTPUTTOTAL)

In [5]:
# Tunable constants
EXONRANGE = 5  # passed as rangeMax to findId when exon start/end positions are merged
RANGETSS = 50  # not referenced in the visible cells
RANGETES = 150  # not referenced in the visible cells

In [6]:
# 一级函数
def findId(index, chr, location, rangeMax, orientation, allowType, total=None):
    '''
    Collect the ids reachable from `location` by hopping between indexed positions.

    Positions are scanned one by one, starting next to `location` (which itself is
    excluded) and moving in the direction given by `orientation`. Whenever a scanned
    position is a key of `index[chr]`, its ids are collected and the search window
    restarts from that position. A chain of positions that are each at most
    `rangeMax` apart is therefore followed to its end.

    input:
        index, dict, {<chr>: {<location>: {<location>: <id>, ...}, ...}, ...}
               (for allowType "transcript" the innermost values are lists of ids)
        chr, str
        location, int
        rangeMax, int, largest allowed gap between two consecutive positions
        orientation, str, '-' scans towards smaller positions, any other value
                     scans towards larger positions
        allowType, str, "gene" or "transcript" or "exon"
        total, Total object or None; when given, the mean of cellLineExpression
               over total.celllineInfo is reported for "exon" and "gene" ids
    return:
        list of (<id>, <mean expression or None>) tuples, without duplicates and
        in no particular order
    raise:
        KeyError, if rangeMax > 0 and chr is not a key of index
    '''
    # Nothing to scan: same result as the empty range of the recursive version.
    if rangeMax <= 0:
        return []

    positions = index[chr]
    step = -1 if orientation == '-' else 1
    found = set()  # {(<id>, <mean expression in all cellLine>), ...}

    def meanExpression(featureId):
        # Mean expression of one exon/gene over every cell line known to total.
        if total is None:
            return None
        records = total.exonDict if allowType == "exon" else total.geneDict
        return numpy.mean([records[featureId].cellLineExpression[cellLine] for cellLine in total.celllineInfo.keys()])

    # A single linear walk replaces the former recursion: each hit simply resets
    # the number of positions that may still be inspected, so every position is
    # visited once and no recursion depth is consumed.
    position = location
    remaining = rangeMax
    while remaining > 0:
        position += step
        remaining -= 1
        bucket = positions.get(position)
        if bucket is None:
            # No feature is indexed at this position, try the next one.
            continue
        # A position inside the window was found: restart the window from here.
        remaining = rangeMax
        if allowType == "exon" or allowType == "gene":
            # The values of the bucket are single ids (str), not lists.
            for featureId in bucket.values():
                found.add((featureId, meanExpression(featureId)))
        else:
            # TODO (kept from the original): transcripts carry no expression value.
            # The values of the bucket are lists of ids, not single ids.
            for idList in bucket.values():
                for featureId in idList:
                    found.add((featureId, None))

    return list(found)
    

In [7]:
# 一级函数
def loadRefAnnotation(filename, allowType):
    '''
    Read the features of one type from a gzip-compressed reference annotation
    and index them by their transcription start (TSS) and end (TES) positions.

    input:
        filename, str, path of the gzip-compressed reference annotation file
        allowType, str, "transcript" or "gene" or "exon"
    return:
        dict, {"positionRef": {"TSS": {<chr>: {<TSS>: {<TES>: <ids>, ...}, ...}, ...},
                               "TES": {<chr>: {<TES>: {<TSS>: <ids>, ...}, ...}, ...}},
               "dataRef": {<id>: {<key>: <value>, ...}, ...}}
        For "transcript", <ids> is the list of all transcript ids sharing the
        TSS/TES pair. For "gene" and "exon", <ids> is a single id: the first
        feature of dataRef with that pair, later ones are not indexed.
        On the '+' strand TSS/TES are start/end, otherwise end/start.
    raise:
        ValueError, if allowType is not one of the three supported values
    '''
    # Reject unsupported types before the file is opened.
    if allowType not in ("transcript", "gene", "exon"):
        raise ValueError("the allowType should be 'transcript', 'gene' or 'exon', but it is {!r}".format(allowType))

    # function.pickRefAnnotation is handed the type wrapped in a one-element list.
    typeFilter = [allowType]
    dataRef = {}

    # Read the annotation line by line
    with gzip.open(filename, 'rt') as file:
        for rawLine in file:
            fields = function.pickRefAnnotation(info=rawLine, allowType=typeFilter)
            if fields is None:
                # pickRefAnnotation returned nothing for this line: skip it
                continue

            # Type-specific part of the record; ids are stored without the
            # version part (first element returned by function.strSplit).
            if allowType == "transcript":
                featureId = function.strSplit(fields["transcript_id"])[0]
                record = {"geneId": fields["gene_id"],
                          "transcriptId": featureId,
                          "transcriptVersion": fields["transcript_version"]}
            elif allowType == "gene":
                featureId = function.strSplit(fields["gene_id"])[0]
                record = {"geneId": featureId,
                          "geneVersion": fields["gene_version"]}
            else:
                # strSplit has to return exactly two parts here, as before
                featureId, _ = function.strSplit(fields["exon_id"])
                record = {"geneId": fields["gene_id"],
                          "transcriptId": fields["transcript_id"],
                          "exonId": featureId,
                          "exonVersion": fields["exon_version"]}

            # Part shared by all feature types
            strand = fields["strand"]
            chromosome = "chrM" if fields["chr"] == "MT" else "chr{}".format(fields["chr"])
            start = int(fields["start"])
            end = int(fields["end"])
            # Decide which coordinate is the transcription start / end
            if strand == '+':
                TSS, TES = start, end
            else:
                TSS, TES = end, start
            record.update({"strand": strand,
                           "chr": chromosome,
                           "start": start,
                           "end": end,
                           "TSS": TSS,
                           "TES": TES})
            # A repeated id overwrites the earlier record
            dataRef[featureId] = record

    # Build the position index from dataRef
    positionRef = {"TSS": {}, "TES": {}}
    for featureId, record in dataRef.items():
        chromosome = record["chr"]
        TSS = record["TSS"]
        TES = record["TES"]
        byTSS = positionRef["TSS"].setdefault(chromosome, {}).setdefault(TSS, {})
        byTES = positionRef["TES"].setdefault(chromosome, {}).setdefault(TES, {})
        if allowType == "transcript":
            # Every transcript sharing the TSS/TES pair is kept
            byTSS.setdefault(TES, []).append(featureId)
            byTES.setdefault(TSS, []).append(featureId)
        else:
            # Only the first gene/exon of a TSS/TES pair is kept
            byTSS.setdefault(TES, featureId)
            byTES.setdefault(TSS, featureId)

    return {"positionRef": positionRef, "dataRef": dataRef}
        

In [8]:
# Load the pickled `total` object from INPUTTOTAL (presumably a Total instance from Class -- confirm).
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load files you produced yourself.
with open(INPUTTOTAL, "rb") as file:
    total = pickle.load(file)

In [9]:
# Refresh exonDict and exonIndex of total
total.refresh()
total.reIndex()

In [10]:
# Declare variables
# NOTE(review): the disabled correction code further down writes to the keys "start-end", "start"
# and "end", which do not exist in this dict -- align the keys before re-enabling that code.
exonCorrected = {"mapped": {}, "TSS-TES": {}, "TSS": {}, "TES": {}}  # {"": {<existed exon id>: <corrected exon id>}, ...}
readyCorList = []  # only used by the disabled correction code

'''# 根据end建立index
endExonIndex = {}  # {<chr>: {end: {start: exonId}}, ...}
for exonId, exonObject in total.exonDict.items():
    chr = exonObject.chr
    start = exonObject.start
    end = exonObject.end
    if chr not in endExonIndex.keys():
        endExonIndex[chr] = {end: {start: exonId}}
    else:
        if end not in endExonIndex[chr].keys():
            endExonIndex[chr][end] = {start: exonId}
        else:
            if start not in endExonIndex[chr][end].keys():
                endExonIndex[chr][end][start] = exonId'''

In [10]:
# Deprecated: no longer needed because the exons are already corrected against the reference annotation
'''
# 处理total中的exon数据
# 遍历total.exonDict中所有.status=="NOVEL"的exon, 记录这些exon
for exonId, exonObject in total.exonDict.items():
    if exonObject.status != "NOVEL":
        continue
    else:
        readyCorList.append(exonId)

# debug TODO del
print("校正前total exon num:", len(total.exonDict))  # debug TODO del
print("校正前novel exon num:", len(readyCorList))  # debug TODO del
num = 0  # debug TODO del
for geneId, geneObject in total.geneDict.items():  # debug TODO del
    for transcriptId in geneObject.transcriptDict.keys():  # debug TODO del
        num += 1  # debug TODO del
print("校正前total transcript num:", num)  # debug TODO del
num = 0  # debug TODO del
for geneId, geneObject in total.geneDict.items():  # debug TODO del
    for transcriptObject in geneObject.transcriptDict.values():  # debug TODO del
        if transcriptObject.status == "NOVEL":  # debug TODO del
            num +=1  # debug TODO del
print("校正前novel transcript num:", num)  # debug TODO del

for exonId in readyCorList:
    exonObject = total.exonDict[exonId]
    chr = exonObject.chr
    start = exonObject.start
    end = exonObject.end

    # 声明list类型变量exonList
    exonList = []  # 存储可进行映射的exon [(<exonId>, <dif>, <startMarker>, <endMarker>), ...]

    # 准备校正exon的start及end
    # 获取[start-5, start+5]范围的所有exon
    temp = [total.exonIndex[chr].get(i, None) for i in range(start-EXONRANGE, start+EXONRANGE+1)]
    # 去除空值
    while None in temp:
        temp.remove(None)
    # 去除自身
    for i in range(0, len(temp)):
        d = temp[i]
        if exonId not in d.values():
            continue
        temp[i].pop(end)
    while {} in temp:
        temp.remove({})
        pass
    for d in temp:
        for tempEnd, tempExonId in d.items():
            if total.exonDict[tempExonId].status == "NOVEL":
                continue
            if abs(tempEnd-end) <= EXONRANGE:
                # 从中筛选出[end-5, end+5]范围的所有exon, 认为这些exon为可双端映射的exon
                num = abs(start-total.exonDict[tempExonId].start) + abs(end-tempEnd)
                exonList.append((tempExonId, num, True, True))
            else:
                # 仅start可映射到tempExon
                num = abs(start-total.exonDict[tempExonId].start)
                exonList.append((tempExonId, num, True, False))

    # 获取[end-5, end+5]范围的所有exon
    temp = [endExonIndex[chr].get(i, None) for i in range(end-EXONRANGE, end+EXONRANGE+1)]
    # 去除空值
    while None in temp:
        temp.remove(None)
    # 去除自身
    for i in range(0, len(temp)):
        d = temp[i]
        if exonId not in d.values():
            continue
        temp[i].pop(start)
    while {} in temp:
        temp.remove({})
    for d in temp:
        for tempStart, tempExonId in d.items():
            if total.exonDict[tempExonId].status == "NOVEL":
                continue
            # 从中筛选出(-oo, start-5)U(start+5, +oo)的所有exon, 认为这些exon为仅可映射到end的exon
            if abs(tempStart-start) > EXONRANGE:
                # 仅end可映射到tempExon
                num = abs(end-total.exonDict[tempExonId].end)
                exonList.append((tempExonId, num, False, True))
    
    # 不分析无法映射的exon
    if len(exonList) == 0:
        continue
    
    # 校正过程
    mode = False  # 是否存在最佳映射
    for (newExonId, num, startMarker, endMarker) in exonList:
        if startMarker == endMarker == True:
            mode = True
            break
    # 开始映射
    if mode is True:
        # 若exon可双端映射到一个exon
        # 获取相差碱基数最少的exon
        temp = []
        for (newExonId, num, startMarker, endMarker) in exonList:
            if startMarker == endMarker == True:
                temp.append((newExonId, num))
        temp = sorted(temp, key=lambda x: x[1])
        (newExonId, num) = temp[0]
        # 在total.exonExisted中添加该映射
        total._exonExistedAdd(oldExonId=exonId, newExonId=newExonId)
        # 删除total.exonDict中该exon对象
        total.exonDict.pop(exonId)
        # 将映射关系{<exonId>: <newExonId>}保存到exonCorrected["mapped"]中
        exonCorrected["mapped"][exonId] = newExonId
    else:
        # 若exon不可双端映射到一个exon
        # 修改exon的status为CORRECTED
        total.exonDict[exonId].status = "CORRECTED"
        # 判断exon能否双端映射到两个exon
        marker=False  # 标记该exon能否映射到两个exon, 1-可根据start映射 2-可根据end映射 3-可分别根据start,end映射
        for (newExonId, num, startMarker, endMarker) in exonList:
            if startMarker is True:
                marker = 3 if marker==2 else 1
            else:
                marker = 3 if marker==1 else 2
        if marker == 3:
            # 能映射到两个exon
            # 在exon的status中添加-start-end后缀
            total.exonDict[exonId].status = total.exonDict[exonId].status + "-start-end"
            # 获取start端相差碱基数最少的exon
            temp = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if startMarker is True]
            temp = sorted(temp, key=lambda x: x[1])
            (newExonId, num, startMarker, endMarker) = temp[0]
            # 修改exon的start的位置
            total.exonDict[exonId].start = total.exonDict[newExonId].start
            # 将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start-end"]中
            exonCorrected["start-end"][exonId] = {"start": newExonId, "end": None}
            # 获取end端相差碱基数最少的exon
            temp = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if endMarker is True]
            temp = sorted(temp, key=lambda x: x[1])
            (newExonId, num, startMarker, endMarker) = temp[0]
            # 修改exon的end的位置
            total.exonDict[exonId].end = total.exonDict[newExonId].end
            # 将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start-end"]中
            exonCorrected["start-end"][exonId]["end"] = newExonId
        else:
            # 仅能单端映射到一个exon
            # 筛选出相差碱基数最少的exon进行映射
            exonList = sorted(exonList, key=lambda x: x[1])
            tempNum = exonList[0][1]
            exonList = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if num == tempNum]
            (newExonId, num, startMarker, endMarker) = exonList[0]
            # 判断哪一端可进行映射
            if startMarker is True:
                # start端可映射
                # 在exon的status中添加-start后缀
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-start"
                # 修改exon的start的位置
                total.exonDict[exonId].start = total.exonDict[newExonId].start
                # 将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start"]中
                exonCorrected["start"][exonId] = newExonId
            else:
                # end端可映射
                # 在exon的status中添加-end后缀
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-end"
                # 修改exon的end的位置
                total.exonDict[exonId].end = total.exonDict[newExonId].end
                # 将映射关系{<exonId>: <newExonId>}保存到exonCorrected["end"]中
                exonCorrected["end"][exonId] = newExonId
'''

校正前total exon num: 323363
校正前novel exon num: 133311
校正前total transcript num: 213387
校正前novel transcript num: 185981


In [13]:
# Refresh exonDict and exonIndex of total
total.refresh()
total.reIndex()

136992422
30286690
47980559
64498254
64498254
70269189
43996528
118896136
1815248


In [11]:
# Compute the relative expression of exons, transcripts and genes
total.computeRelativeExpression()
total.computeCellLineExpression()

In [13]:
# Merge neighbouring exons that are not KNOWN: boundaries (start or end) lying within
# EXONRANGE of each other, directly or through a chain of intermediate boundaries,
# are moved onto the boundary of the most highly expressed exon of the group.
if True:
    # Both indexes are built before any coordinate is changed, so the end pass
    # still finds every exon under its original start.
    startIndex = {}  # {<chr>: {<start>: {<end>: <exonId>, ...}, ...}, ...}
    endIndex = {}  # {<chr>: {<end>: {<start>: <exonId>, ...}, ...}, ...}
    # (boundary attribute, opposite attribute, index keyed by the boundary)
    boundaryPasses = (("start", "end", startIndex), ("end", "start", endIndex))

    for exonId, exonObject in total.exonDict.items():
        if exonObject.status == "KNOWN":
            continue
        for boundary, opposite, boundaryIndex in boundaryPasses:
            # A "-start"/"-end" marker in the status excludes that boundary from merging
            if "-" + boundary in exonObject.status:
                continue
            slot = boundaryIndex.setdefault(exonObject.chr, {}).setdefault(getattr(exonObject, boundary), {})
            oppositePosition = getattr(exonObject, opposite)
            if oppositePosition in slot:
                # Another exon with the same start and end is already indexed
                print("[Warning]{}Index--exonId: {}".format(boundary, exonId))
            else:
                slot[oppositePosition] = exonId

    # The start boundaries are processed first, then the end boundaries
    for boundary, opposite, boundaryIndex in boundaryPasses:
        # Exons whose boundary may be adjusted
        exonSet = {candidateId for candidateId, candidate in total.exonDict.items()
                   if candidate.status != "KNOWN" and "-" + boundary not in candidate.status}
        # Take an arbitrary exon and gather every exon whose boundary is chained to it
        while exonSet:
            seedId = exonSet.pop()
            seedObject = total.exonDict[seedId]
            chrom = seedObject.chr
            position = getattr(seedObject, boundary)
            group = set()  # {(<exonId>, <expression>), ...}
            # Exons sharing exactly the same boundary position
            for exonId in boundaryIndex[chrom][position].values():
                group.add((exonId, numpy.mean([total.exonDict[exonId].cellLineExpression[cellLine] for cellLine in total.celllineInfo.keys()])))
            # Exons found while walking downwards and upwards from the position
            for orientation in ('-', '+'):
                group.update(findId(total=total, index=boundaryIndex, chr=chrom, location=position, rangeMax=EXONRANGE, orientation=orientation, allowType="exon"))
            if len(group) == 1:
                # No other exon nearby: this boundary cannot be adjusted
                continue
            # The exon with the highest expression is the reference. max() always
            # yields a member, even when every expression is 0, and ties are
            # broken by exon id so the choice does not depend on set ordering.
            targetId = max(group, key=lambda member: (member[1], member[0]))[0]
            targetPosition = getattr(total.exonDict[targetId], boundary)
            for exonId, expression in group:
                setattr(total.exonDict[exonId], boundary, targetPosition)
                # The status is deliberately left untouched: the boundary was not
                # mapped onto an annotated exon, so it must not be marked CORRECTED.
                exonSet.discard(exonId)

# Update the exon mapping and merge duplicated genes, transcripts and exons
total.refresh()
total.reIndex()

In [17]:
# Record summary statistics of the correction.
# infoDict is not created in any visible cell: reuse it when an earlier cell already
# defined it, otherwise start from an empty dict so a fresh "Run All" does not stop here.
try:
    infoDict
except NameError:
    infoDict = {}

# Number of exons recorded per correction category
for category in ("mapped", "TSS-TES", "TSS", "TES"):
    infoDict[category] = len(exonCorrected[category])

# Exon counts after the correction
infoDict["total exon num after correct"] = len(total.exonDict)
infoDict["novel exon num after correct"] = sum(
    1 for exonObject in total.exonDict.values() if exonObject.status == "NOVEL"
)

# Transcript counts after the correction, summed over all genes
infoDict["total transcript num after correct"] = sum(
    len(geneObject.transcriptDict) for geneObject in total.geneDict.values()
)
infoDict["novel transcript num after correct"] = sum(
    1
    for geneObject in total.geneDict.values()
    for transcriptObject in geneObject.transcriptDict.values()
    if transcriptObject.status == "NOVEL"
)

In [18]:
infoDict

{'total exon num before correct': 323363,
 'novel exon num before correct': 133311,
 'total transcript num before correct': 213375,
 'novel transcript num before correct': 185969,
 'mapped': 9609,
 'TSS-TES': 810,
 'TSS': 417,
 'TES': 2187,
 'total exon num after correct': 315773,
 'novel exon num after correct': 68416,
 'total transcript num after correct': 211346,
 'novel transcript num after correct': 183942}

---

start

In [None]:
'''
根据参考基因组进行校正的思路
0. 处理exon
    0.0 声明变量
        exonCorrected, dict, {"mapped": {<novel exon id>: <known exon id>, ...},
                              "TSS": {<novel exon id>: <known exon id>, ...},
                              "TES": {<novel exon id>: <known exon id>, ...},
                              "TSS-TES": {<novel exon id>: {"TSS": <known exon id>, "TES": <known exon id>}, ...}}
        readyCorList, list, 存储status为NOVEL的exonId[<exonId>, ...]
    0.1 引用变量
        EXONRANGE, int, exon的start及end允许的偏差范围
    0.2 根据end建立索引
    0.3 根据参考基因组中exon数据对exon的TSS及TES进行校正
        因为三代测序中出现的错误绝大部分属于碱基的插入/缺失, 所以需要对exon的TSS/TES进行校正
        读取参考基因组注释中exon的数据, 并建立index
        统计校正前exon及transcript的数量
        遍历total.exonDict中所有exon对象, 判断每个exon上下游exonRange范围内是否存在参考exon
        声明list类型变量exonList, 存储该exon所能映射的所有参考exon, [[<exonRef>, <bool>, <bool>], ...]
        准备校正exon的TSS及TES
            获取[TSS-5, TSS+5]范围的所有exon
                从中筛选出[TES-5, TES+5]范围的所有exon, 认为这些exon为可双端映射的exon
                未筛选出的exon则认为是仅可映射到TSS的exon
            获取[TES-5, TES+5]范围的所有exon
                从中筛选出(-oo, TSS-5)U(TSS+5, +oo)的所有exon, 认为这些exon为仅可映射到TES的exon
            在获取的所有exon中
                优先筛选出可双端映射的exon, 并在这些exon中选择总体距离最近的exon进行映射
                其次才筛选出仅可单端映射的exon, 并在这些exon中选择离单端距离最近的exon进行映射
        校正过程
            若不存在映射, 则跳过该exon, 分析下一个exon
            若存在双端映射exon
                获取相差碱基数最少的exon作为参考exon
                判断exonId是否相同
                若exonId相同, 则修改status, exonVersion, start, TSS, end, TES
                若exonId不同, 则查询参考exonId是否已记录
                    若参考exonId已记录, 则合并两exon
                    若参考exonId未记录
                        exonDict中新增参考exon对象
                        合并原exon与参考exon
                    将这种映射关系添加到exonCorrected["mapped"]中
            若仅存在单端映射exon
                如果status为KNOWN就不再进行映射, 跳过该exon
                修改exon的status为CORRECTED
                判断exon能否双端映射到两个exon
                    能映射到两个exon
                        在exon的status中添加-start-end后缀
                        获取TSS端相差碱基数最少的exon
                        修改exon的start,end,TSS的位置
                        获取TES端相差碱基数最少的exon
                        修改exon的start,end,TES的位置
                        将映射关系{<exonId>: <newExonId>}保存到exonCorrected["TSS-TES"]中
                    不能映射到两个exon, 则进行下一步
                筛选出相差碱基数最少的exon进行映射
                若仅TSS端可映射
                    在exon的status中添加-start后缀
                    修改exon的start,end,TSS的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["TSS"]中
                若仅TES端可映射
                    在exon的status中添加-end后缀
                    修改exon的start,end,TES的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["TES"]中
    已弃置 因为已根据参考基因组进行了校正
        0.3 处理total中的exon数据
        遍历total.exonDict中所有.status=="NOVEL"的exon, 记录这些exon
        声明list类型变量exonList, 存储该exon所能映射的所有参考exon, [[<exonRef>, <bool>, <bool>], ...]
        准备校正exon的start及end
            获取[start-5, start+5]范围的所有exon
                从中筛选出[end-5, end+5]范围的所有exon, 认为这些exon为可双端映射的exon
                未筛选出的exon则认为是仅可映射到start的exon
            获取[end-5, end+5]范围的所有exon
                从中筛选出(-oo, start-5)U(start+5, +oo)的所有exon, 认为这些exon为仅可映射到end的exon
            在获取的所有exon中
                优先筛选出可双端映射的exon, 并在这些exon中选择总体距离最近的exon进行映射
                其次才筛选出仅可单端映射的exon, 并在这些exon中选择离单端距离最近的exon进行映射
        校正过程
            若exon可双端映射到一个exon
                获取相差碱基数最少的exon
                在total.exonExisted中添加该映射
                删除total.exonDict中该exon对象
                将映射关系{<exonId>: <newExonId>}保存到exonCorrected["mapped"]中
            若exon不可双端映射到一个exon
                修改exon的status为CORRECTED
                判断exon能否双端映射到两个exon
                    能映射到两个exon
                        在exon的status中添加-start-end后缀
                        获取start端相差碱基数最少的exon
                        修改exon的start的位置
                        获取end端相差碱基数最少的exon
                        修改exon的end的位置
                        将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start-end"]中
                    不能映射到两个exon, 则进行下一步
                筛选出相差碱基数最少的exon进行映射
                若仅start端可映射
                    在exon的status中添加-start后缀
                    修改exon的start的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["start"]中
                若仅end端可映射
                    在exon的status中添加-end后缀
                    修改exon的end的位置
                    将映射关系{<exonId>: <newExonId>}保存到exonCorrected["end"]中
    0.4 更新total中的exonDict及exonIndex
        更新transcriptDict以合并可能重复的transcript(因为exon组成可能重复)
        重建Index, 合并重复的exon及transcript
    0.5 合并相邻的exon, 即合并相差EXONRANGE范围内的NOVEL的exon
        情况说明
            在合并距离较近的exon时可能会出现这么一种情况
            当EXONRANGE=5时, 200可以合并到205, 205又可以合并到210, 所以200就可以合并到210
            因此需要对这种情况进行处理, 而不是简单的将两exon根据EXONRANGE进行合并
            所以需要先形成一个簇, 然后对这个簇中所有的exon的start或end进行校正
        建立index
            声明startIndex, {<chr>: {<start>: {<end>: <exonId>, ...}, ...}, ...}
            声明endIndex, {<chr>: {<end>: {<start>: <exonId>, ...}, ...}, ...}
            遍历total.exonDict
                过滤掉所有status==KNOWN的exon
                若status中不含-start, 则在startIndex中添加该exon信息
                若status中不含-end, 则在endIndex中添加该exon信息
                否则, continue
        遍历total.exonDict中所有status不为KNOWN的exon
            处理start
                声明set变量exonSet, {<exonId>, ...}
                遍历所有exon, 去除KNOWN或start已校正的exon, 将剩余的exon添加到exonSet
                只要exonSet不为空集, 就随机提取exonSet中的一个exonId
                    temp中添加所有与该exon的start相同的exon
                    寻找其start上下游各EXONRANGE范围内的exon
                    若寻找不到其他的exon, 就认为该exon的start无法修改, 跳过该exon
                    若能寻找到其他的exon, 就进行下一步
                以expression最大的exon为参照, 修改其他exon的start
                注意, 这些exon的status不能修改为CORRECTED
                删除exonSet中的这些exon
                示例
                    权重: 相对表达量
                        假设有7个exon, 从start=100开始寻找EXONRANGE=5范围的exonStart
                        exon0   95  200
                        exon1   99  201
                        exon2   99  200
                        exon3   100 200
                        exon4   104 204
                        exon5   106 206
                        exon6   107 210
                        exon7   110 210
                    分别递增, 递减寻找EXONRANGE范围内的exon
                        当递减寻找[100-5, 100)的start时, 寻找到了99
                            temp中添加[exon1, expression]
                            temp中添加[exon2, expression]
                            此时再根据99去递减寻找[99-5, 99)范围的start, 寻找到了95
                                temp中添加[exon0, expression]
                            此时再根据95去递减寻找[95-5, 95)范围的start, 寻找到了None
                                当寻找到None时, 该寻找过程终止
                        当递增寻找(100, 100+5]的start时, 寻找到了104
                            temp中添加[exon4, expression]
                            此时再根据104去递增寻找(104, 104+5]范围的start, 寻找到了106
                                temp中添加[exon5, expression]
                            此时再根据106去递增寻找(106, 106+5]范围的start, 寻找到了107
                                temp中添加[exon6, expression]
                            ...
                            此时再根据***去递增寻找(***, ***+5]范围的start, 寻找到了None
                                当寻找到None时, 寻找过程终止
                    当两个寻找过程都停止后, 则开始对temp中的exon的start进行校正
                        筛选temp中expression最大的exon, 作为参考exon
                        其他exon的start都将修改为参考exon的start
            处理end
                声明list变量temp, [(<exonId>, <expression>), ...]
                权重: 相对表达量
                分别递增, 递减寻找EXONRANGE范围内的exon, 在temp中添加[exon, expression]
                当两个寻找过程都停止后, 则开始对temp中的exon的end进行校正
                    与start处理过程相似
    0.6 在所有exon都处理完后
        更新transcriptDict以合并可能重复的transcript(因为exon组成可能重复)
        重建Index
1. 处理transcript
    1.0 声明变量
        transcriptSet, set, 用于存储NOVEL类型的transcript {<transcriptId>, ...}
    1.1 读取参考基因组注释数据
        存储于变量transcriptRef, {"TSS": {<chr>: {<TSS>: {<TES>: [<transcriptId>, ...],
                                                         ...},
                                                 ...},
                                         ...},
                                 "TES": {<chr>: {<TES>: {<TSS>: [<transcriptId>, ...],
                                                         ...},
                                                 ...},
                                         ...}
                                 }
    1.2 遍历所有transcript, 筛选出status为NOVEL的transcript至变量transcriptSet中
    1.3 提取transcriptSet中的transcript, 寻找其可映射的transcript
        根据TSS获取其上下游rangeTSS范围内的transcript
            在这些transcript中, 筛选TES上下游在rangeTES范围内的transcript, 认为这些transcript是可双端映射的transcript
            剩余的transcript则被认为是仅可校正TSS端的transcript
        根据TES获取其上下游rangeTES范围内的transcript
            在这些transcript中, 筛选TSS上下游不在rangeTSS范围内的transcript, 认为这些transcript是仅可校正TES端的transcript
        开始映射
            判断是否存在映射, 若不存在映射则跳过该transcript, 若存在映射则进行下一步
            优先筛选出双端映射的transcript
                筛选所有双端映射transcript
                    最优先筛选出transcriptId相同的transcript作为参考transcript
                    其次才筛选出相差碱基数最少的transcript作为参考transcript
                检查参考transcript是否已记录
                    若参考transcript已记录, 则直接合并两transcript
                    若参考transcript未记录, 则进行下一步
                检查参考transcript的
                geneObject.transcriptDict中新增相同的transcript对象, 只不过transcriptId为参考transcriptId
                删除原transcript对象
                _transcriptExistedAdd()添加该映射
                修改新transcript对象的基础属性
                    修改status为CORRECTED-start-end
                    修改start为参考transcript的start
                    修改end为参考transcript的end
                    修改transcriptId
                    
            若无双端映射的transcript, 则进行单端校正

    1.4 对于未校正及未完全校正(start或end有一端未能校正)的transcript的未校正端, 尝试检索附近的端并合并
2. 处理gene
3. 重新映射exon并重建index
4. 重新计算relativeExpression及cellLineExpression
'''

'\n根据参考基因组进行校正的思路\n0. 处理exon\n    0.0 声明变量\n        exonRef, dict, 存储注释中的exon信息{<exonId>: {"TSS": <TSS>, "TES": <TES>}}\n        exonRefIndex, dict, 注释中exon的索引{<chr>: {"TSS": {<TSS>: {<TES>: <exonId>,\n                                                                    ...},\n                                                            ...},\n                                                    "TES": {<TES>: {<TSS>: <exonId>,\n                                                                    ...},\n                                                            ...},\n                                            ...}\n        exonCorrected, dict, {<existed exon id>: <corrected exon id>}\n    0.1 引用变量\n        exonCorrectedTSS, int, 若 |exonTSS - 参考exonTSS| <= exonCorrectedTSS, 则认为该exon的TSS可以校正到参考exon的TSS\n        exonCorrectedTES, int, 若 |exonTES - 参考exonTES| <= exonCorrectedTES, 则认为该exon的TES可以校正到参考exon的TES\n    0.2 处理基因组注释数据\n        提取基因组注释中所有exon对应的exonId, strand, start, end\n           

In [12]:
# Load the exon records of the reference annotation together with their positional index.
refAnnotation = loadRefAnnotation(filename=INPUTREFANNOTATION, allowType="exon")
exonRef, exonIndex = refAnnotation["dataRef"], refAnnotation["positionRef"]

# Queue every exon currently stored in total.exonDict for correction.
readyCorList.extend(total.exonDict.keys())

# Snapshot the exon / transcript counts before any correction takes place.
infoDict = {
    "total exon num before correct": len(total.exonDict),
    "novel exon num before correct": sum(
        1 for exonObject in total.exonDict.values() if exonObject.status == "NOVEL"
    ),
    "total transcript num before correct": sum(
        len(geneObject.transcriptDict) for geneObject in total.geneDict.values()
    ),
    "novel transcript num before correct": sum(
        1
        for geneObject in total.geneDict.values()
        for transcriptObject in geneObject.transcriptDict.values()
        if transcriptObject.status == "NOVEL"
    ),
}

# Correct the TSS and TES of every queued exon against the reference annotation.
# For each exon, reference exons whose TSS and/or TES lie within EXONRANGE are
# collected; a reference exon matching both ends is preferred (the exon is merged
# into it), otherwise one or both ends are snapped individually onto the closest
# reference coordinate and the exon is flagged CORRECTED with a suffix.
for exonId in readyCorList:
    exonObject = total.exonDict[exonId]
    chr = exonObject.chr
    TSS = exonObject.TSS
    TES = exonObject.TES

    # Declare the candidate list
    exonList = []  # mappable reference exons: [(<exonId>, <dif>, <startMarker>, <endMarker>), ...]

    # Prepare the TSS / TES correction
    # Fetch every reference exon whose TSS lies in [TSS-EXONRANGE, TSS+EXONRANGE]
    temp = [exonIndex["TSS"][chr].get(i, None) for i in range(TSS-EXONRANGE, TSS+EXONRANGE+1)]
    # Drop empty lookups (missing positions and empty dicts)
    while None in temp:
        temp.remove(None)
    while {} in temp:
        temp.remove({})
        pass
    # Each remaining entry maps <reference TES> -> <reference exonId> for one TSS position
    for d in temp:
        for tempTES, tempExonId in d.items():
            if abs(tempTES-TES) <= EXONRANGE:
                # TES also lies within EXONRANGE: this reference exon matches both ends.
                # dif = summed distance over both ends
                num = abs(TSS-exonRef[tempExonId]["TSS"]) + abs(TES-tempTES)
                exonList.append((tempExonId, num, True, True))
            else:
                # Only the TSS can be mapped onto tempExon; dif = TSS distance
                num = abs(TSS-exonRef[tempExonId]["TSS"])
                exonList.append((tempExonId, num, True, False))

    # Fetch every reference exon whose TES lies in [TES-EXONRANGE, TES+EXONRANGE]
    temp = [exonIndex["TES"][chr].get(i, None) for i in range(TES-EXONRANGE, TES+EXONRANGE+1)]
    # Drop empty lookups
    while None in temp:
        temp.remove(None)
    while {} in temp:
        temp.remove({})
    # Each remaining entry maps <reference TSS> -> <reference exonId> for one TES position
    for d in temp:
        for tempTSS, tempExonId in d.items():
            # Keep only exons whose TSS lies outside [TSS-EXONRANGE, TSS+EXONRANGE];
            # those inside were already recorded as double-end matches above.
            if abs(tempTSS-TSS) > EXONRANGE:
                # Only the TES can be mapped onto tempExon; dif = TES distance
                num = abs(TES-exonRef[tempExonId]["TES"])
                exonList.append((tempExonId, num, False, True))

    # Exons without any candidate are not analysed further
    if len(exonList) == 0:
        continue

    # Correction
    mode = False  # whether a double-end (best) mapping exists
    for (newExonId, num, startMarker, endMarker) in exonList:
        if startMarker == endMarker == True:
            mode = True
            break
    # Start mapping
    if mode is True:
        # The exon maps onto a single reference exon at both ends.
        # Pick the candidate with the smallest total base difference.
        temp = []
        for (newExonId, num, TSSMarker, TESMarker) in exonList:
            if TSSMarker == TESMarker == True:
                temp.append((newExonId, num))
        temp = sorted(temp, key=lambda x: x[1])
        (newExonId, num) = temp[0]
        if num == 0:
            # Coordinates already identical: no start / end change is needed
            continue
        # Check whether the exon ids are identical
        if exonId == newExonId:
            # Same exon id:
            # overwrite status, exonVersion, start, end, TSS and TES with the reference values
            total.exonDict[exonId].status = "KNOWN"
            total.exonDict[exonId].exonVersion = exonRef[newExonId]["exonVersion"]
            total.exonDict[exonId].start = exonRef[newExonId]["start"]
            total.exonDict[exonId].end = exonRef[newExonId]["end"]
            total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
            total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
        else:
            # Different exon ids:
            # check whether the reference exon id is already recorded
            if newExonId in total.exonDict.keys():
                # Reference exon already recorded: merge the two exons.
                # NOTE(review): _exonMerge is defined on Total (not visible here);
                # presumably exonId2 is merged into exonId1 - confirm.
                total._exonMerge(exonId1=newExonId, exonId2=exonId)
            else:
                # Reference exon not recorded yet:
                # add a new Exon object built from the reference record to exonDict
                total.exonDict[newExonId] = Exon(status="KNOWN",
                                                 exonId=exonRef[newExonId]["exonId"],
                                                 exonVersion=exonRef[newExonId]["exonVersion"],
                                                 chr=exonRef[newExonId]["chr"],
                                                 strand=exonRef[newExonId]["strand"],
                                                 start=exonRef[newExonId]["start"],
                                                 end=exonRef[newExonId]["end"],
                                                 TSS=exonRef[newExonId]["TSS"],
                                                 TES=exonRef[newExonId]["TES"])
                # Merge the original exon with the reference exon
                total._exonMerge(exonId1=newExonId, exonId2=exonId)
            # Store the mapping {<exonId>: <newExonId>} in exonCorrected["mapped"]
            exonCorrected["mapped"][exonId] = newExonId
    else:
        # The exon cannot be mapped onto a single reference exon at both ends.
        # Exons already flagged KNOWN are not mapped any further: skip them
        if total.exonDict[exonId].status == "KNOWN":
            continue
        # Flag the exon as CORRECTED (this happens before the dif == 0 checks below)
        total.exonDict[exonId].status = "CORRECTED"
        # Check whether the two ends can be mapped onto two different reference exons
        marker=False  # 1 - mappable via start, 2 - mappable via end, 3 - mappable via start and end separately
        for (newExonId, num, startMarker, endMarker) in exonList:
            if startMarker is True:
                marker = 3 if marker==2 else 1
            else:
                marker = 3 if marker==1 else 2
            if marker == 3:
                break
        if marker == 3:
            # The two ends map onto two reference exons.
            # Pick the candidate with the smallest base difference on the TSS side
            temp = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if startMarker is True]
            temp = sorted(temp, key=lambda x: x[1])
            (newExonId, num, startMarker, endMarker) = temp[0]
            # Append the -TSS-TES suffix to the exon status
            total.exonDict[exonId].status = total.exonDict[exonId].status + "-TSS-TES"
            if num != 0:
                # The TSS needs to be changed.
                # On '+' the TSS coordinate is stored in start, otherwise in end
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                else:
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                # Record the TSS-side reference exon in exonCorrected["TSS-TES"]
                exonCorrected["TSS-TES"][exonId] = {"TSS": newExonId, "TES": None}
            # Pick the candidate with the smallest base difference on the TES side
            temp = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if endMarker is True]
            temp = sorted(temp, key=lambda x: x[1])
            (newExonId, num, startMarker, endMarker) = temp[0]
            if num != 0:
                # The TES needs to be changed.
                # On '+' the TES coordinate is stored in end, otherwise in start
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                else:
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                # Record the TES-side reference exon in exonCorrected["TSS-TES"],
                # creating the entry if the TSS side did not need a change
                if exonId not in exonCorrected["TSS-TES"].keys():
                    exonCorrected["TSS-TES"][exonId] = {"TSS": None, "TES": newExonId}
                else:
                    exonCorrected["TSS-TES"][exonId]["TES"] = newExonId
        else:
            # Only one end can be mapped onto a reference exon.
            # Keep the candidates with the smallest base difference and take the first one
            exonList = sorted(exonList, key=lambda x: x[1])
            tempNum = exonList[0][1]
            exonList = [(newExonId, num, startMarker, endMarker) for (newExonId, num, startMarker, endMarker) in exonList if num == tempNum]
            (newExonId, num, startMarker, endMarker) = exonList[0]
            if num == 0:
                # No TSS / TES change is needed (status stays the plain "CORRECTED" set above)
                continue
            # Determine which end can be mapped
            if startMarker is True:
                # The TSS end can be mapped:
                # append the -TSS suffix to the exon status
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-TSS"
                # Move the TSS; on '+' it is stored in start, otherwise in end
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                else:
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TSS = exonRef[newExonId]["TSS"]
                # Store the mapping {<exonId>: <newExonId>} in exonCorrected["TSS"]
                exonCorrected["TSS"][exonId] = newExonId
            else:
                # The TES end can be mapped:
                # append the -TES suffix to the exon status
                total.exonDict[exonId].status = total.exonDict[exonId].status + "-TES"
                # Move the TES; on '+' it is stored in end, otherwise in start
                if total.exonDict[exonId].strand == '+':
                    total.exonDict[exonId].end = exonRef[newExonId]["end"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                else:
                    total.exonDict[exonId].start = exonRef[newExonId]["start"]
                    total.exonDict[exonId].TES = exonRef[newExonId]["TES"]
                # Store the mapping {<exonId>: <newExonId>} in exonCorrected["TES"]
                exonCorrected["TES"][exonId] = newExonId


---

暂存部分

In [None]:
# First-level function
def buildIndex(total, allowType, siteType, statusSet=None):
    '''
    Build a positional lookup index of the form
    {<chr>: {<siteType position>: {<opposite position>: <id>, ...}, ...}, ...}.

    input:
        total, Total Object; only total.geneDict is read at the moment
        allowType, str, "gene" or "transcript" or "exon"
        siteType, str, "start" or "end" or "TSS" or "TES"; selects the outer
            position key, the paired boundary (start<->end, TSS<->TES) is the inner key
        statusSet, collection of str, only items whose status is contained in it
            are indexed; None selects the built-in default statuses
    return:
        dict, the index. It is empty for "transcript" and "exon", whose
        indexing is not implemented yet.
    raise:
        ValueError, if allowType is not one of the three supported names
    '''
    if statusSet is None:
        statusSet = ("KNOWN", "NOVEL", "CORRECTED-start", "CORRECTED-end", "CORRECTED-start-end", "CORRECTED-end-start")

    # Each site type is paired with the opposite boundary of the same feature
    mapped = {"start": "end", "end": "start", "TSS": "TES", "TES": "TSS"}
    index = {}

    if allowType == "gene":
        for geneId, geneObject in total.geneDict.items():
            # Skip items whose status is not requested
            if geneObject.status not in statusSet:
                continue
            data = {"start": geneObject.start, "end": geneObject.end, "TSS": geneObject.TSS, "TES": geneObject.TES}
            outerSite = data[siteType]
            innerSite = data[mapped[siteType]]
            # setdefault keeps the first id seen for a given (chr, outer, inner) triple
            index.setdefault(geneObject.chr, {}).setdefault(outerSite, {}).setdefault(innerSite, geneId)
    elif allowType == "transcript":
        # TODO: transcript indexing is not implemented yet
        pass
    elif allowType == "exon":
        # TODO: exon indexing is not implemented yet
        pass
    else:
        raise ValueError("buildIndex(): the allowType should be 'gene' or 'transcript' or 'exon', but it is {}".format(allowType))

    # The index used to be built and then discarded; hand it back to the caller
    return index

In [None]:
# 遍历total.exonDict中所有exon对象

In [None]:
# 遍历total.exonDict中所有exon对象


In [None]:
# Second-level function
def findIdAdvanced(index, chr, TSS, TES, rangeTSS, rangeTES, allowType):
    '''
    Look up ids in index that lie close to the given TSS.

    input:
        index, dict, forwarded unchanged to findId()
        chr, str
        TSS, int
        TES, int, currently unused (see TODO below)
        rangeTSS, int, maximum distance searched from TSS
        rangeTES, int, currently unused (see TODO below)
        allowType, str, "gene" or "transcript"
    return:
        whatever findId() returns for a search from TSS towards decreasing
        coordinates within rangeTSS

    Earlier revisions overwrote index / rangeTSS / rangeTES with the globals
    transcriptRef / RANGETSS / RANGETES and passed the undefined names
    `location` and `rangeMax` to findId(); the arguments are now honoured.
    NOTE(review): using TSS / rangeTSS for the search is the presumed intent - confirm.
    TODO: the TES-side search and the planned (id, markerTSS, markerTES) result
    are not implemented yet.
    '''
    return findId(index=index, chr=chr, location=TSS, rangeMax=rangeTSS, orientation='-', allowType=allowType)

In [32]:
# Correct start and end against the reference annotation.
# So far this cell only derives the strand-aware TSS / TES of every transcript.
for geneObject in total.geneDict.values():
    for transcriptObject in geneObject.transcriptDict.values():
        strand = transcriptObject.strand
        start = transcriptObject.start
        end = transcriptObject.end
        # On the '+' strand the TSS is the start and the TES is the end; otherwise
        # they are swapped. Compare by value (==): `is` tests object identity, which
        # is not guaranteed for equal strings and raises a SyntaxWarning on literals.
        (TSS, TES) = (start, end) if strand == '+' else (end, start)

        

{'TSS': {'1': {1471765: {1497848: ['ENST00000673477.1']},
   1478026: {1497848: ['ENST00000472194.6']},
   1479049: {1482662: ['ENST00000378736.3']},
   1483485: {1496202: ['ENST00000485748.5']},
   1484569: {1496201: ['ENST00000474481.1']},
   1471784: {1496201: ['ENST00000308647.8']},
   182696: {184174: ['ENST00000624431.2']},
   2581560: {2584533: ['ENST00000424215.1']},
   3069168: {3434342: ['ENST00000511072.5']},
   3069183: {3186591: ['ENST00000607632.1']},
   3069197: {3435421: ['ENST00000378391.6']},
   3069211: {3434342: ['ENST00000514189.5']},
   3069203: {3438621: ['ENST00000270722.10']},
   3237931: {3433925: ['ENST00000512462.5']},
   3244132: {3386918: ['ENST00000463591.1']},
   3396491: {3434293: ['ENST00000509860.1']},
   3424919: {3434095: ['ENST00000378389.5']},
   3425216: {3426072: ['ENST00000606170.1']},
   5307394: {5301928: ['ENST00000641871.1']},
   2412564: {2403964: ['ENST00000288774.8'], 2403974: ['ENST00000447513.7']},
   2412456: {2404061: ['ENST000006502

end

---

---

In [31]:
import plotly.graph_objects as go

# Pie chart of the exon correction outcome; the first slice is pulled out.
pieLabels = ["novel exon", "start-end corrected", "start corrected", "end corrected"]
pieValues = [43206, 5630, 15297, 10371]
pieColors = ["#505168", "#2A7F62", "#75DDDD", "#E8E1EF"]

trace = go.Pie(
    labels=pieLabels,
    values=pieValues,
    marker={"colors": pieColors, "line": {"color": "white", "width": 1}},
    sort=False,
    textinfo="value",
    textfont_size=12,
    insidetextorientation='auto',
    hole=0,
    pull=[0.1, 0, 0, 0],
)
fig = go.Figure(trace)

# Shared font settings; the title only differs in size.
baseFont = {"family": "Arial", "color": "black"}
layout = {
    "width": 500,
    "height": 500,
    "margin": {'l': 25, 'r': 25, 't': 25, 'b': 25},
    "title": {
        "text": "Correction of exon<br>Range: 20",
        "font": {**baseFont, "size": 16},
        "x": 0.4,
    },
    "font": {**baseFont, "size": 12},  # applies everywhere
    "paper_bgcolor": "white",
    "plot_bgcolor": "white",
}
fig = fig.update_layout(layout)
'''
将最大校正范围设定在5bp时, 在标注为novel的74504个exon中, 有3558个exon可以完全映射到已注释exon上, 有16229个exon的start附近存在已注释exon, 有10967个exon的end附近存在已注释exon。
已根据其映射情况修改了exon的start或end的位置
'''

In [30]:
#fig.show()
# Export the figure currently bound to `fig` as a static image named "20.png"
# (relative path, so it lands in the kernel's working directory).
fig.write_image("20.png")

In [33]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Correction ranges 1..20 (bp) shared by both series.
correctionRanges = list(range(1, 21))
startEndCounts = [213, 934, 1826, 3197, 3558, 3867, 4071, 4233, 4416, 4559,
                  4677, 4825, 4914, 5005, 5138, 5225, 5308, 5437, 5546, 5630]
novelCounts = [44356, 44197, 44197, 44197, 43750, 43665, 43603, 43549, 43549, 43476,
               43445, 43410, 43410, 43346, 43327, 43298, 43278, 43257, 43232, 43206]

# One subplot carrying two Y axes.
fig = make_subplots(specs=[[{"secondary_y": True}]])

trace1 = go.Scatter(
    x=correctionRanges,
    y=startEndCounts,
    mode="lines+markers",
    name="start-end exon",
    line={"color": "#2A7F62"},
)
trace2 = go.Scatter(
    x=correctionRanges,
    y=novelCounts,
    mode="lines+markers",
    name="novel exon",
    line={"color": "#505168"},
)

# Primary Y axis: start-end exons; secondary Y axis: novel exons.
fig.add_trace(trace1, secondary_y=False)
fig.add_trace(trace2, secondary_y=True)

# Figure layout; all three axes share the same line styling.
axisStyle = {"showline": True, "linecolor": "black", "linewidth": 1, "showticklabels": True}
baseFont = {"family": "Arial", "color": "black"}
layout = {
    "width": 700,
    "height": 400,
    "margin": {'l': 25, 'r': 25, 't': 25, 'b': 25},
    "title": {"text": "", "font": {**baseFont, "size": 16}, "x": 0.4},
    "font": {**baseFont, "size": 12},  # applies everywhere
    "xaxis": {**axisStyle, "dtick": 2},
    "yaxis": dict(axisStyle),
    "yaxis2": dict(axisStyle),
    "paper_bgcolor": "white",
    "plot_bgcolor": "white",
}
fig = fig.update_layout(layout)
fig = fig.update_layout(
    title_text="Correction of exons in different ranges",
    xaxis_title="Range",
    yaxis_title="Counts of start-end exon",
    yaxis2_title="Counts of novel exon",
)

# Show the figure
#fig.write_image("line.png")
fig.show()
'''
之所以选择exon的最大校正范围为5bp, 是因为已经分析了在最大校正范围为1~20时的校正情况。
当范围设定在5bp时, novel exon的减少速度趋于稳定, 且在所有exon中双端可映射到已注释exon的数量的增加速度趋于稳定。
'''

不根据参考基因组注释进行校正:

校正前total exon num: 323363
校正前novel exon num: 133311
校正前total transcript num: 213387
校正前novel transcript num: 185981
mapped 6095
start-end 6503
start 22997
end 19860
校正后total exon num: 309350
校正后novel exon num: 72014
校正后total transcript num: 211068
校正后novel transcript num: 183662

根据参考基因组注释进行校正:



	novel exon	start-end exon	start exon	end exon	remained exon
1	74504	213	17626	12309	44356
2	74504	934	17311	12062	44197
3	74504	1826	16935	11713	44197
4	74504	3197	16355	11107	44197
5	74504	3558	16229	10967	43750
6	74504	3867	16119	10853	43665
7	74504	4071	16029	10801	43603
8	74504	4233	15961	10761	43549
9	74504	4416	15877	10704	43549
10	74504	4559	15802	10667	43476
11	74504	4677	15753	10629	43445
12	74504	4825	15694	10575	43410
13	74504	4914	15651	10557	43410
14	74504	5005	15621	10532	43346
15	74504	5138	15543	10496	43327
16	74504	5225	15501	10480	43298
17	74504	5308	15452	10466	43278
18	74504	5437	15395	10415	43257
19	74504	5546	15337	10389	43232
20	74504	5630	15297	10371	43206
