# gtfParser Class

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#######################################################################################
###                                                                                 ###
###     Copyright (C) 2019  Zhongxu ZHU, CityU, 20200925                            ###
#######################################################################################

# https://github.com/Jverma/GFF-Parser


class gtfParser:
    """Load a GTF/GFF3 annotation file and index its records by gene and transcript.

    Attributes
    ----------
    data : dict
        Sequence name (column 1) -> list of record dicts, in file order.
    dict : dict
        Not used by this class; kept so existing attribute access keeps working.
    gene_attributes_dict : dict
        Gene key -> list of that gene's records (``gene`` features are excluded).
    transcriptID_geneID_dict : dict
        ``transcript_id`` -> gene key.

    Each record dict holds ``source``, ``feature``, ``start`` (int), ``end`` (int),
    ``score``, ``strand`` and ``frame`` plus one entry per parsed attribute.
    ``score`` and ``frame`` are ``None`` when the column is ``.``.

    The gene key is the record's ``gene_name`` when it has one, otherwise its
    ``gene_id``.
    """

    def __init__(self, input_file):
        """Parse ``input_file`` and build the gene / transcript lookup tables.

        Parameters
        ----------
        input_file : path of a tab-separated GTF or GFF3 file. Lines starting
            with ``#`` and blank lines are ignored.

        Raises
        ------
        KeyError
            If a non-``gene`` record lacks ``gene_id`` or ``transcript_id``.
        """
        import sys
        self.data = {}
        self.dict = {}
        self.gene_attributes_dict = {} # ZZX
        self.transcriptID_geneID_dict = {} # ZZX

        sys.stderr.write("#####################\nParsing reference gtf file: " + input_file + '\n#####################\n')

        # The context manager closes the file even if a malformed line raises.
        with open(input_file) as handle:
            for line in handle:
                if line.startswith("#"):
                    continue
                if not line.strip():
                    continue  # tolerate blank lines (e.g. at the end of the file)
                record = line.strip().split("\t")
                sequence_name = record[0]
                score = record[5] if record[5] != '.' else None
                frame = record[7] if record[7] != '.' else None

                alpha = {'source': record[1], 'feature': record[2],
                         'start': int(record[3]), 'end': int(record[4]),
                         'score': score, 'strand': record[6], 'frame': frame}
                # Attributes are merged last, exactly as before, so an attribute
                # key equal to a core field name overrides that field.
                alpha.update(self._parseAttributes(record[8]))
                self.data.setdefault(sequence_name, []).append(alpha)

        # ZZX: index every non-gene record by gene key and by transcript ID
        for records in self.data.values():
            for alpha in records:

                if alpha['feature'] == 'gene': continue # compatible with refSeq annotation 20210505

                gene_id = alpha["gene_id"]
                # Prefer the gene name (gencode convention); fall back to the
                # gene_id when the record carries no gene_name.
                gene_id = alpha.get("gene_name", gene_id)
                transcript_id = alpha["transcript_id"]

                self.transcriptID_geneID_dict[transcript_id] = gene_id
                self.gene_attributes_dict.setdefault(gene_id, []).append(alpha)

    @staticmethod
    def _parseAttributes(raw):
        """Turn the 9th GTF/GFF3 column into a ``{key: value}`` dict.

        ``key "value";`` pairs are used when the column contains both a space
        and a double quote, ``key=value;`` pairs when it contains ``=``.
        Fields without a separator are skipped, values keep embedded spaces or
        ``=`` signs, and an unrecognised column yields an empty dict.
        """
        # Filter empty pieces instead of dropping the last one: GFF3 columns
        # normally have no trailing ';', so the last piece is a real attribute.
        fields = [piece.strip() for piece in raw.split(';')]
        fields = [piece for piece in fields if piece]

        if " " in raw and "\"" in raw:  # compatible with refSeq annotation 20210505
            separator = " "
        elif "=" in raw:  # compatible with gencode annotation 20210505
            separator = "="
        else:
            return {}

        parsed = {}
        for field in fields:
            key, found, value = field.partition(separator)
            if not found:
                continue
            if separator == " ":
                value = value.strip().strip("\"")
            parsed[key] = value
        return parsed

    def getRecordsByID(self, id, attType, attValue):
        """Return the records of a gene or transcript whose attribute matches.

        Parameters
        ----------
        id : gene key (see the class docstring) or ``transcript_id``.
            Gene keys are looked up first.
        attType : record key to test, e.g. ``"feature"``.
        attValue : value ``attType`` must equal, e.g. ``"exon"``.

        Returns
        -------
        list of record dicts. For a gene key, every matching record of the
        gene; for a transcript ID, only matching records of that transcript.
        The list may be empty.

        Raises
        ------
        SystemExit
            With status 1, after reporting on stderr, when ``id`` is neither a
            known gene key nor a known transcript ID.
        """
        # Imported here because the module-level import is not guaranteed to
        # have run when this class is used on its own.
        import sys

        if id in self.gene_attributes_dict:
            return [x for x in self.gene_attributes_dict[id]
                    if attType in x and x[attType] == attValue]

        if id in self.transcriptID_geneID_dict:
            gene_key = self.transcriptID_geneID_dict[id]
            return [x for x in self.gene_attributes_dict[gene_key]
                    if attType in x and x[attType] == attValue and x["transcript_id"] == id]

        sys.stderr.write("Could not find ID "+id+'\n')
        sys.stderr.write("Could not find attribute " + attType + ": " + attValue+'\n')
        sys.exit(1)
    


# Initiating

In [2]:


from optparse import OptionParser
import sys
import fastaparser   

"""
Description
"""
__author__    = "Zhongxu"
__copyright__ = "Copyright 2020, Planet Earth"

# NOTE: these hard-coded arguments replace whatever command line the process was
# started with (notebook usage). Edit the paths here to switch data sets; the
# commented entries are alternative inputs.
sys.argv = ["", "-c", "/data/home2/Zhongxu/tmp/ORFFinder/CRC20230310Full/ensembl/candidate.aa.fa",
                "-r", "/data/home2/Zhongxu/tmp/ORFFinder/CRC20230310Full/ensembl/ref.aa.fa",
                "-m", "/data/home2/Zhongxu/tmp/ORFFinder/CRC20230310Full/ensembl/transcript.gene.map",
                "-o", "/data/home2/Zhongxu/tmp/ORFFinder/CRC20230310Full/ensembl/orf.analysis.tsv",
                "-p", "orffinder", # sqanti or orffinder
                #"-g", "/data/home2/Zhongxu/Ref/refSeq.hg38.gtf",
                "-g", "/data/home2/Zhongxu/tmp/ORFFinder/gencode.v37.annotation.gff3",
                #"-g", "/data/home2/Zhongxu/ref/refSeq.hg19.gtf",
                "-s", "/data/home2/Zhongxu/work/cuhk-crc/20230310out/gencode/1mergegff/all.5.filter.gtf",
                #"-s", "/data/home2/Zhongxu/work/cuhk-crc/20230310out/refseq/1mergegff/all.3.filter20230311.ThreeClassCode.gtf",
                #"-s", "/data/home2/Zhongxu/work/cuhk-crc/20220507/gencode/1mergegff/all.5.filter.gtf",
                #"-s", "/data/home2/Zhongxu/work/cuhk-crc/20220507/refseq/1mergegff/all.3.filter20220507.ThreeClassCode.gtf"
                #"-s", "/data/home2/Zhongxu/work/cuhk-crc/analysis/t0504.ThreeClassCode.gtf"
                #"-s", "/data/home2/Zhongxu/work/pbFLNC/c99i95refMin2/gff/s2.clean.gtf"
                #"-s", "/dataserver145/genomics/zhongxu/work/HCC-organoid-AS/analysis0504gencode/organoid/1prepareMergedGff/all.5.filter.gtf"
                #"-s", "/data/home2/Zhongxu/work/cuhk-crc/20210504Gencode/1prepareMergedGff/all.5.filter.gtf"

           ]

# Command-line interface. Option flags and `dest` names are unchanged; only the
# help output differs: -p now shows a TOOL placeholder, the -e flag no longer
# advertises a FILE argument, and -g / -s describe which annotation they expect.
parser = OptionParser()
parser.add_option("-c", "--candidate", dest="candidate",
                  help="candidate protein sequence predicted by ORFFinder or Sqanti", metavar="FILE")
parser.add_option("-r", "--reference", dest="reference",
                  help="reference protein sequence", metavar="FILE")
parser.add_option("-m", "--map", dest="idmap",
                  help="1st col: transcript ID, 2nd col: Gene ID", metavar="FILE")
parser.add_option("-o", "--output", dest="out",
                  help="Output file path", metavar="FILE")
parser.add_option("-g", "--refgtf", dest="refgtf",
                  help="Reference annotation GTF/GFF3 file path", metavar="FILE")
parser.add_option("-s", "--usergtf", dest="usergtf",
                  help="User (candidate transcript) GTF file path", metavar="FILE")
parser.add_option("-b", "--blast", dest="blast",
                  help="Blast file path", metavar="FILE")
parser.add_option("-p", "--predict", dest="orfpredictiontool",
                  help="ORF prediction tool. sqanti or orffinder", metavar="TOOL")
parser.add_option("-e", "--ensembl", dest="ensemblannotation", action="store_true", default=False,
                  help="If ensembl or gencode annotation")
#-----------------------------------------------------


def proteinSimilarity(seq1,seq2):
    """Score how similar two protein sequences are with a global alignment.

    Parameters
    ----------
    seq1, seq2 : str
        Amino-acid sequences to compare.

    Returns
    -------
    (percent, alignment) : tuple
        ``percent`` is the alignment score of the first alignment returned by
        Biopython's ``pairwise2.align.globalxx`` divided by the length of the
        shorter sequence, times 100, rounded to one decimal.
        ``alignment`` is the two aligned sequences joined by a newline.
        ``(0.0, '')`` is returned when either sequence is empty or no
        alignment is produced; the previous code failed on an unbound
        variable in that situation.
    """
    # Nothing to align: answer before importing Biopython or dividing by zero.
    if not seq1 or not seq2:
        return 0.0, ''

    from Bio import pairwise2 as pw2
    seq_length = min(len(seq1), len(seq2))
    global_align = pw2.align.globalxx(seq1, seq2)
    if not global_align:
        return 0.0, ''

    best = global_align[0]
    matches = best[2]  # third field of an alignment is its score
    percent_match = (matches / seq_length) * 100
    seq2_res = best[0] + '\n' + best[1]
    return round( percent_match,1 ), seq2_res

#-----------------------------------------------------
# Parse sys.argv; `options` carries every input/output path used by the later cells.
options, args = parser.parse_args()
#-----------------------------------------------------

# Parse reference sequence

In [3]:

#-----------------------------------------------------
print("1 -- parsing this file to store gene and transcript mapping information")
# Read the transcript/gene table (column 1: transcript ID, column 2: gene ID)
# in a single pass and close the file afterwards.
#   transcriptGeneMap     : transcript ID -> gene ID (a later line wins on duplicates)
#   geneTranscriptListMap : gene ID -> list of its transcript IDs, in file order
transcriptGeneMap = {}
geneTranscriptListMap = {}

with open(options.idmap) as idmap_file:
    for line in idmap_file:
        fields = line.strip().split("\t")
        if fields == ['']:
            continue  # blank line, e.g. at the end of the file
        transcript = fields[0]
        gene = fields[1]
        transcriptGeneMap[transcript] = gene
        geneTranscriptListMap.setdefault(gene, []).append(transcript)
        


# store CDS start and end position
# Reference transcript ID -> "cds_start|cds_end" in genomic coordinates; either
# side is empty when the matching codon feature is not found exactly once.
refCDSStartEndMap = dict()
ref_seq_gtf = gtfParser(options.refgtf)


print("2 -- parsing ref protein sequence and store cds start and end")
# ref.aa.fa: reference transcript ID -> protein sequence used for comparison
refIdSeqMap = dict()

with open(options.reference) as fasta_file:
    parser = fastaparser.Reader(fasta_file)
    for seq in parser:
        protein = seq.sequence_as_string()
        if protein == "SEQUENCE UNAVAILABLE":
            continue

        # One FASTA record may list several IDs separated by ';'
        for e in seq.id.split(";"):
            start, end = '', ''

            refmRNA_start = ref_seq_gtf.getRecordsByID(e, "feature", "start_codon")
            if len(refmRNA_start) == 1:
                start_codon = refmRNA_start[0]
                strand = start_codon["strand"]
                on_plus_strand = strand == '+'
                # On the minus strand the CDS begins at the codon's right-most base
                start = start_codon["start"] if on_plus_strand else start_codon["end"]

                # The stop codon is only looked up once a unique start codon exists
                refmRNA_end = ref_seq_gtf.getRecordsByID(e, "feature", "stop_codon")
                if len(refmRNA_end) == 1:
                    stop_codon = refmRNA_end[0]
                    end = stop_codon["end"] if on_plus_strand else stop_codon["start"]

            refCDSStartEndMap[e] = "{}|{}".format(start, end)

            # A sequence ending in '*' loses both its first residue and the '*';
            # any other sequence is stored unchanged.
            # NOTE(review): the unchanged case keeps the first residue while
            # candidate sequences are stored without it - confirm this is intended.
            if protein[-1] == '*':
                refIdSeqMap[e] = protein[1:-1]
            else:
                refIdSeqMap[e] = protein
del ref_seq_gtf



#-------------------------------------------
# Read candidate.aa.fa into maps keyed by ORF/transcript ID; only the best
# result is kept, so each ORF ends up with exactly one entry.

print("3 -- parsing user protein sequence and store cds start and end")

# Results of this stage, both keyed by candidate transcript ID:
userCDSStartEndMap = dict()  # -> "genomic_start|genomic_end" of the predicted CDS
canIdSeqMap = dict()         # -> predicted protein sequence without its first residue

# Annotation of the user transcripts, needed to map ORF offsets to the genome
user_seq_gtf = gtfParser(options.usergtf)
def _exonicIndexToGenomic(exons, exonic_length, index):
    """Return the genomic coordinate of the ``index``-th exonic base.

    ``exons`` is a list of ``(start, end)`` pairs sorted by ascending start;
    ``index`` follows Python list indexing over the concatenated exonic bases
    (negative values count from the right-most base). Raises ``IndexError``
    when ``index`` lies outside the exons.
    """
    if index < 0:
        index += exonic_length
    if not 0 <= index < exonic_length:
        raise IndexError("transcript position outside of the exons")
    for exon_start, exon_end in exons:
        span = max(0, exon_end - exon_start + 1)
        if index < span:
            return exon_start + index
        index -= span


def mapTranscriptPosToGenomic(gtf, transcriptID, start, end):
    """Convert transcript-relative ORF offsets into genomic coordinates.

    Parameters
    ----------
    gtf : object providing ``getRecordsByID(id, attType, attValue)``; its exon
        records need ``start``, ``end`` and ``strand`` entries.
    transcriptID : transcript whose exons define the coordinate system.
    start, end : 0-based offsets (int or numeric string) along the transcript.
        On the ``+`` strand offset 0 is the left-most exonic base, on any other
        strand it is the right-most one.

    Returns
    -------
    (genomic_start, genomic_end) : tuple of int

    Raises
    ------
    IndexError
        If the transcript has no exon records or an offset lies outside them.

    Exons are sorted by start position first, so the result no longer depends
    on the order in which the annotation file lists them, and offsets are
    resolved exon by exon instead of materialising every exonic base.
    """
    records = gtf.getRecordsByID(transcriptID, "feature", "exon")
    if not records:
        raise IndexError("No exon records found for transcript " + str(transcriptID))
    strand = records[0]["strand"]

    exons = sorted((record["start"], record["end"]) for record in records)
    exonic_length = sum(max(0, exon_end - exon_start + 1) for exon_start, exon_end in exons)

    if strand == '+':
        first, last = int(start), int(end)
    else:
        # Count from the right-most exonic base: offset 0 -> index -1
        first, last = -1 * int(start) - 1, -1 * int(end) - 1

    return (_exonicIndexToGenomic(exons, exonic_length, first),
            _exonicIndexToGenomic(exons, exonic_length, last))



# Fill canIdSeqMap / userCDSStartEndMap from the candidate protein FASTA.
# Stored sequences omit their first residue; coordinates are mapped to the
# genome through the user GTF.
if options.orfpredictiontool == "orffinder":  # FASTA written by ORFFinder
    print("3 -- parsing user protein sequence and store cds start and end - ORFFinder")
    with open(options.candidate) as fasta_file:
        parser = fastaparser.Reader(fasta_file)

        for seq in parser:
            # >lcl|ORF2_All-H720T_ORG.7684.1:140:1894 unnamed protein product
            id_fields = seq.id.split(":")
            name_fields = seq.id.split("_", 1)[1].split(":")
            if '-CDS:' in seq.id:
                # The transcript ID itself contains one ':' in this case
                candidate_id = name_fields[0] + ":" + name_fields[1]
                start = id_fields[2]
                end = id_fields[3].split(" ")[0]
            else:
                candidate_id = name_fields[0]
                start = id_fields[1]
                end = id_fields[2].split(" ")[0]

            protein = seq.sequence_as_string()[1:]  # drop the first residue
            # Keep only the longest ORF per transcript; the first one seen wins ties
            if candidate_id in canIdSeqMap and len(protein) <= len(canIdSeqMap[candidate_id]):
                continue
            canIdSeqMap[candidate_id] = protein
            g_start, g_end = mapTranscriptPosToGenomic(user_seq_gtf, candidate_id, start, end)
            userCDSStartEndMap[candidate_id] = str(g_start)+"|"+str(g_end)
elif options.orfpredictiontool == "sqanti":
    print("3 -- parsing user protein sequence and store cds start and end - SQANTI")
    with open(options.candidate) as fasta_file:
        parser = fastaparser.Reader(fasta_file)

        for seq in parser:
            # >All-H720T_ORG.7684.1   gene_41230|GeneMark.hmm|584_aa|+|141|1895
            candidate_id = seq.id
            description_fields = seq.description.split("|")
            # The description positions are shifted down by one before mapping
            start = str(int(description_fields[4])-1)
            end = str(int(description_fields[5])-1)

            canIdSeqMap[candidate_id] = seq.sequence_as_string()[1:]
            g_start, g_end = mapTranscriptPosToGenomic(user_seq_gtf, candidate_id, start, end)
            userCDSStartEndMap[candidate_id] = str(g_start)+"|"+str(g_end)
else:
    # Previously an unknown or missing tool name parsed nothing and silently
    # produced a header-only report.
    sys.stderr.write("Unknown ORF prediction tool: " + str(options.orfpredictiontool)
                     + " (expected 'sqanti' or 'orffinder')\n")
    sys.exit(1)


del user_seq_gtf

#-------------------------------------------

1 -- parsing this file to store gene and transcript mapping information


#####################
Parsing reference gtf file: /data/home2/Zhongxu/tmp/ORFFinder/gencode.v37.annotation.gff3
#####################


2 -- parsing ref protein sequence and store cds start and end
3 -- parsing user protein sequence and store cds start and end


#####################
Parsing reference gtf file: /data/home2/Zhongxu/work/cuhk-crc/20230310out/gencode/1mergegff/all.5.filter.gtf
#####################


3 -- parsing user protein sequence and store cds start and end - ORFFinder


# Parsing blast result

In [4]:

#transcriptGeneMap['All-H720T_ORG.7684.1']

#refCDSStartEndMap['All-H720T_ORG.7684.1']

#print(refIdSeqMap['ENST00000532278.1']+'---')
#print(len(canIdSeqMap.keys()))

# Analysis

In [None]:
#-------------------------------------------
# The report goes to the -o/--output file when one is given, otherwise to stdout.
ofs = open(options.out, "w", encoding='utf-8') if options.out else sys.stdout
    
def errInfo(err, id, seq, transcript, refSeq, similarity, seqLength, refSeqLength, 
            len_diff, upstreamSimilarity, u_similar_res, downstreamSimilarity, d_similar_res, stopCodonStateOneRound):
    """Report an unclassifiable candidate/reference pair on stderr and exit.

    Writes ``err`` followed by the candidate and reference sequences, their
    similarity figures, lengths, the upstream/downstream alignments and the
    stop-codon state, then terminates the process with exit status 1.
    """
    report = (
        err,
        id + ": " + seq + "\n",
        transcript + ": " + refSeq + "\n",
        "Similarity: " + str(similarity) + "\n",
        "Candidate length: " + str(seqLength) + "\n",
        "Reference length: " + str(refSeqLength) + "\n",
        "Length different: " + str(len_diff) + "\n",
        "Upstream similarity (10aAA), similarity: " + str(upstreamSimilarity) + "\n",
        u_similar_res + "\n",
        "Downstream similarity (20aAA), similarity: " + str(downstreamSimilarity) + "\n",
        d_similar_res + "\n",
        "Stop codon status " + stopCodonStateOneRound + '\n',
        "Unknown condition, please check ----- \n",
    )
    for chunk in report:
        sys.stderr.write(chunk)
    sys.exit(1)
    
# Header row of the tab-separated report
header_columns = [
    "Gene", "Isoform", "CDS Start", "CDS End", "Class",
    "Protein Length", "Frame", "Start Codon", "Stop Codon",
    "Reference", "Protein Length Change", "Predicted Protein Sequence",
]
ofs.write("\t".join(header_columns) + '\n')

# Per-reference comparison state -> numeric rank, grouped by rank.
# The lowest rank found for a candidate selects its reported class.
frameMap = {
    state: rank
    for rank, states in (
        (1, ('unchanged',)),
        (2, ("5' elongation", "3' elongation", "insert", "delete", "contain annotated orf")),
        (3, ('in-frame change',)),
        (4, ('frameshift',)),
        (5, ('novel start', 'novel transcript', 'novel')),
        (6, ('no reference orf',)),
        (0, ('todo1', 'todo2', 'todo3')),
    )
    for state in states
}

# Rank -> class label written to the "Class" column
classMap = dict(enumerate((
    "todo",
    "unchanged",
    "elongation",
    "in-frame change",
    "frameshift",
    "novel",
    "non-coding or located in non-coding gene",
)))

# Impact priority: unchanged < elongation < in-frame change < frameshift < novel

def _codonPositionState(user_pos, ref_pos, strand):
    """Compare a candidate codon position with the reference one.

    Returns "unchanged" when both are equal, otherwise "late" / "early"
    depending on the strand, or '' when the strand is neither '+' nor '-'.
    """
    if user_pos == ref_pos:
        return "unchanged"
    if strand == '+':
        return "late" if user_pos > ref_pos else "early"
    if strand == "-":
        return "early" if user_pos > ref_pos else "late"
    return ''


def _writeRow(fields, n_columns=12):
    """Write one report row to ``ofs``, padded with empty fields to ``n_columns``.

    The header row has 12 columns; padding keeps rows with fewer populated
    fields aligned with it.
    """
    padded = list(fields) + [''] * (n_columns - len(fields))
    ofs.write("\t".join(padded) + "\n")


# Classify every candidate ORF against the reference proteins of its gene and
# write one report row per candidate.
n_trans = 0
for id, seq in canIdSeqMap.items():
    gene = ''
    n_trans = n_trans + 1

    if not id in transcriptGeneMap.keys():
        continue # not in a gene of interest

    # Reference transcripts themselves are only reported as "annotated"
    if ( "-NM" in id or "-XM" in id or "-NR" in id or "-XR" in id or "ENST00" in id):
        gene = transcriptGeneMap[id]
        nm = id.replace("All-","").replace("ALL-","")
        if nm.startswith("NR") or nm.startswith("XR") or not nm in refCDSStartEndMap.keys():
            _writeRow([gene, id, '', '', "annotated", "", "annotated", "annotated", "", ""])
            continue
        # nm is known to be in refCDSStartEndMap at this point
        nm_s_e = refCDSStartEndMap[nm].split("|")
        pro_len = str(len(refIdSeqMap[nm]))
        _writeRow([gene, id, nm_s_e[0], nm_s_e[1], "annotated", pro_len, "annotated", "annotated", "", ""])
        continue

    if seq == "": # empty prediction; sqanti can output empty sequences
        gene = transcriptGeneMap[id]
        _writeRow([gene, id, '', '', "unannotated", "", "unannotated", "unannotated", "", ""])
        continue

    user_transcript_start = int(userCDSStartEndMap[id].split("|")[0])
    user_transcript_end   = int(userCDSStartEndMap[id].split("|")[1])

    gene = transcriptGeneMap[id]

    transcript_in_gene = geneTranscriptListMap[gene] # all transcripts of the gene
    # Only reference transcripts are compared against; candidate IDs are
    # recognised by a leading "A" (the "All-"/"ALL-" prefix).
    transcript_in_gene = [t for t in transcript_in_gene if not t.startswith("A")]

    # Per-candidate results, one entry per compared reference transcript
    frameshiftStateFinal = []
    frameshiftTranscriptFinal = []
    stopCodonStateFinal = []
    lengthChangeFinal = []
    startCodonStateFinal = []

    seqLength    = len(seq) # length of the candidate sequence as stored
    # Strand is inferred from the order of the genomic CDS start and end
    # (bug fix 20210416: start < end means '+')
    strand = ''
    if user_transcript_start - user_transcript_end < 0:
        strand = '+'
    elif user_transcript_start - user_transcript_end > 0:
        strand = '-'

    for transcript in transcript_in_gene: # every reference transcript of the gene
        frameshiftStateOneRound = ''
        stopCodonStateOneRound = ''
        startCodonStateOneRound = ''

        if not transcript in refIdSeqMap.keys():
            continue # no reference protein sequence
        refSeq = refIdSeqMap[transcript]

        refSeqLength = len(refSeq)
        if(refSeqLength) <= 2: continue # e.g. ENST00000679073.1 whose protein is "F*" (20230328)

        # Skip references without a CDS end before running the alignments,
        # which are the expensive part of this loop.
        pos_split = refCDSStartEndMap[transcript].split("|")
        if pos_split[1] == '': continue
        ref_start = int(pos_split[0])
        ref_end = int(pos_split[1])

        similarity, s_similar_res = proteinSimilarity(seq, refSeq)
        # Similarity of the first 10 and of the last 20 residues
        upstreamSimilarity, u_similar_res = proteinSimilarity(seq[0:10], refSeq[0:10])
        downstreamSimilarity, d_similar_res = proteinSimilarity(seq[-20:], refSeq[-20:])

        # Stop and start codon position relative to the reference
        stopCodonStateOneRound = _codonPositionState(user_transcript_end, ref_end, strand)
        startCodonStateOneRound = _codonPositionState(user_transcript_start, ref_start, strand)

        # Frame classification. Since 20210421 the codon positions are not
        # consulted here; the decision uses sequence similarity only.
        len_diff = seqLength - refSeqLength
        if seq == refSeq:
            frameshiftStateOneRound = "unchanged"
        elif similarity >= 99 and abs(len_diff) <= 5:
            frameshiftStateOneRound = "unchanged"
        elif similarity >= 99 and abs(len_diff) > 5:
            frameshiftStateOneRound = "in-frame change"
        elif (seq[1:7] in refSeq or upstreamSimilarity >= 80) and (seq[-7:-1] in refSeq or downstreamSimilarity >= 80):
            frameshiftStateOneRound = "in-frame change" # both ends match the reference
        elif upstreamSimilarity >= 80 and downstreamSimilarity >= 80:
            # Already covered by the previous condition; kept from the original rule list
            frameshiftStateOneRound = "in-frame change"
        elif similarity < 50 and upstreamSimilarity >=80:
            frameshiftStateOneRound = "frameshift" # same beginning but low overall similarity
        elif refSeq in seq and refSeq[1:5] != seq[1:5]:
            frameshiftStateOneRound = "contain annotated orf"
        elif (seq[1:7] in refSeq or upstreamSimilarity >= 80) and not seq[-7:-1] in refSeq and downstreamSimilarity < 80:
            frameshiftStateOneRound = "frameshift" # beginning matches, end differs
        elif not seq[1:7] in refSeq and upstreamSimilarity < 80:
            frameshiftStateOneRound = "novel transcript" # beginning differs
        else:
            # Defensive guard: report the pair and stop if no rule applied
            errInfo("Same start and end codon, but different sequences\n", id, seq, transcript, refSeq, similarity, seqLength, refSeqLength,
                    len_diff, upstreamSimilarity, u_similar_res, downstreamSimilarity, d_similar_res, stopCodonStateOneRound)

        frameshiftStateFinal.append(frameshiftStateOneRound)
        frameshiftTranscriptFinal.append(transcript)
        stopCodonStateFinal.append(stopCodonStateOneRound)
        lengthChangeFinal.append(str(len_diff))
        startCodonStateFinal.append(startCodonStateOneRound)

    ############# write output
    if (len(stopCodonStateFinal)==0):
        frameshiftStateFinal = ["no reference orf"]
    else:
        seq='M'+seq # put the first amino acid M back
    # NOTE(review): 'M' is only restored when a reference was compared, and the
    # length column adds 1 in both cases, so the reported length and sequence
    # follow different conventions in the two branches - confirm the intent.

    _writeRow([
        gene,
        id,
        str(user_transcript_start),
        str(user_transcript_end),
        classMap[min([frameMap[t] for t in frameshiftStateFinal])],
        str( len(seq)+1),
        ', '.join(frameshiftStateFinal),
        ', '.join(startCodonStateFinal),
        ', '.join(stopCodonStateFinal),
        ', '.join(frameshiftTranscriptFinal),
        ', '.join(lengthChangeFinal),
        seq,
    ])
    ofs.flush()
    
    

ofs.flush()
# Close only a real output file: when no -o path is given, ofs is sys.stdout
# and closing it would make the print calls below fail.
if ofs is not sys.stdout:
    ofs.close()


print("Finished\nTotal transcript: ")
print(n_trans)
