From 62e5f301f6aa5339b91a95ef8f6136f4290ef40d Mon Sep 17 00:00:00 2001 From: Beibei Chen Qbri Date: Mon, 6 Oct 2014 01:26:15 -0500 Subject: [PATCH 1/6] Deal with reads without MD flag --- lib/CLIP.py | 1 + lib/Mutation2.py | 67 +++++++++++++++++++++++++----------------------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/lib/CLIP.py b/lib/CLIP.py index 9c9a646..4d32ee4 100644 --- a/lib/CLIP.py +++ b/lib/CLIP.py @@ -247,6 +247,7 @@ def updateMutation(self,read,mis): elif self.type ==2: mutation_filter = Utils.filterMutations(mutations,"G->A",True) mutations = mutation_filter + #logging.debug("read %s " % read) if len(mutations)>0: for m in mutations: #print m diff --git a/lib/Mutation2.py b/lib/Mutation2.py index 08a80d8..ee29424 100644 --- a/lib/Mutation2.py +++ b/lib/Mutation2.py @@ -47,12 +47,14 @@ def countMismatch(b): for i in myList: if i[0]=="NM": return (i[1]) + return 0 def survey(entry): + #logging.debug(entry.cigar) mismatchNumber = countMismatch(entry.tags) insertionNumber = countInsertionNumber(entry.cigar) deletionNumber = countDeletionNumber(entry.cigar) - substitutionNumber = mismatchNumber-insertionNumber-deletionNumber + substitutionNumber = max(0,mismatchNumber-deletionNumber) #print insertionNumber,deletionNumber return ([insertionNumber,deletionNumber,substitutionNumber]) @@ -92,6 +94,7 @@ def parseCIGAR(ci): #calculate match segment length list return matchSeg def parseMD(b): + buf = [] myList = b for j in myList: if j[0]=="MD": @@ -242,37 +245,37 @@ def getMutations(infile,read): #logging.debug("call getMutation function %s" % read) mutationList = [] b=read.tags - if countMismatch(b)>0: #and countMismatch(b)<2 and countMatchNumber(item.cigar)>=20: - #tmp.write(item) - sur = survey(read) - insertion = sur[0] - deletion = sur[1] - substi = sur[2] - insertionSeqLoc = [] - if insertion > 0: - insertionDic = insertionLocation(read,insertion) - for k in insertionDic.keys(): - for loc_index in range(len(insertionDic[k])): - insertionSeqLoc.append(insertionDic[k][loc_index]) - mu = read.seq[insertionDic[k][loc_index]] - loc = k+loc_index+read.pos - if read.tid >=0: - chr = infile.getrname(read.tid) - if read.is_reverse: - strand = '-' - mu = RC([mu])[0] - else: - strand = "+" - mutationList.append(MutationBed(chr,loc,loc+1,read.qname,1,strand,"Insertion->"+mu))#change insertionDic[k][loc_index] to 1 - insertionSeqLoc.sort() - if deletion + substi > 0: - for mu in mutationLocation(read,insertionSeqLoc): - if read.tid>=0: - chr = infile.getrname(read.tid) - newMu = MutationBed(chr,mu[0],mu[1],mu[2],mu[3],mu[4],mu[5]) - mutationList.append(newMu) - #logging.debug(mutationList) - return mutationList + + sur = survey(read) + insertion = sur[0] + deletion = sur[1] + substi = sur[2] + #logging.debug("insertio:,%d, deletion:%d,sub:%d" % (insertion,deletion,substi)) + insertionSeqLoc = [] + if insertion > 0: + insertionDic = insertionLocation(read,insertion) + for k in insertionDic.keys(): + for loc_index in range(len(insertionDic[k])): + insertionSeqLoc.append(insertionDic[k][loc_index]) + mu = read.seq[insertionDic[k][loc_index]] + loc = k+loc_index+read.pos + if read.tid >=0: + chr = infile.getrname(read.tid) + if read.is_reverse: + strand = '-' + mu = RC([mu])[0] + else: + strand = "+" + mutationList.append(MutationBed(chr,loc,loc+1,read.qname,1,strand,"Insertion->"+mu))#change insertionDic[k][loc_index] to 1 + insertionSeqLoc.sort() + if deletion + substi > 0: + for mu in mutationLocation(read,insertionSeqLoc): + if read.tid>=0: + chr = infile.getrname(read.tid) + newMu = MutationBed(chr,mu[0],mu[1],mu[2],mu[3],mu[4],mu[5]) + mutationList.append(newMu) + #logging.debug(mutationList) + return mutationList def getTruncations(infile,read): From 74fc1affcbae2fa9dd19386e6b78ebc2e58b4282 Mon Sep 17 00:00:00 2001 From: Beibei Chen Qbri Date: Mon, 6 Oct 2014 16:39:23 -0500 Subject: [PATCH 2/6] Separate plus and minus clusters --- lib/CLIP.py | 53 ++++++++++++++++++++++++++++++++++++----------------- pipeclip.py | 2 +- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/lib/CLIP.py b/lib/CLIP.py index 4d32ee4..3c0b875 100644 --- a/lib/CLIP.py +++ b/lib/CLIP.py @@ -29,15 +29,17 @@ def __init__(self,fileaddr,prefix): self.currentGroupKey = "None" self.currentGroup = [] #Used in rmdup self.previousQul = [0,0,0]#for rmdup,[matchlen,mapq,mismatch] - self.clusters = [] - self.currentCluster = Alignment.BED("",0,0,"",0,".") + self.clusters_plus = [] + self.clusters_minus = [] + self.clusters = [] + self.currentCluster_plus = Alignment.BED("",0,0,"",0,"+") + self.currentCluster_minus = Alignment.BED("",0,0,"",0,"-") #self.sigClusters = {} self.mutations = {} #Dictionary of bed instance self.mutationCount = 0 self.sigMutations = {}#same as sigClusters self.sigMutationCount = 0 self.sigClusterCount = 0 - self.wig = None self.coverage = 0 #"reads coverage of this sample" self.bamheader = None self.crosslinking = {} @@ -100,9 +102,9 @@ def readfile(self): # for i in self.filteredAlignment: # print i -# def printClusters(self): -# for i in self.clusters: -# print i + def printClusters(self): + for i in self.clusters: + print i def printMutations(self): for i in self.mutations.values(): @@ -221,19 +223,33 @@ def rmdup(self): def updateCluster(self,read): '''Cluster new read to known clusters and update cluster reads count''' strandDic = {"True":"-","False":"+"} - clusterName = "cluster"+"_"+str(len(self.clusters)+1) + clusterName = "cluster"+"_"+str(len(self.clusters_plus)+len(self.clusters_minus)+1) newRead = Alignment.ClusterBed(self.originalBAM.getrname(read.tid),read.pos,read.pos+len(read.seq),clusterName,1,strandDic[str(read.is_reverse)]) - if self.currentCluster.chr == "": #Initiate cluster - self.currentCluster = newRead - self.clusters.append(self.currentCluster) + if read.is_reverse: + if self.currentCluster_minus.chr == "": #Initiate cluster + self.currentCluster_minus = newRead + self.clusters.append(self.currentCluster_minus) + else: + if self.currentCluster_minus.overlap(newRead): + self.currentCluster_minus.merge(newRead) + self.clusters_minus[-1]=self.currentCluster_minus + else:#New read is a new cluster + #self.clusters.append(self.currentCluster) + self.currentCluster_minus = newRead + self.clusters_minus.append(self.currentCluster_minus) else: - if self.currentCluster.overlap(newRead): - self.currentCluster.merge(newRead) - self.clusters[-1]=self.currentCluster - else:#New read is a new cluster - #self.clusters.append(self.currentCluster) - self.currentCluster = newRead - self.clusters.append(self.currentCluster) + if self.currentCluster_plus.chr == "": #Initiate cluster + self.currentCluster_plus = newRead + self.clusters_plus.append(self.currentCluster_plus) + else: + if self.currentCluster_plus.overlap(newRead): + self.currentCluster_plus.merge(newRead) + self.clusters_plus[-1]=self.currentCluster_plus + else:#New read is a new cluster + #self.clusters.append(self.currentCluster) + self.currentCluster_plus = newRead + self.clusters_plus.append(self.currentCluster_plus) + def updateMutation(self,read,mis): mutations = [] @@ -393,6 +409,9 @@ def filter(self,matchLen,mismatch,cliptype,duprm): self.posfilteredBAM = pysam.Samfile(self.outprefix+".pos.filtered.bam","rb")# move file pointer to the file head self.negfilteredBAM = pysam.Samfile(self.outprefix+".neg.filtered.bam","rb")# move file pointer to the file head self.originalBAM = None + self.clusters = self.clusters_plus + self.clusters_minus + self.clusters_plus = None + self.clusters_minus = None logging.debug("After filtering, %d reads left" % (self.filteredAlignment)) logging.debug("There are %d clusters in total" % (len(self.clusters))) logging.debug("There are %d mutations in total" % (len(self.mutations))) diff --git a/pipeclip.py b/pipeclip.py index 59a4255..09dd875 100755 --- a/pipeclip.py +++ b/pipeclip.py @@ -3,7 +3,6 @@ #Refactored by: eric.roos@utsouthwestern.edu #Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species #Required packages: pysam, ghmm, pybedtools -#Last modification: 18 Sep 2014 import sys @@ -36,6 +35,7 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste logging.info("Species info %s" % species) if myClip.readfile(): myClip.filter(matchLength,mismatch,clipType,rmdup) + #myClip.printClusters() #myClip.printMutations() if len(myClip.clusters)>0: logging.info("Get enriched clusters") From 98b74e66121c0fdf9f9e265eb854f081f8edc1d5 Mon Sep 17 00:00:00 2001 From: Beibei Chen Qbri Date: Wed, 8 Oct 2014 10:20:45 -0500 Subject: [PATCH 3/6] reduce memory, mutation dic to array --- lib/Alignment.py | 21 +++++---- lib/CLIP.py | 14 +++--- lib/Enrich.py | 110 +++++++++++++++++++++++++++++------------------ lib/Utils.py | 12 ++++++ pipeclip.py | 92 +++++++++++++++++++-------------------- 5 files changed, 147 insertions(+), 102 deletions(-) diff --git a/lib/Alignment.py b/lib/Alignment.py index ea9c3ca..4b1336f 100644 --- a/lib/Alignment.py +++ b/lib/Alignment.py @@ -79,15 +79,18 @@ def fishertest(self): self.fisherP = fp +class wiglist: + def __init__(self): + self.pos = [] + self.value = [] -class BAM: - def __init__(self,filepath): - self.filePath = filePath - self.header = None - self.sorted = None - - def checkHeader(self): - '''Use gzip package to check if this is a bam or sam''' - pass + def valueByPos(self,p): + try: + return self.value[self.pos.index(p)] + except: + return False + def update(self,p,v): + self.pos.append(p) + self.value.append(v) diff --git a/lib/CLIP.py b/lib/CLIP.py index 3c0b875..6576e39 100644 --- a/lib/CLIP.py +++ b/lib/CLIP.py @@ -12,7 +12,9 @@ import OptValidator import subprocess import datetime +import gc +gc.enable() OptValidator.opt_validate() @@ -406,12 +408,14 @@ def filter(self,matchLen,mismatch,cliptype,duprm): pysam.index(self.outprefix+".neg.filtered.bam") #self.filteredBAM = pysam.Samfile(outprefix+".filtered.bam","rb")# move file pointer to the file head - self.posfilteredBAM = pysam.Samfile(self.outprefix+".pos.filtered.bam","rb")# move file pointer to the file head - self.negfilteredBAM = pysam.Samfile(self.outprefix+".neg.filtered.bam","rb")# move file pointer to the file head - self.originalBAM = None + self.posfilteredBAM = self.outprefix+".pos.filtered.bam" + self.negfilteredBAM = self.outprefix+".neg.filtered.bam" + del self.originalBAM self.clusters = self.clusters_plus + self.clusters_minus - self.clusters_plus = None - self.clusters_minus = None + del self.clusters_plus + del self.clusters_minus + self.mutations = self.mutations.values() + gc.collect() logging.debug("After filtering, %d reads left" % (self.filteredAlignment)) logging.debug("There are %d clusters in total" % (len(self.clusters))) logging.debug("There are %d mutations in total" % (len(self.mutations))) diff --git a/lib/Enrich.py b/lib/Enrich.py index 51422a9..04989c4 100755 --- a/lib/Enrich.py +++ b/lib/Enrich.py @@ -25,8 +25,11 @@ import OptValidator import datetime import Utils +from memory_profiler import profile +import gc OptValidator.opt_validate() +gc.enable() stats = importr('stats') @@ -45,6 +48,7 @@ def BH(pvalue,pRank,N): qva = max(pvalue, q) return qva +#@profile def KMvalue(posmapfile,negmapfile,mufile): ''' Calculate K(coverage) value for each mutation location @@ -90,9 +94,11 @@ def KMvalue(posmapfile,negmapfile,mufile): else: km_pair[pair_name] = 1 #km.append(item) + gc.collect() return km_pair -def KMvalue_ignore(posmapfile,negmapfile,mufile): +@profile +def KMvalue_test(posmapfile,negmapfile,mufile): ''' Calculate K(coverage) value for each mutation location Mutations are already unique. @@ -100,7 +106,62 @@ def KMvalue_ignore(posmapfile,negmapfile,mufile): km = []#store mutations with updated k value km_pair = {}#Dic of count tuples of (k,m),key:"K_M" count = 0 + currentChr = "" + #logging.debug("make wig %s" % str(datetime.datetime.now())) + start_time = datetime.datetime.now() + posBAM = pysam.Samfile(posmapfile,"rb") + negBAM = pysam.Samfile(negmapfile,"rb") + for item in mufile: + count += 1 + if count % 5000 == 0: + stop_time = datetime.datetime.now() + #logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) + start_time = stop_time + st = [] + strand = item.strand + M = item.score + K = 0 + if item.chr != currentChr: + poswig = Utils.makeWigListByChr(posBAM,item.chr) + negwig = Utils.makeWigListByChr(negBAM,item.chr) + currentChr = item.chr + #logging.debug("Time begin to pileup is %s" % (str(datetime.datetime.now()))) + if strand == "+": + try: + K = poswig.valueByPos(item.start) + except: + continue + elif strand == "-": + try: + K = negwig.valueByPos(item.start) + except: + contiune + + if K>=M: + item.updateK(K) + pair_name = str(K)+"_"+str(M) + if km_pair.has_key(pair_name): + km_pair[pair_name] += 1 + else: + km_pair[pair_name] = 1 + #km.append(item) + posBAM.close() + negBAM.close() + del posBAM + del negBAM + gc.collect() + return km_pair + + +def KMvalue_ignore(posmapfile,negmapfile,mufile): + ''' + Calculate K(coverage) value for each mutation location + Mutations are already unique. + ''' + km = []#store mutations with updated k value + km_pair = {}#Dic of count tuples of (k,m),key:"K_M" + count = 0 start_time = datetime.datetime.now() for item in mufile: count += 1 @@ -143,12 +204,12 @@ def uniq(b): #b is a list uniqElements.sort() return uniqElements - +#@profile def mutationEnrich(clip,threshold=0.01): coverage = clip.coverage *1.0 totalMuCount = clip.mutationCount #(original_KM,KM_test) = KMvalue(clip.originalBAM, clip.mutations) - KM_test = KMvalue(clip.posfilteredBAM,clip.negfilteredBAM, clip.mutations.values())#check after doing KM, if clip.mutations changed + KM_test = KMvalue_test(clip.posfilteredBAM,clip.negfilteredBAM, clip.mutations)#check after doing KM, if clip.mutations changed clip.posfilteredBAM = None clip.negfilteredBAM = None #logging.info("Finished K-M counting, starting fitting.") @@ -164,7 +225,7 @@ def mutationEnrich(clip,threshold=0.01): km_p[k]=p pCount = dict(Counter(pvalues)) pRank = freqRank(pCount,True) - total_test = len(clip.mutations.keys()) + total_test = len(clip.mutations) pqDic={} for i in pRank.keys(): try: @@ -175,7 +236,7 @@ def mutationEnrich(clip,threshold=0.01): print >> sys.stderr,"Cannot find p value in dictionary" continue count = 0 - for mu in clip.mutations.values(): + for mu in clip.mutations:#.values(): name = str(mu.kvalue)+"_"+str(mu.score) try: mu.pvalue = km_p[name] @@ -190,6 +251,7 @@ def mutationEnrich(clip,threshold=0.01): mu.sig = True clip.sigMutationCount += 1 clip.addSigToDic(clip.sigMutations,mu) + clip.mutations = None #logging.info("There are %d reliable mutations" % clip.sigMutationCount) @@ -255,6 +317,7 @@ def clusterEnrich(clip,threshold=0.01): clip.clusters[i].sig = True clip.sigClusterCount += 1 #clip.addSigToDic(clip.sigClusters,clip.clusters[i]) + nbDic = None if clip.sigClusterCount == 0: return False else: @@ -275,40 +338,3 @@ def fisherTest(clusterp,mutationp): return fps -def mutationEnrich_ignore(clip): - coverage = clip.coverage - if self.isPar: #input is a par,no need to split the file - filename = self.outputRoot+".bed" - outputfile = open(filename,"wa") - print >> outputfile,"#chr\tstart\tend\tname\tp\tstrand\ttype\tk\tm" - for reliable_mu in self.muEvaluate(self.bamFile,self.mutationFile,coverage,self.fdr): - print >>outputfile,'\t'.join(reliable_mu) - else: #splitfile to insertion, deletion, substitution - insertion = [] - deletion = [] - substitution = [] - for item in self.mutationFile: - if item[-1].count("Deletion")>0: - deletion.append(item) - elif item[-1].count("Insertion")>0: - insertion.append(item) - else: - substitution.append(item) - del_name = self.outputRoot+"_deletion.bed" - ins_name = self.outputRoot+"_insertion.bed" - sub_name = self.outputRoot+"_substitution.bed" - - outfile_del = open(del_name,"wa") - outfile_ins = open(ins_name,"wa") - outfile_sub = open(sub_name,"wa") - print >> outfile_ins,"#chr\tstart\tend\tname\t-log(q)\tstrand\ttype\tk\tm" - print >> outfile_del,"#chr\tstart\tend\tname\t-log(q)\tstrand\ttype\tk\tm" - print >> outfile_sub,"#chr\tstart\tend\tname\t-log(q)\tstrand\ttype\tk\tm" - for reliable_mu in self.muEvaluate(self.bamFile,insertion,coverage,self.fdr): - print >> outfile_ins,'\t'.join(reliable_mu) - for reliable_mu in self.muEvaluate(self.bamFile,deletion,coverage,self.fdr): - print >> outfile_del,'\t'.join(reliable_mu) - for reliable_mu in self.muEvaluate(self.bamFile,substitution,coverage,self.fdr): - print >> outfile_sub,'\t'.join(reliable_mu) - - diff --git a/lib/Utils.py b/lib/Utils.py index e87da64..2471d1c 100644 --- a/lib/Utils.py +++ b/lib/Utils.py @@ -85,6 +85,18 @@ def makeWig(bamfile): return wig +def makeWigByChr(bamfile,chr): + wig = {} + for pileupColumn in bamfile.pileup(chr): + wig[str(pileupColumn.pos)]=pileupColumn.n + return wig + + +def makeWigListByChr(bamfile,chr): + wig = Alignment.wiglist() + for pileupColumn in bamfile.pileup(chr): + wig.update(pileupColumn.pos,pileupColumn.n) + return wig def bisort(alist,b): '''List is a list of bed instances, b is also an instance''' diff --git a/pipeclip.py b/pipeclip.py index 09dd875..be700b9 100755 --- a/pipeclip.py +++ b/pipeclip.py @@ -36,21 +36,21 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste if myClip.readfile(): myClip.filter(matchLength,mismatch,clipType,rmdup) #myClip.printClusters() - #myClip.printMutations() - if len(myClip.clusters)>0: - logging.info("Get enriched clusters") - status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster) - if status: - logging.info("Found %d enriched clusters" % myClip.sigClusterCount) - myClip.printEnrichedClusters() - else: - logging.error("There is no enriched cluster found. Exit program") - sys.exit(1) - else: - logging.error("There is no clusters found. Please check input.Exit program.") - sys.exit(1) + # myClip.printMutations() +#BC# if len(myClip.clusters)>0: +#BC# logging.info("Get enriched clusters") +#BC# status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster) +#BC# if status: +#BC# logging.info("Found %d enriched clusters" % myClip.sigClusterCount) +#BC# myClip.printEnrichedClusters() +#BC# else: +#BC# logging.error("There is no enriched cluster found. Exit program") +#BC# sys.exit(1) +#BC# else: +#BC# logging.error("There is no clusters found. Please check input.Exit program.") +#BC# sys.exit(1) - if len(myClip.mutations.keys())>0: + if len(myClip.mutations)>0: logging.info("Get reliable mutations") Enrich.mutationEnrich(myClip,fdrReliableMutation) logging.info("There are %d reliable mutations" % myClip.sigMutationCount) @@ -58,38 +58,38 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste else: logging.warning("There is no mutation found in this BAM file.") #Start to get crosslinking sites - if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0: - logging.info("Get cross-linking sites") - myClip.getCrosslinking() - if (len(myClip.crosslinking.keys())>0): - outfilelist = myClip.printCrosslinkingSites() - myClip.printCrosslinkingMutations() - else: - logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.") - outfilelist = myClip.printEnrichClusters() - else: - if myClip.sigClusterCount <= 0: - logging.error("There is no enriched clusters for this sample, please check your input file. Exit.") - sys.exit(2) - elif myClip.sigMutationCount <=0: - logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.") - outfilelist = myClip.printEnrichClusters() - #annotation if possible - if species in ["mm10","mm9","hg19"]: - logging.info("Started to annotate cross-linking sits using HOMER") - for name in outfilelist: - #logging.debug("Start to do annotation for %s" % name) - Utils.annotation(name,species) - #output a status log file - logfile = open(outputPrefix+".pipeclip.summary.log","w") - print >>logfile,"PIPE-CLIP run finished. Parameters are:" - print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation) - print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment) - print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters)) - print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount) - print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations)) - logfile.close() - logging.info("PIPE-CLIP finished the job, please check your results. :)") +#BC# if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0: +#BC# logging.info("Get cross-linking sites") +#BC# myClip.getCrosslinking() +#BC# if (len(myClip.crosslinking.keys())>0): +#BC# outfilelist = myClip.printCrosslinkingSites() +#BC# myClip.printCrosslinkingMutations() +#BC# else: +#BC# logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.") +#BC# outfilelist = myClip.printEnrichClusters() +#BC# else: +#BC# if myClip.sigClusterCount <= 0: +#BC# logging.error("There is no enriched clusters for this sample, please check your input file. Exit.") +#BC# sys.exit(2) +#BC# elif myClip.sigMutationCount <=0: +#BC# logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.") +#BC# outfilelist = myClip.printEnrichClusters() +#BC# #annotation if possible +#BC# if species in ["mm10","mm9","hg19"]: +#BC# logging.info("Started to annotate cross-linking sits using HOMER") +#BC# for name in outfilelist: +#BC# #logging.debug("Start to do annotation for %s" % name) +#BC# Utils.annotation(name,species) +#BC# #output a status log file +#BC# logfile = open(outputPrefix+".pipeclip.summary.log","w") +#BC# print >>logfile,"PIPE-CLIP run finished. Parameters are:" +#BC# print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation) +#BC# print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment) +#BC# print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters)) +#BC# print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount) +#BC# print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations)) +#BC# logfile.close() +#BC# logging.info("PIPE-CLIP finished the job, please check your results. :)") else: logging.error("File corruputed, program exit.") sys.exit(0) From 5f5ad791dc5efd9e9750617c63bd1bbc30688bdb Mon Sep 17 00:00:00 2001 From: Beibei Chen Qbri Date: Wed, 8 Oct 2014 12:06:20 -0500 Subject: [PATCH 4/6] bug fix in mutation --- lib/Mutation2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Mutation2.py b/lib/Mutation2.py index ee29424..04bc4a6 100644 --- a/lib/Mutation2.py +++ b/lib/Mutation2.py @@ -214,7 +214,8 @@ def mutationLocation(entry,insertLoc):#return mutation location in strand = '+' index1 = loc - match.pos insertionBefore = countInsertionBefore(index1,insertLoc) - index1 += insertionBefore #added 9 Oct + index1 += insertionBefore-set_insertion #added 9 Oct + set_insertion = insertionBefore mutation = [str(loc),str(loc+1),match.qname,1,strand,"Deletion->"+ch] #change str(index1) to 1 yield mutation pre = ch From 450f5c86489b569b013727caa20d113bae352ab7 Mon Sep 17 00:00:00 2001 From: Beibei Chen Qbri Date: Sun, 2 Nov 2014 20:37:13 -0600 Subject: [PATCH 5/6] clustering bug fix and memory reduce --- lib/CLIP.py | 174 ++++++++++++++++++++++++++++++++++++++++---- lib/Enrich.py | 121 +++++++++++++++--------------- lib/Utils.py | 11 ++- lib/ZTNB_tryCatch.R | 4 - pipeclip.py | 8 +- 5 files changed, 233 insertions(+), 85 deletions(-) diff --git a/lib/CLIP.py b/lib/CLIP.py index 6576e39..ac0b54c 100644 --- a/lib/CLIP.py +++ b/lib/CLIP.py @@ -12,7 +12,9 @@ import OptValidator import subprocess import datetime +import time import gc +import os gc.enable() OptValidator.opt_validate() @@ -23,6 +25,7 @@ def __init__(self,fileaddr,prefix): self.filepath = fileaddr self.originalBAM = None self.originalMapped = 0 + self.refInfo = [] self.posfilteredBAM = None self.negfilteredBAM = None self.filteredAlignment = 0 @@ -31,15 +34,18 @@ def __init__(self,fileaddr,prefix): self.currentGroupKey = "None" self.currentGroup = [] #Used in rmdup self.previousQul = [0,0,0]#for rmdup,[matchlen,mapq,mismatch] + self.clusters = [] self.clusters_plus = [] self.clusters_minus = [] - self.clusters = [] + self.clusterCount = 0 self.currentCluster_plus = Alignment.BED("",0,0,"",0,"+") self.currentCluster_minus = Alignment.BED("",0,0,"",0,"-") #self.sigClusters = {} self.mutations = {} #Dictionary of bed instance self.mutationCount = 0 + self.mutationList = [] self.sigMutations = {}#same as sigClusters + self.kmpair = {} self.sigMutationCount = 0 self.sigClusterCount = 0 self.coverage = 0 #"reads coverage of this sample" @@ -95,6 +101,7 @@ def testInput(self): def readfile(self): try: self.originalBAM = pysam.Samfile(self.filepath,"rb") + self.refInfo = zip(self.originalBAM.references,self.originalBAM.lengths) return True except IOError,message: logging.error("Cannot open input file"+message) @@ -105,13 +112,39 @@ def readfile(self): # print i def printClusters(self): - for i in self.clusters: - print i + outfile = open(self.outprefix+".clusters.bed","w") + for i in [self.clusters_plus,self.clusters_minus]: + for j in i: + print >>outfile,j + outfile.close() def printMutations(self): + outfile = open(self.outprefix+".mutations.bed","w") for i in self.mutations.values(): - print i + print >>outfile,i + outfile.close() + + def printMutations_chr(self): + fhs = {}#file handle dic, key is chr, value is file handle + for chr,chrlen in self.refInfo: + outfile = open(self.outprefix+"."+chr+".mutations.bed","w") + fhs[chr] = outfile + for i in self.mutations.itervalues(): + print >>fhs[i.chr],i + #clean up + for fh in fhs.itervalues(): + fh.close() + del fhs + + + def printMutationSingleChr(self,chr): + outfile = open(self.outprefix+"."+chr+".mutations.bed","w") + for i in self.mutations.itervalues(): + print >>outfile,i + #clean up + outfile.close() + def printReliableMutations(self): outfile = open(self.outprefix+".reliableMutations.pipeclip.bed","w") header = "#chr\tstart\tstop\tmutation_name\tM_value\tstrand\ttype\tK_value\tp_value\tfdr" @@ -230,7 +263,8 @@ def updateCluster(self,read): if read.is_reverse: if self.currentCluster_minus.chr == "": #Initiate cluster self.currentCluster_minus = newRead - self.clusters.append(self.currentCluster_minus) + self.clusters_minus.append(self.currentCluster_minus) + self.clusterCount += 1 else: if self.currentCluster_minus.overlap(newRead): self.currentCluster_minus.merge(newRead) @@ -239,10 +273,12 @@ def updateCluster(self,read): #self.clusters.append(self.currentCluster) self.currentCluster_minus = newRead self.clusters_minus.append(self.currentCluster_minus) + self.clusterCount+=1 else: if self.currentCluster_plus.chr == "": #Initiate cluster self.currentCluster_plus = newRead self.clusters_plus.append(self.currentCluster_plus) + self.clusterCount+=1 else: if self.currentCluster_plus.overlap(newRead): self.currentCluster_plus.merge(newRead) @@ -251,6 +287,7 @@ def updateCluster(self,read): #self.clusters.append(self.currentCluster) self.currentCluster_plus = newRead self.clusters_plus.append(self.currentCluster_plus) + self.clusterCount += 1 def updateMutation(self,read,mis): @@ -320,7 +357,7 @@ def getCrosslinking(self): self.crosslinking[cross_key] = Alignment.CrosslinkingBed(cluster.chr,cluster.start,cluster.stop,cluster.name,cluster.score,cluster.strand,cluster.pvalue,cluster.qvalue,mutation.start,mutation.name,mutation.pvalue) self.crosslinkingMutations.append(mutation) #start to calculate fisher test p value - for k in self.crosslinking.keys(): + for k in self.crosslinking.iterkeys(): self.crosslinking[k].fishertest() @@ -332,6 +369,9 @@ def filter(self,matchLen,mismatch,cliptype,duprm): logging.info("CLIP type:%s" % (str(cliptype))) logging.info("Rmdup code:%s" % (str(duprm))) logging.info("There are %d reads in origianl input file" % (self.originalBAM.mapped)) + #print >> sys.stderr,zip(self.originalBAM.references,self.originalBAM.lengths) + #sys_info = os.system("free -m") + #logging.debug(sys_info) #outBAM = pysam.Samfile(outprefix+".filtered.bam","wb",template=self.originalBAM) self.originalMapped = self.originalBAM.mapped outBAM_pos = pysam.Samfile(self.outprefix+".pos.filtered.bam","wb",template=self.originalBAM) @@ -406,17 +446,123 @@ def filter(self,matchLen,mismatch,cliptype,duprm): #pysam.index(outprefix+".filtered.bam") pysam.index(self.outprefix+".pos.filtered.bam") pysam.index(self.outprefix+".neg.filtered.bam") - #self.filteredBAM = pysam.Samfile(outprefix+".filtered.bam","rb")# move file pointer to the file head - self.posfilteredBAM = self.outprefix+".pos.filtered.bam" self.negfilteredBAM = self.outprefix+".neg.filtered.bam" - del self.originalBAM self.clusters = self.clusters_plus + self.clusters_minus - del self.clusters_plus - del self.clusters_minus - self.mutations = self.mutations.values() + self.printClusters() + self.printMutations_chr() + #Clean up variables + self.originalBAM = None + self.clusters_plus = None + self.clusters_minus = None + self.mutations = None gc.collect() logging.debug("After filtering, %d reads left" % (self.filteredAlignment)) - logging.debug("There are %d clusters in total" % (len(self.clusters))) - logging.debug("There are %d mutations in total" % (len(self.mutations))) + logging.debug("There are %d clusters in total" % (self.clusterCount)) + logging.debug("There are %d mutations in total" % (self.mutationCount)) + + def filter2(self,matchLen,mismatch,cliptype,duprm): + '''Filter the input BAM file according to parameters. Make clusters and mutations at the same time''' + logging.info("Start to filter alignment using parameters:") + logging.info("match length:%d" % (matchLen)) + logging.info("mismatch count: %d" % (mismatch)) + logging.info("CLIP type:%s" % (str(cliptype))) + logging.info("Rmdup code:%s" % (str(duprm))) + logging.info("There are %d reads in origianl input file" % (self.originalBAM.mapped)) + + self.originalMapped = self.originalBAM.mapped + outBAM_pos = pysam.Samfile(self.outprefix+".pos.filtered.bam","wb",template=self.originalBAM) + outBAM_neg = pysam.Samfile(self.outprefix+".neg.filtered.bam","wb",template=self.originalBAM) + self.type = cliptype + if cliptype == 3:#make sure there is no rmdup for iCLIP data + duprm = 0 + count = 0 + start_time = datetime.datetime.now() + currentChr = "" + for alignment in self.originalBAM: + if not alignment.cigar : #reads is unmapped + continue + c_chr = self.originalBAM.getrname(alignment.tid) + if currentChr == "": + currentChr = c_chr + elif c_chr!=currentChr: + #logging.debug("Mutation sites for Current ref %s is %d" % (currentChr,len(self.mutations.keys()))) + self.printMutationSingleChr(currentChr) + self.mutations = None + gc.collect() + self.mutations = {} + currentChr = c_chr + count += 1 + if count % 5000000 ==0: + stop_time = datetime.datetime.now() + logging.debug("Processed %d reads in %s" % (count,str(stop_time-start_time))) + start_time = stop_time + flag,mlen,mis = Utils.readQuaFilter(alignment,matchLen,mismatch) + if flag: + if duprm > 0: + #get group key + if duprm == 1: + groupkey = Utils.rmdupKey_Start(alignment) + elif duprm == 2: + groupkey = Utils.rmdupKey_Seq(alignment) + #check current group + if groupkey == self.currentGroupKey:#overlap with current group, update group + self.updateCurrentGroup(alignment,mlen,mis) + else:#get read from current group and discard it, use current read to start a new current group + if self.currentGroupKey!="None":#current group exists + keep = self.rmdup() + #logging.debug("Pop out read to keep %s" % keep) + self.currentGroup = [] + self.filteredAlignment += 1 + flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch) + self.updateCLIPinfo(keep,mlen,mis) + #outBAM.write(keep) + if keep.is_reverse: + outBAM_neg.write(keep) + else: + outBAM_pos.write(keep) + self.iniDupGroupInfo(alignment,groupkey,mlen,mis)#make new group using current alignment + else:#there is no rmdup + #logging.debug("Good read, update clip info %s" % read.qname) + self.filteredAlignment+=1 + self.updateCLIPinfo(alignment,mlen,mis) + #outBAM.write(alignment) + if alignment.is_reverse: + outBAM_neg.write(alignment) + else: + outBAM_pos.write(alignment) + #clean up the final dupGroup, if rmdup==0, there is no final dupGroup + if len(self.currentGroup)>0: + keep = self.rmdup() + self.currentGroup = [] + self.filteredAlignment+=1 + flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch) + self.updateCLIPinfo(keep,mlen,mis) + #outBAM.write(alignment) + if keep.is_reverse: + outBAM_neg.write(keep) + else: + outBAM_pos.write(keep) + #clean up current mutation hash + self.printMutationSingleChr(currentChr) + self.mutations = None + #Logging CLIP sample information + #outBAM.close() + outBAM_pos.close() + outBAM_neg.close() + pysam.index(self.outprefix+".pos.filtered.bam") + pysam.index(self.outprefix+".neg.filtered.bam") + self.posfilteredBAM = self.outprefix+".pos.filtered.bam" + self.negfilteredBAM = self.outprefix+".neg.filtered.bam" + self.clusters = self.clusters_plus + self.clusters_minus + self.printClusters() + #Clean up variables + self.originalBAM = None + self.clusters_plus = None + self.clusters_minus = None + self.mutations = None + gc.collect() + logging.debug("After filtering, %d reads left" % (self.filteredAlignment)) + logging.debug("There are %d clusters in total" % (self.clusterCount)) + logging.debug("There are %d mutations in total" % (self.mutationCount)) diff --git a/lib/Enrich.py b/lib/Enrich.py index 04989c4..11bdca6 100755 --- a/lib/Enrich.py +++ b/lib/Enrich.py @@ -24,9 +24,12 @@ import subprocess import OptValidator import datetime +import time import Utils -from memory_profiler import profile +from os import gc +import Alignment +import os OptValidator.opt_validate() gc.enable() @@ -57,16 +60,16 @@ def KMvalue(posmapfile,negmapfile,mufile): km = []#store mutations with updated k value km_pair = {}#Dic of count tuples of (k,m),key:"K_M" count = 0 - #logging.debug("make wig %s" % str(datetime.datetime.now())) + logging.debug("make wig %s" % str(datetime.datetime.now())) poswig = Utils.makeWig(posmapfile) negwig = Utils.makeWig(negmapfile) start_time = datetime.datetime.now() - #logging.debug("finish making wig %s" % str(start_time)) + logging.debug("finish making wig %s" % str(start_time)) for item in mufile: count += 1 if count % 5000 == 0: stop_time = datetime.datetime.now() - #logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) + logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) start_time = stop_time st = [] strand = item.strand @@ -97,61 +100,55 @@ def KMvalue(posmapfile,negmapfile,mufile): gc.collect() return km_pair -@profile -def KMvalue_test(posmapfile,negmapfile,mufile): +def KMvalue_test(clip,mutations,chr,chrlen): ''' Calculate K(coverage) value for each mutation location Mutations are already unique. ''' km = []#store mutations with updated k value - km_pair = {}#Dic of count tuples of (k,m),key:"K_M" count = 0 - currentChr = "" #logging.debug("make wig %s" % str(datetime.datetime.now())) + posBAM = pysam.Samfile(clip.posfilteredBAM,"rb") + negBAM = pysam.Samfile(clip.negfilteredBAM,"rb") start_time = datetime.datetime.now() - posBAM = pysam.Samfile(posmapfile,"rb") - negBAM = pysam.Samfile(negmapfile,"rb") - for item in mufile: + poswig = Utils.makeWigByChr(posBAM,chr) + negwig = Utils.makeWigByChr(negBAM,chr) + stop_time = datetime.datetime.now() + #logging.debug("Finished making wig for %s using %s" % (chr,str(stop_time-start_time))) + start_time = stop_time + for item in mutations: count += 1 - if count % 5000 == 0: + if count % 100000 == 0: stop_time = datetime.datetime.now() - #logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) + logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) start_time = stop_time st = [] - strand = item.strand M = item.score K = 0 - if item.chr != currentChr: - poswig = Utils.makeWigListByChr(posBAM,item.chr) - negwig = Utils.makeWigListByChr(negBAM,item.chr) - currentChr = item.chr - #logging.debug("Time begin to pileup is %s" % (str(datetime.datetime.now()))) + strand = item.strand if strand == "+": try: - K = poswig.valueByPos(item.start) + K = poswig[item.start] except: continue elif strand == "-": try: - K = negwig.valueByPos(item.start) + K = negwig[item.start] except: - contiune - - if K>=M: - item.updateK(K) + continue + if K>=M: + item.updateK(K) - pair_name = str(K)+"_"+str(M) - if km_pair.has_key(pair_name): - km_pair[pair_name] += 1 - else: - km_pair[pair_name] = 1 - #km.append(item) + pair_name = str(K)+"_"+str(M) + if clip.kmpair.has_key(pair_name): + clip.kmpair[pair_name] += 1 + else: + clip.kmpair[pair_name] = 1 posBAM.close() negBAM.close() del posBAM del negBAM gc.collect() - return km_pair def KMvalue_ignore(posmapfile,negmapfile,mufile): @@ -204,28 +201,40 @@ def uniq(b): #b is a list uniqElements.sort() return uniqElements -#@profile def mutationEnrich(clip,threshold=0.01): coverage = clip.coverage *1.0 totalMuCount = clip.mutationCount - #(original_KM,KM_test) = KMvalue(clip.originalBAM, clip.mutations) - KM_test = KMvalue_test(clip.posfilteredBAM,clip.negfilteredBAM, clip.mutations)#check after doing KM, if clip.mutations changed - clip.posfilteredBAM = None - clip.negfilteredBAM = None - #logging.info("Finished K-M counting, starting fitting.") + mutations = [] + total_test = 0 + for chr,chrlen in clip.refInfo: + try: + mufile = open(clip.outprefix+"."+chr+".mutations.bed") + except: + logging.info("Cannot open mutation file %s , move on." % (clip.outprefix+"."+chr+".mutations.bed")) + continue + for record in mufile: + total_test += 1 + info = record.rstrip().split("\t") + new_mu = Alignment.MutationBed(info[0],int(info[1]),int(info[2]),info[3],int(info[4]),info[5],info[6]) + mutations.append(new_mu) + os.remove(clip.outprefix+"."+chr+".mutations.bed") + KM_test = KMvalue_test(clip,mutations,chr,chrlen)#check after doing KM, if clip.mutations changed + del clip.posfilteredBAM + del clip.negfilteredBAM + gc.collect()#logging.info("Finished K-M counting, starting fitting.") + R = robject.r reliableList = [] P = totalMuCount/coverage km_p = {}#store km and corresponding p value pvalues = [] - for k in KM_test: + for k in clip.kmpair:#KM_test: parameters = k.split("_") p = R.pbinom(int(parameters[1])-1,int(parameters[0]),P,False)[0] pvalues.append(p) km_p[k]=p pCount = dict(Counter(pvalues)) pRank = freqRank(pCount,True) - total_test = len(clip.mutations) pqDic={} for i in pRank.keys(): try: @@ -236,7 +245,7 @@ def mutationEnrich(clip,threshold=0.01): print >> sys.stderr,"Cannot find p value in dictionary" continue count = 0 - for mu in clip.mutations:#.values(): + for mu in mutations: name = str(mu.kvalue)+"_"+str(mu.score) try: mu.pvalue = km_p[name] @@ -249,34 +258,27 @@ def mutationEnrich(clip,threshold=0.01): new_mutationName = "Mutation_"+str(count) mu.name = new_mutationName mu.sig = True - clip.sigMutationCount += 1 + clip.sigMutationCount+=1 clip.addSigToDic(clip.sigMutations,mu) - clip.mutations = None - #logging.info("There are %d reliable mutations" % clip.sigMutationCount) - + + mutations = None + gc.collect() def clusterEnrich(clip,threshold=0.01): - #write temp file - temp_filename = clip.outprefix+".merge"#clip.filepath.split("/")[-1].split(".")[0] - fh = open(temp_filename,"w") - for i in clip.clusters: - print >> fh,i - fh.close() + cluster_filename = clip.outprefix+".clusters.bed"#clip.filepath.split("/")[-1].split(".")[0] #Call R code and get result epsilon = [0.01,0.15,0.1] step = [0.1,0.08,0.05] for index in range(len(epsilon)): e = epsilon[index] s = step[index] - r_args = ['Rscript','lib/ZTNB_tryCatch.R',temp_filename,str(threshold),str(e),str(s)] + r_args = ['Rscript','lib/ZTNB_tryCatch.R',cluster_filename,str(threshold),str(e),str(s)] + gc.collect() p = subprocess.Popen(r_args) stdout_value = p.communicate()[0] - #output = subprocess.check_output['ls','-l','test.merge.ztnb'] - #output_log = subprocess.check_output['ls','-l','test.merge.ztnblog'] - #If regression converged, there is no need to try other epsilon or step,check log file flag: Y means coverged, N means not converged try: - r_output_log = open(temp_filename+".pipeclip.ztnblog","r") + r_output_log = open(cluster_filename+".pipeclip.ztnblog","r") #logging.debug("Log file opened") flag = r_output_log.read(1) if flag == "Y":#converged @@ -288,10 +290,8 @@ def clusterEnrich(clip,threshold=0.01): continue #check ztnb file - #r_output = subprocess.check_output(['ls','-l',temp_filename+'.ztnb']) - #if int(r_output.split()[4])>100: #more than header,file OK try: - enrich_parameter = open(temp_filename+".pipeclip.ztnb","r") + enrich_parameter = open(cluster_filename+".pipeclip.ztnb","r") except IOError,message: logging.error("Cannot open ztnb result file") return False @@ -316,7 +316,6 @@ def clusterEnrich(clip,threshold=0.01): clip.clusters[i].qvalue = nbDic[r_key][1] clip.clusters[i].sig = True clip.sigClusterCount += 1 - #clip.addSigToDic(clip.sigClusters,clip.clusters[i]) nbDic = None if clip.sigClusterCount == 0: return False @@ -326,8 +325,6 @@ def clusterEnrich(clip,threshold=0.01): def fisherTest(clusterp,mutationp): R = robject.r min_mp = min(mutationp) - #logging.debug("clusterP %f,%s" % (clusterp, type(clusterp))) - #logging.debug("mutationP %f,%s" % (min_mp, type(min_mp))) product = clusterp * min_mp if product == 0: fps = 0 diff --git a/lib/Utils.py b/lib/Utils.py index 2471d1c..fc3887d 100644 --- a/lib/Utils.py +++ b/lib/Utils.py @@ -88,7 +88,7 @@ def makeWig(bamfile): def makeWigByChr(bamfile,chr): wig = {} for pileupColumn in bamfile.pileup(chr): - wig[str(pileupColumn.pos)]=pileupColumn.n + wig[pileupColumn.pos]=pileupColumn.n return wig @@ -98,6 +98,15 @@ def makeWigListByChr(bamfile,chr): wig.update(pileupColumn.pos,pileupColumn.n) return wig + +def makeWigListByChr_array(bamfile,chr,chrlen): + wig = [] + for i in range(chrlen): + wig.append(0) + for pileupColumn in bamfile.pileup(chr): + wig[pileupColumn.pos]=pileupColumn.n + return wig + def bisort(alist,b): '''List is a list of bed instances, b is also an instance''' if isinstance(alist[0],Alignment.BED) and isinstance(b,Alignment.BED): diff --git a/lib/ZTNB_tryCatch.R b/lib/ZTNB_tryCatch.R index f890599..e4cae7f 100755 --- a/lib/ZTNB_tryCatch.R +++ b/lib/ZTNB_tryCatch.R @@ -116,11 +116,7 @@ for (i in intercept1) { for (j in intercept2) { - #print(paste("Coef start",i,j)); - #print(paste("Recent likelihood",biggest_likelihood)) - #print(paste("NB parameters",khat,muhat)) nb<-tryCatch({vglm(tread_fit ~ 1, posnegbinomial(), weight = twts_fit, maxit = 200, trace = FALSE, step = vglm_step,offset = logmu,epsilon=vglm_epsilon,silent=FALSE,coefstart=c(i,j))},warning = function(w){ - #print(paste("typeof warning",typeof(w["message"]))) warnmsg = strsplit(toString(w["message"])," iteration")[[1]][1]; if(grepl("convergence not obtained",warnmsg)) { diff --git a/pipeclip.py b/pipeclip.py index be700b9..6497592 100755 --- a/pipeclip.py +++ b/pipeclip.py @@ -34,10 +34,10 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste logging.info("Input file OK,start to run PIPE-CLIP") logging.info("Species info %s" % species) if myClip.readfile(): - myClip.filter(matchLength,mismatch,clipType,rmdup) + myClip.filter2(matchLength,mismatch,clipType,rmdup) #myClip.printClusters() - # myClip.printMutations() -#BC# if len(myClip.clusters)>0: + #myClip.printMutations() +#BC# if myClip.clusterCount>0: #BC# logging.info("Get enriched clusters") #BC# status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster) #BC# if status: @@ -50,7 +50,7 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste #BC# logging.error("There is no clusters found. Please check input.Exit program.") #BC# sys.exit(1) - if len(myClip.mutations)>0: + if myClip.mutationCount>0: logging.info("Get reliable mutations") Enrich.mutationEnrich(myClip,fdrReliableMutation) logging.info("There are %d reliable mutations" % myClip.sigMutationCount) From e700ab421cbcd33883ced12624ab6ef1b61c92d2 Mon Sep 17 00:00:00 2001 From: Beibei Chen Qbri Date: Sun, 2 Nov 2014 21:24:31 -0600 Subject: [PATCH 6/6] Add file cleaning --- lib/Enrich.py | 61 ++++++++++------------------------- pipeclip.py | 88 +++++++++++++++++++++++++-------------------------- 2 files changed, 60 insertions(+), 89 deletions(-) diff --git a/lib/Enrich.py b/lib/Enrich.py index 11bdca6..ce4ea79 100755 --- a/lib/Enrich.py +++ b/lib/Enrich.py @@ -26,7 +26,7 @@ import datetime import time import Utils -from os +import os import gc import Alignment import os @@ -150,49 +150,6 @@ def KMvalue_test(clip,mutations,chr,chrlen): del negBAM gc.collect() - -def KMvalue_ignore(posmapfile,negmapfile,mufile): - ''' - Calculate K(coverage) value for each mutation location - Mutations are already unique. - ''' - km = []#store mutations with updated k value - km_pair = {}#Dic of count tuples of (k,m),key:"K_M" - count = 0 - start_time = datetime.datetime.now() - for item in mufile: - count += 1 - if count % 5000 == 0: - stop_time = datetime.datetime.now() - logging.info("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time))) - start_time = stop_time - st = [] - strand = item.strand - M = item.score - K = 0 - logging.debug("Time begin to pileup is %s, mutation %d" % (str(datetime.datetime.now()),count)) - if strand == "+": - for pileupColumn in posmapfile.pileup(item.chr,int(item.start),int(item.stop)): - if pileupColumn == int(item.start): - K = pileupColumn.n - break - elif strand == "-": - for pileupColumn in negmapfile.pileup(item.chr,int(item.start),int(item.stop)): - if pileupColumn == int(item.start): - K = pileupColumn.n - break - if K>=M: - item.updateK(K) - #logging.debug("K value for item %s is %d" % (item, K)) - pair_name = str(K)+"_"+str(M) - if km_pair.has_key(pair_name): - km_pair[pair_name] += 1 - else: - km_pair[pair_name] = 1 - #km.append(item) - return km_pair - - def uniq(b): #b is a list uniqElements = [] for i in b: @@ -217,8 +174,18 @@ def mutationEnrich(clip,threshold=0.01): info = record.rstrip().split("\t") new_mu = Alignment.MutationBed(info[0],int(info[1]),int(info[2]),info[3],int(info[4]),info[5],info[6]) mutations.append(new_mu) - os.remove(clip.outprefix+"."+chr+".mutations.bed") + try: + os.remove(clip.outprefix+"."+chr+".mutations.bed") + except: + pass KM_test = KMvalue_test(clip,mutations,chr,chrlen)#check after doing KM, if clip.mutations changed + try: + os.remove(clip.posfilteredBAM) + os.remove(clip.negfilteredBAM) + os.remove(clip.posfilteredBAM+".bai") + os.remove(clip.negfilteredBAM+".bai") + except: + pass del clip.posfilteredBAM del clip.negfilteredBAM gc.collect()#logging.info("Finished K-M counting, starting fitting.") @@ -317,6 +284,10 @@ def clusterEnrich(clip,threshold=0.01): clip.clusters[i].sig = True clip.sigClusterCount += 1 nbDic = None + try: + os.remove(cluster_filename) + except: + pass if clip.sigClusterCount == 0: return False else: diff --git a/pipeclip.py b/pipeclip.py index 6497592..e193b70 100755 --- a/pipeclip.py +++ b/pipeclip.py @@ -37,18 +37,18 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste myClip.filter2(matchLength,mismatch,clipType,rmdup) #myClip.printClusters() #myClip.printMutations() -#BC# if myClip.clusterCount>0: -#BC# logging.info("Get enriched clusters") -#BC# status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster) -#BC# if status: -#BC# logging.info("Found %d enriched clusters" % myClip.sigClusterCount) -#BC# myClip.printEnrichedClusters() -#BC# else: -#BC# logging.error("There is no enriched cluster found. Exit program") -#BC# sys.exit(1) -#BC# else: -#BC# logging.error("There is no clusters found. Please check input.Exit program.") -#BC# sys.exit(1) + if myClip.clusterCount>0: + logging.info("Get enriched clusters") + status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster) + if status: + logging.info("Found %d enriched clusters" % myClip.sigClusterCount) + myClip.printEnrichedClusters() + else: + logging.error("There is no enriched cluster found. Exit program") + sys.exit(1) + else: + logging.error("There is no clusters found. Please check input.Exit program.") + sys.exit(1) if myClip.mutationCount>0: logging.info("Get reliable mutations") @@ -58,38 +58,38 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste else: logging.warning("There is no mutation found in this BAM file.") #Start to get crosslinking sites -#BC# if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0: -#BC# logging.info("Get cross-linking sites") -#BC# myClip.getCrosslinking() -#BC# if (len(myClip.crosslinking.keys())>0): -#BC# outfilelist = myClip.printCrosslinkingSites() -#BC# myClip.printCrosslinkingMutations() -#BC# else: -#BC# logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.") -#BC# outfilelist = myClip.printEnrichClusters() -#BC# else: -#BC# if myClip.sigClusterCount <= 0: -#BC# logging.error("There is no enriched clusters for this sample, please check your input file. Exit.") -#BC# sys.exit(2) -#BC# elif myClip.sigMutationCount <=0: -#BC# logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.") -#BC# outfilelist = myClip.printEnrichClusters() -#BC# #annotation if possible -#BC# if species in ["mm10","mm9","hg19"]: -#BC# logging.info("Started to annotate cross-linking sits using HOMER") -#BC# for name in outfilelist: -#BC# #logging.debug("Start to do annotation for %s" % name) -#BC# Utils.annotation(name,species) -#BC# #output a status log file -#BC# logfile = open(outputPrefix+".pipeclip.summary.log","w") -#BC# print >>logfile,"PIPE-CLIP run finished. Parameters are:" -#BC# print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation) -#BC# print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment) -#BC# print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters)) -#BC# print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount) -#BC# print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations)) -#BC# logfile.close() -#BC# logging.info("PIPE-CLIP finished the job, please check your results. :)") + if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0: + logging.info("Get cross-linking sites") + myClip.getCrosslinking() + if (len(myClip.crosslinking.keys())>0): + outfilelist = myClip.printCrosslinkingSites() + myClip.printCrosslinkingMutations() + else: + logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.") + outfilelist = myClip.printEnrichClusters() + else: + if myClip.sigClusterCount <= 0: + logging.error("There is no enriched clusters for this sample, please check your input file. Exit.") + sys.exit(2) + elif myClip.sigMutationCount <=0: + logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.") + outfilelist = myClip.printEnrichClusters() + #annotation if possible + if species in ["mm10","mm9","hg19"]: + logging.info("Started to annotate cross-linking sits using HOMER") + for name in outfilelist: + #logging.debug("Start to do annotation for %s" % name) + Utils.annotation(name,species) + #output a status log file + logfile = open(outputPrefix+".pipeclip.summary.log","w") + print >>logfile,"PIPE-CLIP run finished. Parameters are:" + print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation) + print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment) + print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters)) + print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount) + print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations)) + logfile.close() + logging.info("PIPE-CLIP finished the job, please check your results. :)") else: logging.error("File corruputed, program exit.") sys.exit(0)