From 62e5f301f6aa5339b91a95ef8f6136f4290ef40d Mon Sep 17 00:00:00 2001
From: Beibei Chen  Qbri <beibei.chen@utsouthwestern.edu>
Date: Mon, 6 Oct 2014 01:26:15 -0500
Subject: [PATCH 1/6] Deal with reads without MD flag

---
 lib/CLIP.py      |  1 +
 lib/Mutation2.py | 67 +++++++++++++++++++++++++-----------------------
 2 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/lib/CLIP.py b/lib/CLIP.py
index 9c9a646..4d32ee4 100644
--- a/lib/CLIP.py
+++ b/lib/CLIP.py
@@ -247,6 +247,7 @@ def updateMutation(self,read,mis):
 			elif self.type ==2:
 				mutation_filter = Utils.filterMutations(mutations,"G->A",True)
 				mutations = mutation_filter
+		#logging.debug("read %s " % read)
 		if len(mutations)>0:
 			for m in mutations:
 				#print m
diff --git a/lib/Mutation2.py b/lib/Mutation2.py
index 08a80d8..ee29424 100644
--- a/lib/Mutation2.py
+++ b/lib/Mutation2.py
@@ -47,12 +47,14 @@ def countMismatch(b):
 	for i in myList:
 		if i[0]=="NM":
 			return (i[1])
+	return 0
 
 def survey(entry):
+	#logging.debug(entry.cigar)
 	mismatchNumber = countMismatch(entry.tags)
 	insertionNumber = countInsertionNumber(entry.cigar)
 	deletionNumber = countDeletionNumber(entry.cigar)
-	substitutionNumber = mismatchNumber-insertionNumber-deletionNumber
+	substitutionNumber = max(0,mismatchNumber-deletionNumber)
 	#print insertionNumber,deletionNumber
 	return ([insertionNumber,deletionNumber,substitutionNumber])
 
@@ -92,6 +94,7 @@ def parseCIGAR(ci): #calculate match segment length list
 	return matchSeg	
 
 def parseMD(b):
+	buf = []
 	myList = b
 	for j in myList:
 		if j[0]=="MD":
@@ -242,37 +245,37 @@ def getMutations(infile,read):
 	#logging.debug("call getMutation function %s" % read)
 	mutationList = []
 	b=read.tags
-	if countMismatch(b)>0: #and countMismatch(b)<2 and countMatchNumber(item.cigar)>=20:
-		#tmp.write(item)
-		sur = survey(read)
-		insertion = sur[0]
-		deletion = sur[1]
-		substi = sur[2]
-		insertionSeqLoc = []
-		if insertion > 0:
-			insertionDic = insertionLocation(read,insertion)
-			for k in insertionDic.keys():
-				for loc_index in range(len(insertionDic[k])):
-					insertionSeqLoc.append(insertionDic[k][loc_index])
-					mu = read.seq[insertionDic[k][loc_index]]
-					loc = k+loc_index+read.pos
-					if read.tid >=0:
-						chr = infile.getrname(read.tid)
-					if read.is_reverse:
-						strand = '-'
-						mu = RC([mu])[0]
-					else:
-						strand = "+"
-					mutationList.append(MutationBed(chr,loc,loc+1,read.qname,1,strand,"Insertion->"+mu))#change insertionDic[k][loc_index] to 1
-			insertionSeqLoc.sort()
-		if deletion + substi > 0:
-			for mu in mutationLocation(read,insertionSeqLoc):
-				if read.tid>=0:
-					chr  = infile.getrname(read.tid)
-					newMu = MutationBed(chr,mu[0],mu[1],mu[2],mu[3],mu[4],mu[5])
-					mutationList.append(newMu)
-		#logging.debug(mutationList)
-		return mutationList
+	
+	sur = survey(read)
+	insertion = sur[0]
+	deletion = sur[1]
+	substi = sur[2]
+	#logging.debug("insertio:,%d, deletion:%d,sub:%d" % (insertion,deletion,substi))
+	insertionSeqLoc = []
+	if insertion > 0:
+		insertionDic = insertionLocation(read,insertion)
+		for k in insertionDic.keys():
+			for loc_index in range(len(insertionDic[k])):
+				insertionSeqLoc.append(insertionDic[k][loc_index])
+				mu = read.seq[insertionDic[k][loc_index]]
+				loc = k+loc_index+read.pos
+				if read.tid >=0:
+					chr = infile.getrname(read.tid)
+				if read.is_reverse:
+					strand = '-'
+					mu = RC([mu])[0]
+				else:
+					strand = "+"
+				mutationList.append(MutationBed(chr,loc,loc+1,read.qname,1,strand,"Insertion->"+mu))#change insertionDic[k][loc_index] to 1
+		insertionSeqLoc.sort()
+	if deletion + substi > 0:
+		for mu in mutationLocation(read,insertionSeqLoc):
+			if read.tid>=0:
+				chr  = infile.getrname(read.tid)
+				newMu = MutationBed(chr,mu[0],mu[1],mu[2],mu[3],mu[4],mu[5])
+				mutationList.append(newMu)
+	#logging.debug(mutationList)
+	return mutationList
 
 
 def getTruncations(infile,read):

From 74fc1affcbae2fa9dd19386e6b78ebc2e58b4282 Mon Sep 17 00:00:00 2001
From: Beibei Chen  Qbri <beibei.chen@utsouthwestern.edu>
Date: Mon, 6 Oct 2014 16:39:23 -0500
Subject: [PATCH 2/6] Separate plus and minus clusters

---
 lib/CLIP.py | 53 ++++++++++++++++++++++++++++++++++++-----------------
 pipeclip.py |  2 +-
 2 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/lib/CLIP.py b/lib/CLIP.py
index 4d32ee4..3c0b875 100644
--- a/lib/CLIP.py
+++ b/lib/CLIP.py
@@ -29,15 +29,17 @@ def __init__(self,fileaddr,prefix):
 		self.currentGroupKey = "None"
 		self.currentGroup = [] #Used in rmdup
 		self.previousQul = [0,0,0]#for rmdup,[matchlen,mapq,mismatch]
-		self.clusters = [] 
-		self.currentCluster = Alignment.BED("",0,0,"",0,".")
+		self.clusters_plus = []
+		self.clusters_minus = []
+		self.clusters = []
+		self.currentCluster_plus = Alignment.BED("",0,0,"",0,"+")
+		self.currentCluster_minus = Alignment.BED("",0,0,"",0,"-")
 		#self.sigClusters = {}
 		self.mutations = {} #Dictionary of bed instance
 		self.mutationCount = 0
 		self.sigMutations = {}#same as sigClusters
 		self.sigMutationCount = 0
 		self.sigClusterCount = 0
-		self.wig = None
 		self.coverage = 0 #"reads coverage of this sample"
 		self.bamheader = None
 		self.crosslinking = {}
@@ -100,9 +102,9 @@ def readfile(self):
 #		for i in self.filteredAlignment:
 #			print i
 	
-#	def printClusters(self):
-#		for i in self.clusters:
-#			print i
+	def printClusters(self):
+		for i in self.clusters:
+			print i
 
 	def printMutations(self):
 		for i in self.mutations.values():
@@ -221,19 +223,33 @@ def rmdup(self):
 	def updateCluster(self,read):
 		'''Cluster new read to known clusters and update cluster reads count'''
 		strandDic = {"True":"-","False":"+"}
-		clusterName = "cluster"+"_"+str(len(self.clusters)+1)
+		clusterName = "cluster"+"_"+str(len(self.clusters_plus)+len(self.clusters_minus)+1)
 		newRead = Alignment.ClusterBed(self.originalBAM.getrname(read.tid),read.pos,read.pos+len(read.seq),clusterName,1,strandDic[str(read.is_reverse)])
-		if self.currentCluster.chr == "": #Initiate cluster
-			self.currentCluster = newRead
-			self.clusters.append(self.currentCluster)
+		if read.is_reverse:
+			if self.currentCluster_minus.chr == "": #Initiate cluster
+				self.currentCluster_minus = newRead
+				self.clusters.append(self.currentCluster_minus)
+			else:
+				if self.currentCluster_minus.overlap(newRead):
+					self.currentCluster_minus.merge(newRead)
+					self.clusters_minus[-1]=self.currentCluster_minus
+				else:#New read is a new cluster
+					#self.clusters.append(self.currentCluster)
+					self.currentCluster_minus = newRead
+					self.clusters_minus.append(self.currentCluster_minus)
 		else:
-			if self.currentCluster.overlap(newRead):
-				self.currentCluster.merge(newRead)
-				self.clusters[-1]=self.currentCluster
-			else:#New read is a new cluster
-				#self.clusters.append(self.currentCluster)
-				self.currentCluster = newRead
-				self.clusters.append(self.currentCluster)
+			if self.currentCluster_plus.chr == "": #Initiate cluster
+				self.currentCluster_plus = newRead
+				self.clusters_plus.append(self.currentCluster_plus)
+			else:
+				if self.currentCluster_plus.overlap(newRead):
+					self.currentCluster_plus.merge(newRead)
+					self.clusters_plus[-1]=self.currentCluster_plus
+				else:#New read is a new cluster
+					#self.clusters.append(self.currentCluster)
+					self.currentCluster_plus = newRead
+					self.clusters_plus.append(self.currentCluster_plus)
+
 	
 	def updateMutation(self,read,mis):
 		mutations = []
@@ -393,6 +409,9 @@ def filter(self,matchLen,mismatch,cliptype,duprm):
 		self.posfilteredBAM = pysam.Samfile(self.outprefix+".pos.filtered.bam","rb")# move file pointer to the file head
 		self.negfilteredBAM = pysam.Samfile(self.outprefix+".neg.filtered.bam","rb")# move file pointer to the file head
 		self.originalBAM = None 
+		self.clusters = self.clusters_plus + self.clusters_minus
+		self.clusters_plus = None
+		self.clusters_minus = None
 		logging.debug("After filtering, %d reads left" % (self.filteredAlignment))
 		logging.debug("There are %d clusters in total" % (len(self.clusters)))
 		logging.debug("There are %d mutations in total" % (len(self.mutations)))
diff --git a/pipeclip.py b/pipeclip.py
index 59a4255..09dd875 100755
--- a/pipeclip.py
+++ b/pipeclip.py
@@ -3,7 +3,6 @@
 #Refactored by: eric.roos@utsouthwestern.edu
 #Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species
 #Required packages: pysam, ghmm, pybedtools
-#Last modification: 18 Sep 2014
 
 
 import sys
@@ -36,6 +35,7 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 		logging.info("Species info %s" % species)
 		if myClip.readfile():
 			myClip.filter(matchLength,mismatch,clipType,rmdup)
+			#myClip.printClusters()
 			#myClip.printMutations()
 			if len(myClip.clusters)>0:
 				logging.info("Get enriched clusters")

From 98b74e66121c0fdf9f9e265eb854f081f8edc1d5 Mon Sep 17 00:00:00 2001
From: Beibei Chen  Qbri <beibei.chen@utsouthwestern.edu>
Date: Wed, 8 Oct 2014 10:20:45 -0500
Subject: [PATCH 3/6] reduce memory, mutation dic to array

---
 lib/Alignment.py |  21 +++++----
 lib/CLIP.py      |  14 +++---
 lib/Enrich.py    | 110 +++++++++++++++++++++++++++++------------------
 lib/Utils.py     |  12 ++++++
 pipeclip.py      |  92 +++++++++++++++++++--------------------
 5 files changed, 147 insertions(+), 102 deletions(-)

diff --git a/lib/Alignment.py b/lib/Alignment.py
index ea9c3ca..4b1336f 100644
--- a/lib/Alignment.py
+++ b/lib/Alignment.py
@@ -79,15 +79,18 @@ def fishertest(self):
 		self.fisherP = fp
 
 
+class wiglist:
+	def __init__(self):
+		self.pos = []
+		self.value = []
 
-class BAM:
-	def __init__(self,filepath):
-		self.filePath = filePath
-		self.header = None
-		self.sorted = None
-
-	def checkHeader(self):
-		'''Use gzip package to check if this is a bam or sam'''
-		pass
+	def valueByPos(self,p):
+		try:
+			return self.value[self.pos.index(p)]
+		except:
+			return False
 
+	def update(self,p,v):
+		self.pos.append(p)
+		self.value.append(v)
 
diff --git a/lib/CLIP.py b/lib/CLIP.py
index 3c0b875..6576e39 100644
--- a/lib/CLIP.py
+++ b/lib/CLIP.py
@@ -12,7 +12,9 @@
 import OptValidator
 import subprocess
 import datetime
+import gc
 
+gc.enable()
 OptValidator.opt_validate()
 
 
@@ -406,12 +408,14 @@ def filter(self,matchLen,mismatch,cliptype,duprm):
 		pysam.index(self.outprefix+".neg.filtered.bam")
 		#self.filteredBAM = pysam.Samfile(outprefix+".filtered.bam","rb")# move file pointer to the file head
 		
-		self.posfilteredBAM = pysam.Samfile(self.outprefix+".pos.filtered.bam","rb")# move file pointer to the file head
-		self.negfilteredBAM = pysam.Samfile(self.outprefix+".neg.filtered.bam","rb")# move file pointer to the file head
-		self.originalBAM = None 
+		self.posfilteredBAM = self.outprefix+".pos.filtered.bam"
+		self.negfilteredBAM = self.outprefix+".neg.filtered.bam"
+		del self.originalBAM  
 		self.clusters = self.clusters_plus + self.clusters_minus
-		self.clusters_plus = None
-		self.clusters_minus = None
+		del self.clusters_plus
+		del self.clusters_minus
+		self.mutations = self.mutations.values()
+		gc.collect()
 		logging.debug("After filtering, %d reads left" % (self.filteredAlignment))
 		logging.debug("There are %d clusters in total" % (len(self.clusters)))
 		logging.debug("There are %d mutations in total" % (len(self.mutations)))
diff --git a/lib/Enrich.py b/lib/Enrich.py
index 51422a9..04989c4 100755
--- a/lib/Enrich.py
+++ b/lib/Enrich.py
@@ -25,8 +25,11 @@
 import OptValidator
 import datetime
 import Utils
+from memory_profiler import profile
+import gc
 
 OptValidator.opt_validate()
+gc.enable()
 
 stats = importr('stats')
 
@@ -45,6 +48,7 @@ def BH(pvalue,pRank,N):
 	qva = max(pvalue, q)
 	return qva
 
+#@profile
 def KMvalue(posmapfile,negmapfile,mufile):
 		'''
 		Calculate K(coverage) value for each mutation location
@@ -90,9 +94,11 @@ def KMvalue(posmapfile,negmapfile,mufile):
 				else:
 					km_pair[pair_name] = 1
 				#km.append(item)
+		gc.collect()
 		return km_pair
 
-def KMvalue_ignore(posmapfile,negmapfile,mufile):
+@profile
+def KMvalue_test(posmapfile,negmapfile,mufile):
 		'''
 		Calculate K(coverage) value for each mutation location
 		Mutations are already unique.
@@ -100,7 +106,62 @@ def KMvalue_ignore(posmapfile,negmapfile,mufile):
 		km = []#store mutations with updated k value
 		km_pair = {}#Dic of count tuples of (k,m),key:"K_M"
 		count = 0
+		currentChr = ""
+		#logging.debug("make wig %s" % str(datetime.datetime.now()))
+		start_time = datetime.datetime.now()
+		posBAM = pysam.Samfile(posmapfile,"rb")
+		negBAM = pysam.Samfile(negmapfile,"rb")
+		for item in mufile:
+			count += 1
+			if count % 5000 == 0:
+				stop_time = datetime.datetime.now()
+				#logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time)))
+				start_time = stop_time
+			st = []
+			strand = item.strand 
+			M = item.score
+			K = 0
+			if item.chr != currentChr:
+				poswig = Utils.makeWigListByChr(posBAM,item.chr)
+				negwig = Utils.makeWigListByChr(negBAM,item.chr)
+				currentChr = item.chr
+			#logging.debug("Time begin to pileup is %s" % (str(datetime.datetime.now())))
+			if strand == "+":
+				try:
+					K = poswig.valueByPos(item.start)
+				except:
+					continue
+			elif strand == "-":
+				try:
+					K = negwig.valueByPos(item.start)
+				except:
+					contiune
+
+			if K>=M:
+				item.updateK(K)
 
+				pair_name = str(K)+"_"+str(M)
+				if km_pair.has_key(pair_name):
+					km_pair[pair_name] += 1
+				else:
+					km_pair[pair_name] = 1
+				#km.append(item)
+		posBAM.close()
+		negBAM.close()
+		del posBAM
+		del negBAM
+		gc.collect()
+		return km_pair
+
+
+def KMvalue_ignore(posmapfile,negmapfile,mufile):
+		'''
+		Calculate K(coverage) value for each mutation location
+		Mutations are already unique.
+		'''
+		km = []#store mutations with updated k value
+		km_pair = {}#Dic of count tuples of (k,m),key:"K_M"
+		count = 0
 		start_time = datetime.datetime.now()
 		for item in mufile:
 			count += 1
@@ -143,12 +204,12 @@ def uniq(b): #b is a list
 	uniqElements.sort()
 	return uniqElements
 
-
+#@profile
 def mutationEnrich(clip,threshold=0.01):
 	coverage = clip.coverage *1.0
 	totalMuCount = clip.mutationCount
 	#(original_KM,KM_test) = KMvalue(clip.originalBAM, clip.mutations)
-	KM_test = KMvalue(clip.posfilteredBAM,clip.negfilteredBAM, clip.mutations.values())#check after doing KM, if clip.mutations changed
+	KM_test = KMvalue_test(clip.posfilteredBAM,clip.negfilteredBAM, clip.mutations)#check after doing KM, if clip.mutations changed
 	clip.posfilteredBAM = None
 	clip.negfilteredBAM = None
 	#logging.info("Finished K-M counting, starting fitting.")
@@ -164,7 +225,7 @@ def mutationEnrich(clip,threshold=0.01):
 		km_p[k]=p
 	pCount = dict(Counter(pvalues))
 	pRank = freqRank(pCount,True)
-	total_test = len(clip.mutations.keys())
+	total_test = len(clip.mutations)
 	pqDic={}
 	for i in pRank.keys():
 		try:
@@ -175,7 +236,7 @@ def mutationEnrich(clip,threshold=0.01):
 			print >> sys.stderr,"Cannot find p value in dictionary"
 			continue
 	count = 0
-	for mu in clip.mutations.values():
+	for mu in clip.mutations:#.values():
 		name = str(mu.kvalue)+"_"+str(mu.score)
 		try:
 			mu.pvalue = km_p[name]
@@ -190,6 +251,7 @@ def mutationEnrich(clip,threshold=0.01):
 			mu.sig = True
 			clip.sigMutationCount += 1
 			clip.addSigToDic(clip.sigMutations,mu)
+	clip.mutations = None
 	#logging.info("There are %d reliable mutations" % clip.sigMutationCount)
 
 
@@ -255,6 +317,7 @@ def clusterEnrich(clip,threshold=0.01):
 				clip.clusters[i].sig = True
 				clip.sigClusterCount += 1
 			#clip.addSigToDic(clip.sigClusters,clip.clusters[i])
+		nbDic = None
 		if clip.sigClusterCount == 0:
 			return False
 		else:
@@ -275,40 +338,3 @@ def fisherTest(clusterp,mutationp):
 	return fps
 
 
-def mutationEnrich_ignore(clip):
-		coverage = clip.coverage
-		if self.isPar: #input is a par,no need to split the file
-			filename = self.outputRoot+".bed"
-			outputfile = open(filename,"wa")
-			print >> outputfile,"#chr\tstart\tend\tname\tp\tstrand\ttype\tk\tm"
-			for reliable_mu in self.muEvaluate(self.bamFile,self.mutationFile,coverage,self.fdr):
-				print >>outputfile,'\t'.join(reliable_mu)
-		else: #splitfile to insertion, deletion, substitution
-			insertion = []
-			deletion = []
-			substitution = []
-			for item in self.mutationFile:
-				if item[-1].count("Deletion")>0:
-					deletion.append(item)
-				elif item[-1].count("Insertion")>0:
-					insertion.append(item)
-				else:
-					substitution.append(item)
-			del_name = self.outputRoot+"_deletion.bed"
-			ins_name = self.outputRoot+"_insertion.bed"
-			sub_name = self.outputRoot+"_substitution.bed"
-
-			outfile_del = open(del_name,"wa")
-			outfile_ins = open(ins_name,"wa")
-			outfile_sub = open(sub_name,"wa")
-			print >> outfile_ins,"#chr\tstart\tend\tname\t-log(q)\tstrand\ttype\tk\tm"
-			print >> outfile_del,"#chr\tstart\tend\tname\t-log(q)\tstrand\ttype\tk\tm"
-			print >> outfile_sub,"#chr\tstart\tend\tname\t-log(q)\tstrand\ttype\tk\tm"
-			for reliable_mu in self.muEvaluate(self.bamFile,insertion,coverage,self.fdr):
-				print >> outfile_ins,'\t'.join(reliable_mu)
-			for reliable_mu in self.muEvaluate(self.bamFile,deletion,coverage,self.fdr):
-				print >> outfile_del,'\t'.join(reliable_mu)
-			for reliable_mu in self.muEvaluate(self.bamFile,substitution,coverage,self.fdr):
-				print >> outfile_sub,'\t'.join(reliable_mu)
-
-
diff --git a/lib/Utils.py b/lib/Utils.py
index e87da64..2471d1c 100644
--- a/lib/Utils.py
+++ b/lib/Utils.py
@@ -85,6 +85,18 @@ def makeWig(bamfile):
 	return wig
 	
 
+def makeWigByChr(bamfile,chr):
+	wig = {}
+	for pileupColumn in bamfile.pileup(chr):
+			wig[str(pileupColumn.pos)]=pileupColumn.n
+	return wig
+
+
+def makeWigListByChr(bamfile,chr):
+	wig = Alignment.wiglist()
+	for pileupColumn in bamfile.pileup(chr):
+		wig.update(pileupColumn.pos,pileupColumn.n)
+	return wig
 
 def bisort(alist,b):
 	'''List is a list of bed instances, b is also an instance'''
diff --git a/pipeclip.py b/pipeclip.py
index 09dd875..be700b9 100755
--- a/pipeclip.py
+++ b/pipeclip.py
@@ -36,21 +36,21 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 		if myClip.readfile():
 			myClip.filter(matchLength,mismatch,clipType,rmdup)
 			#myClip.printClusters()
-			#myClip.printMutations()
-			if len(myClip.clusters)>0:
-				logging.info("Get enriched clusters")
-				status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster)
-				if status:
-					logging.info("Found %d enriched clusters" % myClip.sigClusterCount)
-					myClip.printEnrichedClusters()
-				else:
-					logging.error("There is no enriched cluster found. Exit program")
-					sys.exit(1)
-			else:
-				logging.error("There is no clusters found. Please check input.Exit program.")
-				sys.exit(1)
+		#	myClip.printMutations()
+#BC#			if len(myClip.clusters)>0:
+#BC#				logging.info("Get enriched clusters")
+#BC#				status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster)
+#BC#				if status:
+#BC#					logging.info("Found %d enriched clusters" % myClip.sigClusterCount)
+#BC#					myClip.printEnrichedClusters()
+#BC#				else:
+#BC#					logging.error("There is no enriched cluster found. Exit program")
+#BC#					sys.exit(1)
+#BC#			else:
+#BC#				logging.error("There is no clusters found. Please check input.Exit program.")
+#BC#				sys.exit(1)
 			
-			if len(myClip.mutations.keys())>0:
+			if len(myClip.mutations)>0:
 				logging.info("Get reliable mutations")
 				Enrich.mutationEnrich(myClip,fdrReliableMutation)
 				logging.info("There are %d reliable mutations" % myClip.sigMutationCount)
@@ -58,38 +58,38 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 			else:
 				logging.warning("There is no mutation found in this BAM file.")
 			#Start to get crosslinking sites
-			if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0:
-				logging.info("Get cross-linking sites")
-				myClip.getCrosslinking()
-				if (len(myClip.crosslinking.keys())>0):
-					outfilelist = myClip.printCrosslinkingSites()
-					myClip.printCrosslinkingMutations()
-				else:
-					logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
-					outfilelist = myClip.printEnrichClusters()
-			else:
-				if myClip.sigClusterCount <= 0:
-					logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
-					sys.exit(2)
-				elif myClip.sigMutationCount <=0:
-					logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
-					outfilelist = myClip.printEnrichClusters()
-			#annotation if possible
-		if species in ["mm10","mm9","hg19"]:
-			logging.info("Started to annotate cross-linking sits using HOMER")
-			for name in outfilelist:
-				#logging.debug("Start to do annotation for %s" % name)
-				Utils.annotation(name,species)
-		#output a status log file
-		logfile = open(outputPrefix+".pipeclip.summary.log","w")
-		print >>logfile,"PIPE-CLIP run finished. Parameters are:"
-		print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation)
-		print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment)
-		print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters))
-		print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount)
-		print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations))
-		logfile.close()
-		logging.info("PIPE-CLIP finished the job, please check your results. :)")	
+#BC#			if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0:
+#BC#				logging.info("Get cross-linking sites")
+#BC#				myClip.getCrosslinking()
+#BC#				if (len(myClip.crosslinking.keys())>0):
+#BC#					outfilelist = myClip.printCrosslinkingSites()
+#BC#					myClip.printCrosslinkingMutations()
+#BC#				else:
+#BC#					logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
+#BC#					outfilelist = myClip.printEnrichClusters()
+#BC#			else:
+#BC#				if myClip.sigClusterCount <= 0:
+#BC#					logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
+#BC#					sys.exit(2)
+#BC#				elif myClip.sigMutationCount <=0:
+#BC#					logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
+#BC#					outfilelist = myClip.printEnrichClusters()
+#BC#			#annotation if possible
+#BC#		if species in ["mm10","mm9","hg19"]:
+#BC#			logging.info("Started to annotate cross-linking sits using HOMER")
+#BC#			for name in outfilelist:
+#BC#				#logging.debug("Start to do annotation for %s" % name)
+#BC#				Utils.annotation(name,species)
+#BC#		#output a status log file
+#BC#		logfile = open(outputPrefix+".pipeclip.summary.log","w")
+#BC#		print >>logfile,"PIPE-CLIP run finished. Parameters are:"
+#BC#		print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation)
+#BC#		print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment)
+#BC#		print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters))
+#BC#		print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount)
+#BC#		print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations))
+#BC#		logfile.close()
+#BC#		logging.info("PIPE-CLIP finished the job, please check your results. :)")	
 	else:
 		logging.error("File corruputed, program exit.")
 		sys.exit(0)

From 5f5ad791dc5efd9e9750617c63bd1bbc30688bdb Mon Sep 17 00:00:00 2001
From: Beibei Chen  Qbri <beibei.chen@utsouthwestern.edu>
Date: Wed, 8 Oct 2014 12:06:20 -0500
Subject: [PATCH 4/6] bug fix in mutation

---
 lib/Mutation2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/Mutation2.py b/lib/Mutation2.py
index ee29424..04bc4a6 100644
--- a/lib/Mutation2.py
+++ b/lib/Mutation2.py
@@ -214,7 +214,8 @@ def  mutationLocation(entry,insertLoc):#return mutation location in
 						strand = '+'
 					index1 = loc - match.pos 
 					insertionBefore = countInsertionBefore(index1,insertLoc)
-					index1 += insertionBefore #added 9 Oct
+					index1 += insertionBefore-set_insertion #added 9 Oct
+					set_insertion = insertionBefore
 					mutation = [str(loc),str(loc+1),match.qname,1,strand,"Deletion->"+ch] #change str(index1) to 1
 					yield mutation
 					pre = ch

From 450f5c86489b569b013727caa20d113bae352ab7 Mon Sep 17 00:00:00 2001
From: Beibei Chen  Qbri <beibei.chen@utsouthwestern.edu>
Date: Sun, 2 Nov 2014 20:37:13 -0600
Subject: [PATCH 5/6] clustering bug fix and memory reduce

---
 lib/CLIP.py         | 174 ++++++++++++++++++++++++++++++++++++++++----
 lib/Enrich.py       | 121 +++++++++++++++---------------
 lib/Utils.py        |  11 ++-
 lib/ZTNB_tryCatch.R |   4 -
 pipeclip.py         |   8 +-
 5 files changed, 233 insertions(+), 85 deletions(-)

diff --git a/lib/CLIP.py b/lib/CLIP.py
index 6576e39..ac0b54c 100644
--- a/lib/CLIP.py
+++ b/lib/CLIP.py
@@ -12,7 +12,9 @@
 import OptValidator
 import subprocess
 import datetime
+import time
 import gc
+import os
 
 gc.enable()
 OptValidator.opt_validate()
@@ -23,6 +25,7 @@ def __init__(self,fileaddr,prefix):
 		self.filepath = fileaddr
 		self.originalBAM = None
 		self.originalMapped = 0
+		self.refInfo = []
 		self.posfilteredBAM = None
 		self.negfilteredBAM = None
 		self.filteredAlignment = 0
@@ -31,15 +34,18 @@ def __init__(self,fileaddr,prefix):
 		self.currentGroupKey = "None"
 		self.currentGroup = [] #Used in rmdup
 		self.previousQul = [0,0,0]#for rmdup,[matchlen,mapq,mismatch]
+		self.clusters = []
 		self.clusters_plus = []
 		self.clusters_minus = []
-		self.clusters = []
+		self.clusterCount = 0
 		self.currentCluster_plus = Alignment.BED("",0,0,"",0,"+")
 		self.currentCluster_minus = Alignment.BED("",0,0,"",0,"-")
 		#self.sigClusters = {}
 		self.mutations = {} #Dictionary of bed instance
 		self.mutationCount = 0
+		self.mutationList = []
 		self.sigMutations = {}#same as sigClusters
+		self.kmpair = {}
 		self.sigMutationCount = 0
 		self.sigClusterCount = 0
 		self.coverage = 0 #"reads coverage of this sample"
@@ -95,6 +101,7 @@ def testInput(self):
 	def readfile(self):
 		try:
 			self.originalBAM = pysam.Samfile(self.filepath,"rb")
+			self.refInfo = zip(self.originalBAM.references,self.originalBAM.lengths)
 			return True
 		except IOError,message:
 			logging.error("Cannot open input file"+message)
@@ -105,13 +112,39 @@ def readfile(self):
 #			print i
 	
 	def printClusters(self):
-		for i in self.clusters:
-			print i
+		outfile = open(self.outprefix+".clusters.bed","w")
+		for i in [self.clusters_plus,self.clusters_minus]:
+			for j in i:
+				print >>outfile,j
+		outfile.close()
 
 	def printMutations(self):
+		outfile = open(self.outprefix+".mutations.bed","w")
 		for i in self.mutations.values():
-			print i
+			print >>outfile,i
+		outfile.close()
+
 
+	def printMutations_chr(self):
+		fhs = {}#file handle dic, key is chr, value is file handle
+		for chr,chrlen in self.refInfo:
+			outfile = open(self.outprefix+"."+chr+".mutations.bed","w")
+			fhs[chr] = outfile
+		for i in self.mutations.itervalues():
+			print >>fhs[i.chr],i
+		#clean up
+		for fh in fhs.itervalues():
+			fh.close()
+		del fhs
+
+
+	def printMutationSingleChr(self,chr):
+		outfile = open(self.outprefix+"."+chr+".mutations.bed","w")
+		for i in self.mutations.itervalues():
+			print >>outfile,i
+		#clean up
+		outfile.close()
+	
 	def printReliableMutations(self):
 		outfile = open(self.outprefix+".reliableMutations.pipeclip.bed","w")
 		header = "#chr\tstart\tstop\tmutation_name\tM_value\tstrand\ttype\tK_value\tp_value\tfdr"
@@ -230,7 +263,8 @@ def updateCluster(self,read):
 		if read.is_reverse:
 			if self.currentCluster_minus.chr == "": #Initiate cluster
 				self.currentCluster_minus = newRead
-				self.clusters.append(self.currentCluster_minus)
+				self.clusters_minus.append(self.currentCluster_minus)
+				self.clusterCount += 1
 			else:
 				if self.currentCluster_minus.overlap(newRead):
 					self.currentCluster_minus.merge(newRead)
@@ -239,10 +273,12 @@ def updateCluster(self,read):
 					#self.clusters.append(self.currentCluster)
 					self.currentCluster_minus = newRead
 					self.clusters_minus.append(self.currentCluster_minus)
+					self.clusterCount+=1
 		else:
 			if self.currentCluster_plus.chr == "": #Initiate cluster
 				self.currentCluster_plus = newRead
 				self.clusters_plus.append(self.currentCluster_plus)
+				self.clusterCount+=1
 			else:
 				if self.currentCluster_plus.overlap(newRead):
 					self.currentCluster_plus.merge(newRead)
@@ -251,6 +287,7 @@ def updateCluster(self,read):
 					#self.clusters.append(self.currentCluster)
 					self.currentCluster_plus = newRead
 					self.clusters_plus.append(self.currentCluster_plus)
+					self.clusterCount += 1
 
 	
 	def updateMutation(self,read,mis):
@@ -320,7 +357,7 @@ def getCrosslinking(self):
 							self.crosslinking[cross_key] = Alignment.CrosslinkingBed(cluster.chr,cluster.start,cluster.stop,cluster.name,cluster.score,cluster.strand,cluster.pvalue,cluster.qvalue,mutation.start,mutation.name,mutation.pvalue)
 							self.crosslinkingMutations.append(mutation)
 		#start to calculate fisher test p value
-		for k in self.crosslinking.keys():
+		for k in self.crosslinking.iterkeys():
 			self.crosslinking[k].fishertest()
 
 
@@ -332,6 +369,9 @@ def filter(self,matchLen,mismatch,cliptype,duprm):
 		logging.info("CLIP type:%s" % (str(cliptype)))
 		logging.info("Rmdup code:%s" % (str(duprm)))
 		logging.info("There are %d reads in origianl input file" % (self.originalBAM.mapped))
+		#print >> sys.stderr,zip(self.originalBAM.references,self.originalBAM.lengths)
+		#sys_info = os.system("free -m")
+		#logging.debug(sys_info)
 		#outBAM = pysam.Samfile(outprefix+".filtered.bam","wb",template=self.originalBAM)
 		self.originalMapped = self.originalBAM.mapped
 		outBAM_pos = pysam.Samfile(self.outprefix+".pos.filtered.bam","wb",template=self.originalBAM)
@@ -406,17 +446,123 @@ def filter(self,matchLen,mismatch,cliptype,duprm):
 		#pysam.index(outprefix+".filtered.bam")
 		pysam.index(self.outprefix+".pos.filtered.bam")
 		pysam.index(self.outprefix+".neg.filtered.bam")
-		#self.filteredBAM = pysam.Samfile(outprefix+".filtered.bam","rb")# move file pointer to the file head
-		
 		self.posfilteredBAM = self.outprefix+".pos.filtered.bam"
 		self.negfilteredBAM = self.outprefix+".neg.filtered.bam"
-		del self.originalBAM  
 		self.clusters = self.clusters_plus + self.clusters_minus
-		del self.clusters_plus
-		del self.clusters_minus
-		self.mutations = self.mutations.values()
+		self.printClusters()
+		self.printMutations_chr()
+		#Clean up variables
+		self.originalBAM = None
+		self.clusters_plus = None
+		self.clusters_minus = None
+		self.mutations = None
 		gc.collect()
 		logging.debug("After filtering, %d reads left" % (self.filteredAlignment))
-		logging.debug("There are %d clusters in total" % (len(self.clusters)))
-		logging.debug("There are %d mutations in total" % (len(self.mutations)))
+		logging.debug("There are %d clusters in total" % (self.clusterCount))
+		logging.debug("There are %d mutations in total" % (self.mutationCount))
 
+
+	def filter2(self,matchLen,mismatch,cliptype,duprm):
+		'''Filter the input BAM file according to parameters. Make clusters and mutations at the same time'''
+		logging.info("Start to filter alignment using parameters:")
+		logging.info("match length:%d" % (matchLen))
+		logging.info("mismatch count: %d" % (mismatch))
+		logging.info("CLIP type:%s" % (str(cliptype)))
+		logging.info("Rmdup code:%s" % (str(duprm)))
+		logging.info("There are %d reads in origianl input file" % (self.originalBAM.mapped))
+		
+		self.originalMapped = self.originalBAM.mapped
+		outBAM_pos = pysam.Samfile(self.outprefix+".pos.filtered.bam","wb",template=self.originalBAM)
+		outBAM_neg = pysam.Samfile(self.outprefix+".neg.filtered.bam","wb",template=self.originalBAM)
+		self.type = cliptype
+		if cliptype == 3:#make sure there is no rmdup for iCLIP data
+			duprm = 0
+		count = 0
+		start_time = datetime.datetime.now()
+		currentChr = ""
+		for alignment in self.originalBAM:
+			if not alignment.cigar : #reads is unmapped
+				continue
+			c_chr = self.originalBAM.getrname(alignment.tid)
+			if currentChr == "":
+				currentChr = c_chr
+			elif c_chr!=currentChr:
+				#logging.debug("Mutation sites for Current ref %s is %d" % (currentChr,len(self.mutations.keys())))
+				self.printMutationSingleChr(currentChr)
+				self.mutations = None
+				gc.collect()
+				self.mutations = {}
+				currentChr = c_chr
+			count += 1
+			if count % 5000000 ==0:
+				stop_time = datetime.datetime.now()
+				logging.debug("Processed %d reads in %s" % (count,str(stop_time-start_time)))
+				start_time = stop_time
+			flag,mlen,mis = Utils.readQuaFilter(alignment,matchLen,mismatch)
+			if flag:
+				if duprm > 0:
+					#get group key
+					if duprm == 1:
+						groupkey = Utils.rmdupKey_Start(alignment)
+					elif duprm == 2:
+						groupkey = Utils.rmdupKey_Seq(alignment)
+					#check current group
+					if groupkey == self.currentGroupKey:#overlap with current group, update group
+						self.updateCurrentGroup(alignment,mlen,mis)
+					else:#get read from current group and discard it, use current read to start a new current group
+						if self.currentGroupKey!="None":#current group exists
+							keep = self.rmdup()
+							#logging.debug("Pop out read to keep %s" % keep)
+							self.currentGroup = []
+							self.filteredAlignment += 1
+							flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch)
+							self.updateCLIPinfo(keep,mlen,mis)
+							#outBAM.write(keep)
+							if keep.is_reverse:
+								outBAM_neg.write(keep)
+							else:
+								outBAM_pos.write(keep)
+						self.iniDupGroupInfo(alignment,groupkey,mlen,mis)#make new group using current alignment
+				else:#there is no rmdup
+					#logging.debug("Good read, update clip info %s" % read.qname)
+					self.filteredAlignment+=1
+					self.updateCLIPinfo(alignment,mlen,mis)
+					#outBAM.write(alignment)
+					if alignment.is_reverse:
+						outBAM_neg.write(alignment)
+					else:
+						outBAM_pos.write(alignment)
+		#clean up the final dupGroup, if rmdup==0, there is no final dupGroup
+		if len(self.currentGroup)>0:
+			keep = self.rmdup()
+			self.currentGroup = []
+			self.filteredAlignment+=1
+			flag,mlen,mis = Utils.readQuaFilter(keep,matchLen,mismatch)
+			self.updateCLIPinfo(keep,mlen,mis)
+			#outBAM.write(alignment)
+			if keep.is_reverse:
+				outBAM_neg.write(keep)
+			else:
+				outBAM_pos.write(keep)
+		#clean up current mutation hash
+		self.printMutationSingleChr(currentChr)
+		self.mutations = None
+		#Logging CLIP sample information
+		#outBAM.close()
+		outBAM_pos.close()
+		outBAM_neg.close()
+		pysam.index(self.outprefix+".pos.filtered.bam")
+		pysam.index(self.outprefix+".neg.filtered.bam")
+		self.posfilteredBAM = self.outprefix+".pos.filtered.bam"
+		self.negfilteredBAM = self.outprefix+".neg.filtered.bam"
+		self.clusters = self.clusters_plus + self.clusters_minus
+		self.printClusters()
+		#Clean up variables
+		self.originalBAM = None
+		self.clusters_plus = None
+		self.clusters_minus = None
+		self.mutations = None
+		gc.collect()
+		logging.debug("After filtering, %d reads left" % (self.filteredAlignment))
+		logging.debug("There are %d clusters in total" % (self.clusterCount))
+		logging.debug("There are %d mutations in total" % (self.mutationCount))
diff --git a/lib/Enrich.py b/lib/Enrich.py
index 04989c4..11bdca6 100755
--- a/lib/Enrich.py
+++ b/lib/Enrich.py
@@ -24,9 +24,12 @@
 import subprocess
 import OptValidator
 import datetime
+import time
 import Utils
-from memory_profiler import profile
+from os
 import gc
+import Alignment
+import os
 
 OptValidator.opt_validate()
 gc.enable()
@@ -57,16 +60,16 @@ def KMvalue(posmapfile,negmapfile,mufile):
 		km = []#store mutations with updated k value
 		km_pair = {}#Dic of count tuples of (k,m),key:"K_M"
 		count = 0
-		#logging.debug("make wig %s" % str(datetime.datetime.now()))
+		logging.debug("make wig %s" % str(datetime.datetime.now()))
 		poswig = Utils.makeWig(posmapfile)
 		negwig = Utils.makeWig(negmapfile)
 		start_time = datetime.datetime.now()
-		#logging.debug("finish making wig %s" %  str(start_time))
+		logging.debug("finish making wig %s" %  str(start_time))
 		for item in mufile:
 			count += 1
 			if count % 5000 == 0:
 				stop_time = datetime.datetime.now()
-				#logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time)))
+				logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time)))
 				start_time = stop_time
 			st = []
 			strand = item.strand 
@@ -97,61 +100,55 @@ def KMvalue(posmapfile,negmapfile,mufile):
 		gc.collect()
 		return km_pair
 
-@profile
-def KMvalue_test(posmapfile,negmapfile,mufile):
+def KMvalue_test(clip,mutations,chr,chrlen):
 		'''
 		Calculate K(coverage) value for each mutation location
 		Mutations are already unique.
 		'''
 		km = []#store mutations with updated k value
-		km_pair = {}#Dic of count tuples of (k,m),key:"K_M"
 		count = 0
-		currentChr = ""
 		#logging.debug("make wig %s" % str(datetime.datetime.now()))
+		posBAM = pysam.Samfile(clip.posfilteredBAM,"rb")
+		negBAM = pysam.Samfile(clip.negfilteredBAM,"rb")
 		start_time = datetime.datetime.now()
-		posBAM = pysam.Samfile(posmapfile,"rb")
-		negBAM = pysam.Samfile(negmapfile,"rb")
-		for item in mufile:
+		poswig = Utils.makeWigByChr(posBAM,chr)
+		negwig = Utils.makeWigByChr(negBAM,chr)
+		stop_time = datetime.datetime.now()
+		#logging.debug("Finished making wig for %s using %s" % (chr,str(stop_time-start_time)))
+		start_time = stop_time
+		for item in mutations:
 			count += 1
-			if count % 5000 == 0:
+			if count % 100000 == 0:
 				stop_time = datetime.datetime.now()
-				#logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time)))
+				logging.debug("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time)))
 				start_time = stop_time
 			st = []
-			strand = item.strand 
 			M = item.score
 			K = 0
-			if item.chr != currentChr:
-				poswig = Utils.makeWigListByChr(posBAM,item.chr)
-				negwig = Utils.makeWigListByChr(negBAM,item.chr)
-				currentChr = item.chr
-			#logging.debug("Time begin to pileup is %s" % (str(datetime.datetime.now())))
+			strand = item.strand
 			if strand == "+":
 				try:
-					K = poswig.valueByPos(item.start)
+					K = poswig[item.start]
 				except:
 					continue
 			elif strand == "-":
 				try:
-					K = negwig.valueByPos(item.start)
+					K = negwig[item.start]
 				except:
-					contiune
-
-			if K>=M:
-				item.updateK(K)
+					continue
+				if K>=M:
+					item.updateK(K)
 
-				pair_name = str(K)+"_"+str(M)
-				if km_pair.has_key(pair_name):
-					km_pair[pair_name] += 1
-				else:
-					km_pair[pair_name] = 1
-				#km.append(item)
+					pair_name = str(K)+"_"+str(M)
+					if clip.kmpair.has_key(pair_name):
+						clip.kmpair[pair_name] += 1
+					else:
+						clip.kmpair[pair_name] = 1
 		posBAM.close()
 		negBAM.close()
 		del posBAM
 		del negBAM
 		gc.collect()
-		return km_pair
 
 
 def KMvalue_ignore(posmapfile,negmapfile,mufile):
@@ -204,28 +201,40 @@ def uniq(b): #b is a list
 	uniqElements.sort()
 	return uniqElements
 
-#@profile
 def mutationEnrich(clip,threshold=0.01):
 	coverage = clip.coverage *1.0
 	totalMuCount = clip.mutationCount
-	#(original_KM,KM_test) = KMvalue(clip.originalBAM, clip.mutations)
-	KM_test = KMvalue_test(clip.posfilteredBAM,clip.negfilteredBAM, clip.mutations)#check after doing KM, if clip.mutations changed
-	clip.posfilteredBAM = None
-	clip.negfilteredBAM = None
-	#logging.info("Finished K-M counting, starting fitting.")
+	mutations = []
+	total_test = 0
+	for chr,chrlen in clip.refInfo:
+		try:
+			mufile = open(clip.outprefix+"."+chr+".mutations.bed")
+		except:
+			logging.info("Cannot open mutation file %s , move on." % (clip.outprefix+"."+chr+".mutations.bed"))
+			continue
+		for record in mufile:
+			total_test += 1
+			info = record.rstrip().split("\t")
+			new_mu = Alignment.MutationBed(info[0],int(info[1]),int(info[2]),info[3],int(info[4]),info[5],info[6])
+			mutations.append(new_mu)
+			os.remove(clip.outprefix+"."+chr+".mutations.bed")
+		KM_test = KMvalue_test(clip,mutations,chr,chrlen)#check after doing KM, if clip.mutations changed
+	del clip.posfilteredBAM 
+	del clip.negfilteredBAM 
+	gc.collect()#logging.info("Finished K-M counting, starting fitting.")
+	
 	R = robject.r
 	reliableList = []
 	P = totalMuCount/coverage
 	km_p = {}#store km and corresponding p value
 	pvalues = []
-	for k in KM_test:
+	for k in clip.kmpair:#KM_test:
 		parameters = k.split("_")
 		p = R.pbinom(int(parameters[1])-1,int(parameters[0]),P,False)[0]	
 		pvalues.append(p)
 		km_p[k]=p
 	pCount = dict(Counter(pvalues))
 	pRank = freqRank(pCount,True)
-	total_test = len(clip.mutations)
 	pqDic={}
 	for i in pRank.keys():
 		try:
@@ -236,7 +245,7 @@ def mutationEnrich(clip,threshold=0.01):
 			print >> sys.stderr,"Cannot find p value in dictionary"
 			continue
 	count = 0
-	for mu in clip.mutations:#.values():
+	for mu in mutations:
 		name = str(mu.kvalue)+"_"+str(mu.score)
 		try:
 			mu.pvalue = km_p[name]
@@ -249,34 +258,27 @@ def mutationEnrich(clip,threshold=0.01):
 			new_mutationName = "Mutation_"+str(count)
 			mu.name = new_mutationName
 			mu.sig = True
-			clip.sigMutationCount += 1
+			clip.sigMutationCount+=1
 			clip.addSigToDic(clip.sigMutations,mu)
-	clip.mutations = None
-	#logging.info("There are %d reliable mutations" % clip.sigMutationCount)
-
+			
+	mutations = None
+	gc.collect()
 
 
 def clusterEnrich(clip,threshold=0.01):
-	#write temp file
-	temp_filename = clip.outprefix+".merge"#clip.filepath.split("/")[-1].split(".")[0]
-	fh = open(temp_filename,"w")
-	for i in clip.clusters:
-		print >> fh,i
-	fh.close()
+	cluster_filename = clip.outprefix+".clusters.bed"#clip.filepath.split("/")[-1].split(".")[0]
 	#Call R code and get result
 	epsilon = [0.01,0.15,0.1]
 	step = [0.1,0.08,0.05]
 	for index in range(len(epsilon)):
 		e = epsilon[index]
 		s = step[index]
-		r_args = ['Rscript','lib/ZTNB_tryCatch.R',temp_filename,str(threshold),str(e),str(s)]
+		r_args = ['Rscript','lib/ZTNB_tryCatch.R',cluster_filename,str(threshold),str(e),str(s)]
+		gc.collect()
 		p = subprocess.Popen(r_args)
 		stdout_value = p.communicate()[0]
-		#output = subprocess.check_output['ls','-l','test.merge.ztnb']
-		#output_log = subprocess.check_output['ls','-l','test.merge.ztnblog']
-		#If regression converged, there is no need to try other epsilon or step,check log file flag: Y means coverged, N means not converged 
 		try:
-			r_output_log = open(temp_filename+".pipeclip.ztnblog","r")
+			r_output_log = open(cluster_filename+".pipeclip.ztnblog","r")
 			#logging.debug("Log file opened")
 			flag = r_output_log.read(1)
 			if flag == "Y":#converged
@@ -288,10 +290,8 @@ def clusterEnrich(clip,threshold=0.01):
 			continue
 
 	#check ztnb file
-	#r_output = subprocess.check_output(['ls','-l',temp_filename+'.ztnb'])
-	#if int(r_output.split()[4])>100: #more than header,file OK
 	try:
-		enrich_parameter = open(temp_filename+".pipeclip.ztnb","r")
+		enrich_parameter = open(cluster_filename+".pipeclip.ztnb","r")
 	except IOError,message:
 		logging.error("Cannot open ztnb result file")
 		return False
@@ -316,7 +316,6 @@ def clusterEnrich(clip,threshold=0.01):
 				clip.clusters[i].qvalue = nbDic[r_key][1]
 				clip.clusters[i].sig = True
 				clip.sigClusterCount += 1
-			#clip.addSigToDic(clip.sigClusters,clip.clusters[i])
 		nbDic = None
 		if clip.sigClusterCount == 0:
 			return False
@@ -326,8 +325,6 @@ def clusterEnrich(clip,threshold=0.01):
 def fisherTest(clusterp,mutationp):
 	R = robject.r
 	min_mp = min(mutationp)
-	#logging.debug("clusterP %f,%s" % (clusterp, type(clusterp)))
-	#logging.debug("mutationP %f,%s" % (min_mp, type(min_mp)))
 	product = clusterp * min_mp
 	if product == 0:
 		fps = 0
diff --git a/lib/Utils.py b/lib/Utils.py
index 2471d1c..fc3887d 100644
--- a/lib/Utils.py
+++ b/lib/Utils.py
@@ -88,7 +88,7 @@ def makeWig(bamfile):
 def makeWigByChr(bamfile,chr):
 	wig = {}
 	for pileupColumn in bamfile.pileup(chr):
-			wig[str(pileupColumn.pos)]=pileupColumn.n
+			wig[pileupColumn.pos]=pileupColumn.n
 	return wig
 
 
@@ -98,6 +98,15 @@ def makeWigListByChr(bamfile,chr):
 		wig.update(pileupColumn.pos,pileupColumn.n)
 	return wig
 
+
+def makeWigListByChr_array(bamfile,chr,chrlen):
+	wig = []
+	for i in range(chrlen):
+		wig.append(0)
+	for pileupColumn in bamfile.pileup(chr):
+		wig[pileupColumn.pos]=pileupColumn.n
+	return wig
+
 def bisort(alist,b):
 	'''List is a list of bed instances, b is also an instance'''
 	if isinstance(alist[0],Alignment.BED) and isinstance(b,Alignment.BED):
diff --git a/lib/ZTNB_tryCatch.R b/lib/ZTNB_tryCatch.R
index f890599..e4cae7f 100755
--- a/lib/ZTNB_tryCatch.R
+++ b/lib/ZTNB_tryCatch.R
@@ -116,11 +116,7 @@ for (i in intercept1)
 {
   for (j in intercept2)
   {
-    #print(paste("Coef start",i,j));
-    #print(paste("Recent likelihood",biggest_likelihood))
-    #print(paste("NB parameters",khat,muhat))
     nb<-tryCatch({vglm(tread_fit ~ 1, posnegbinomial(), weight = twts_fit, maxit = 200, trace = FALSE, step = vglm_step,offset = logmu,epsilon=vglm_epsilon,silent=FALSE,coefstart=c(i,j))},warning = function(w){
-    #print(paste("typeof warning",typeof(w["message"])))
     warnmsg = strsplit(toString(w["message"])," iteration")[[1]][1];
     if(grepl("convergence not obtained",warnmsg))
     {
diff --git a/pipeclip.py b/pipeclip.py
index be700b9..6497592 100755
--- a/pipeclip.py
+++ b/pipeclip.py
@@ -34,10 +34,10 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 		logging.info("Input file OK,start to run PIPE-CLIP")
 		logging.info("Species info %s" % species)
 		if myClip.readfile():
-			myClip.filter(matchLength,mismatch,clipType,rmdup)
+			myClip.filter2(matchLength,mismatch,clipType,rmdup)
 			#myClip.printClusters()
-		#	myClip.printMutations()
-#BC#			if len(myClip.clusters)>0:
+			#myClip.printMutations()
+#BC#			if myClip.clusterCount>0:
 #BC#				logging.info("Get enriched clusters")
 #BC#				status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster)
 #BC#				if status:
@@ -50,7 +50,7 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 #BC#				logging.error("There is no clusters found. Please check input.Exit program.")
 #BC#				sys.exit(1)
 			
-			if len(myClip.mutations)>0:
+			if myClip.mutationCount>0:
 				logging.info("Get reliable mutations")
 				Enrich.mutationEnrich(myClip,fdrReliableMutation)
 				logging.info("There are %d reliable mutations" % myClip.sigMutationCount)

From e700ab421cbcd33883ced12624ab6ef1b61c92d2 Mon Sep 17 00:00:00 2001
From: Beibei Chen  Qbri <beibei.chen@utsouthwestern.edu>
Date: Sun, 2 Nov 2014 21:24:31 -0600
Subject: [PATCH 6/6] Add file cleaning

---
 lib/Enrich.py | 61 ++++++++++-------------------------
 pipeclip.py   | 88 +++++++++++++++++++++++++--------------------------
 2 files changed, 60 insertions(+), 89 deletions(-)

diff --git a/lib/Enrich.py b/lib/Enrich.py
index 11bdca6..ce4ea79 100755
--- a/lib/Enrich.py
+++ b/lib/Enrich.py
@@ -26,7 +26,7 @@
 import datetime
 import time
 import Utils
-from os
+import os
 import gc
 import Alignment
 import os
@@ -150,49 +150,6 @@ def KMvalue_test(clip,mutations,chr,chrlen):
 		del negBAM
 		gc.collect()
 
-
-def KMvalue_ignore(posmapfile,negmapfile,mufile):
-		'''
-		Calculate K(coverage) value for each mutation location
-		Mutations are already unique.
-		'''
-		km = []#store mutations with updated k value
-		km_pair = {}#Dic of count tuples of (k,m),key:"K_M"
-		count = 0
-		start_time = datetime.datetime.now()
-		for item in mufile:
-			count += 1
-			if count % 5000 == 0:
-				stop_time = datetime.datetime.now()
-				logging.info("Counting K-M for %d mutation sites, using %s" % (count,str(stop_time-start_time)))
-				start_time = stop_time
-			st = []
-			strand = item.strand 
-			M = item.score
-			K = 0
-			logging.debug("Time begin to pileup is %s, mutation %d" % (str(datetime.datetime.now()),count))
-			if strand == "+":
-				for pileupColumn in posmapfile.pileup(item.chr,int(item.start),int(item.stop)):
-					if pileupColumn == int(item.start):
-						K = pileupColumn.n
-						break
-			elif strand == "-":
-				for pileupColumn in negmapfile.pileup(item.chr,int(item.start),int(item.stop)):
-					if pileupColumn == int(item.start):
-						K = pileupColumn.n
-						break
-			if K>=M:
-				item.updateK(K)
-				#logging.debug("K value for item %s is %d" % (item, K))
-				pair_name = str(K)+"_"+str(M)
-				if km_pair.has_key(pair_name):
-					km_pair[pair_name] += 1
-				else:
-					km_pair[pair_name] = 1
-				#km.append(item)
-		return km_pair
-
-
 def uniq(b): #b is a list
 	uniqElements = []
 	for i in b:
@@ -217,8 +174,18 @@ def mutationEnrich(clip,threshold=0.01):
 			info = record.rstrip().split("\t")
 			new_mu = Alignment.MutationBed(info[0],int(info[1]),int(info[2]),info[3],int(info[4]),info[5],info[6])
 			mutations.append(new_mu)
-			os.remove(clip.outprefix+"."+chr+".mutations.bed")
+			try:
+				os.remove(clip.outprefix+"."+chr+".mutations.bed")
+			except:
+				pass
 		KM_test = KMvalue_test(clip,mutations,chr,chrlen)#check after doing KM, if clip.mutations changed
+	try:
+		os.remove(clip.posfilteredBAM)
+		os.remove(clip.negfilteredBAM)
+		os.remove(clip.posfilteredBAM+".bai")
+		os.remove(clip.negfilteredBAM+".bai")
+	except:
+		pass
 	del clip.posfilteredBAM 
 	del clip.negfilteredBAM 
 	gc.collect()#logging.info("Finished K-M counting, starting fitting.")
@@ -317,6 +284,10 @@ def clusterEnrich(clip,threshold=0.01):
 				clip.clusters[i].sig = True
 				clip.sigClusterCount += 1
 		nbDic = None
+		try:
+			os.remove(cluster_filename)
+		except:
+			pass
 		if clip.sigClusterCount == 0:
 			return False
 		else:
diff --git a/pipeclip.py b/pipeclip.py
index 6497592..e193b70 100755
--- a/pipeclip.py
+++ b/pipeclip.py
@@ -37,18 +37,18 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 			myClip.filter2(matchLength,mismatch,clipType,rmdup)
 			#myClip.printClusters()
 			#myClip.printMutations()
-#BC#			if myClip.clusterCount>0:
-#BC#				logging.info("Get enriched clusters")
-#BC#				status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster)
-#BC#				if status:
-#BC#					logging.info("Found %d enriched clusters" % myClip.sigClusterCount)
-#BC#					myClip.printEnrichedClusters()
-#BC#				else:
-#BC#					logging.error("There is no enriched cluster found. Exit program")
-#BC#					sys.exit(1)
-#BC#			else:
-#BC#				logging.error("There is no clusters found. Please check input.Exit program.")
-#BC#				sys.exit(1)
+			if myClip.clusterCount>0:
+				logging.info("Get enriched clusters")
+				status = Enrich.clusterEnrich(myClip,fdrEnrichedCluster)
+				if status:
+					logging.info("Found %d enriched clusters" % myClip.sigClusterCount)
+					myClip.printEnrichedClusters()
+				else:
+					logging.error("There is no enriched cluster found. Exit program")
+					sys.exit(1)
+			else:
+				logging.error("There is no clusters found. Please check input.Exit program.")
+				sys.exit(1)
 			
 			if myClip.mutationCount>0:
 				logging.info("Get reliable mutations")
@@ -58,38 +58,38 @@ def runPipeClip(infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluste
 			else:
 				logging.warning("There is no mutation found in this BAM file.")
 			#Start to get crosslinking sites
-#BC#			if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0:
-#BC#				logging.info("Get cross-linking sites")
-#BC#				myClip.getCrosslinking()
-#BC#				if (len(myClip.crosslinking.keys())>0):
-#BC#					outfilelist = myClip.printCrosslinkingSites()
-#BC#					myClip.printCrosslinkingMutations()
-#BC#				else:
-#BC#					logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
-#BC#					outfilelist = myClip.printEnrichClusters()
-#BC#			else:
-#BC#				if myClip.sigClusterCount <= 0:
-#BC#					logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
-#BC#					sys.exit(2)
-#BC#				elif myClip.sigMutationCount <=0:
-#BC#					logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
-#BC#					outfilelist = myClip.printEnrichClusters()
-#BC#			#annotation if possible
-#BC#		if species in ["mm10","mm9","hg19"]:
-#BC#			logging.info("Started to annotate cross-linking sits using HOMER")
-#BC#			for name in outfilelist:
-#BC#				#logging.debug("Start to do annotation for %s" % name)
-#BC#				Utils.annotation(name,species)
-#BC#		#output a status log file
-#BC#		logfile = open(outputPrefix+".pipeclip.summary.log","w")
-#BC#		print >>logfile,"PIPE-CLIP run finished. Parameters are:"
-#BC#		print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation)
-#BC#		print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment)
-#BC#		print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters))
-#BC#		print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount)
-#BC#		print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations))
-#BC#		logfile.close()
-#BC#		logging.info("PIPE-CLIP finished the job, please check your results. :)")	
+			if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0:
+				logging.info("Get cross-linking sites")
+				myClip.getCrosslinking()
+				if (len(myClip.crosslinking.keys())>0):
+					outfilelist = myClip.printCrosslinkingSites()
+					myClip.printCrosslinkingMutations()
+				else:
+					logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
+					outfilelist = myClip.printEnrichClusters()
+			else:
+				if myClip.sigClusterCount <= 0:
+					logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
+					sys.exit(2)
+				elif myClip.sigMutationCount <=0:
+					logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
+					outfilelist = myClip.printEnrichClusters()
+			#annotation if possible
+		if species in ["mm10","mm9","hg19"]:
+			logging.info("Started to annotate cross-linking sits using HOMER")
+			for name in outfilelist:
+				#logging.debug("Start to do annotation for %s" % name)
+				Utils.annotation(name,species)
+		#output a status log file
+		logfile = open(outputPrefix+".pipeclip.summary.log","w")
+		print >>logfile,"PIPE-CLIP run finished. Parameters are:"
+		print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation)
+		print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment)
+		print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters))
+		print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount)
+		print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations))
+		logfile.close()
+		logging.info("PIPE-CLIP finished the job, please check your results. :)")	
 	else:
 		logging.error("File corruputed, program exit.")
 		sys.exit(0)