-
Notifications
You must be signed in to change notification settings - Fork 0
/
motif_pwm_scan_cov.py
354 lines (318 loc) · 14.5 KB
/
motif_pwm_scan_cov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# This module is free software. You can redistribute it and/or modify it under
# the terms of the MIT License, see the file COPYING included with this
# distribution.
from __future__ import division
"""
Perform operations to scan PWM input file and find coverage and number of motifs required (The motif selection problem)
"""
#python imports
import sys
import os
import argparse
import shutil
#add the util folder path to use util files in it
inPath = os.path.realpath(__file__)
split = inPath.split('/')
inPath = '/'.join(split[:len(split)-1])
sys.path.append(inPath + '/utils')
import conf
import general_utils
import Fimo
import Tomtom
#add the alg folder path to call the different algorithms
sys.path.append('./algo')
import greedy
import ILP
import branch_cut
import required_cover
def outputResults(outFileName, depthDict, backFimoDict, filterMinNumSeq, idMotifDict, foreNumSeqs, backNumSeqs):
"""Output the results from the depth dictionary
Args:
outFileName: name of file to output results to
"""
# a list that stores the fianl list of IDs that passed the threshold filter
finalSelectMotifList = []
depthOut = open(outFileName, 'wb')
for depthVal in sorted(depthDict.iterkeys(), key=int):
print 'depth is:', depthVal
depthOut.write('\nDepth:' + str(depthVal) + '\n')
depthOut.write('#Name,foreground_motif_seqCount,foreground_motif_seqCoverage(%),foreground_seqs_added,cumulative_foreground_seq_cov(%)'
+ ',background_motif_seqCount,background_motif_seqCoverage(%),background_seqs_added,cumulative_background_seq_cov(%)' +'\n')
cumCov = 0
backCumCov = 0
backSeqsAdded = []
motifIdList = depthDict[depthVal][0]
motifSeqSet = depthDict[depthVal][1]
newSeqsAddedList = depthDict[depthVal][2]
#add new information to the motif objects as well
for i in range(len(motifIdList)):
lineList = []
motifId = motifIdList[i]
motifSeqset = motifSeqSet[i]
seqsAdded = newSeqsAddedList[i]
#apply the filter threshold
if seqsAdded < filterMinNumSeq:
continue
if motifId not in idMotifDict:
motifName = motifId
else:
motifName = idMotifDict[motifId]
finalSelectMotifList.append(motifName)
lineList.append(motifName)
lineList.append(str(len(motifSeqset)))
cov = 100 * (len(motifSeqset)/foreNumSeqs)
lineList.append(str(cov))
lineList.append(str(seqsAdded))
cumCov += (seqsAdded/foreNumSeqs)*100
lineList.append(str(cumCov))
#add the background information for this motif
#in the backFimoDict the IDs are string type
if motifName in backFimoDict:
backSeqList = backFimoDict[motifName]
backSeqCount = len(backSeqList)
backSeqCov = (backSeqCount/backNumSeqs)*100
backSeqsAddedCount = 0
for backSeq in backSeqList:
if backSeq not in backSeqsAdded:
backSeqsAddedCount += 1
backSeqsAdded.append(backSeq)
backCumCov += (backSeqsAddedCount/backNumSeqs)*100
else:
backSeqCount = 0
backSeqCov = 0
backSeqsAddedCount=0
backCumCov += 0
lineList.append(str(backSeqCount))
lineList.append(str(backSeqCov))
lineList.append(str(backSeqsAddedCount))
lineList.append(str(backCumCov))
line = ','.join(lineList)
depthOut.write(line + '\n')
depthOut.close()
return finalSelectMotifList
def callMotifPwmScanCov(args, confDict, fileList):
"""
The main function to perfrom motif PWM scanning and coverage on a testing file
Args:
args: the object that holds the input args
confDict: the dict of the input configuration file
fileList: list of files to be added to the results directory
Returns:
"""
#get the PWM file to scan for
pwmFileName = confDict['input']['pwm_file']
#read the PWM file and make a motif dict between motif IDs (names) and the motif objects which have PWM lines. Also return two dicts between the motif anmes and IDs and vice versa
motifDict = general_utils.makeMotifDictFromPwmFile(pwmFileName)
print 'len motifDict:', len(motifDict)
#find the total number of seqs in the testing file
foreNumSeqs = general_utils.findNumSeqs(confDict['input']['fore_testing_file'])
#make a list of all seq names in the foreground data set
foreSeqList = general_utils.findSeqList(confDict['input']['fore_testing_file'])
#make a list of all seq names in the background data set
backSeqList = general_utils.findSeqList(confDict['input']['back_testing_file'])
#run the FIMO tool on the foreground data set
fimoParaDict = {}
fimoParaDict['thresh'] = confDict['fimo']['pvalue']
#fimo output directory name
fimoOutDir = args.jid + '_foreground_Fimo'
Fimo.callFimo(confDict['input']['fore_testing_file'] , pwmFileName, fimoParaDict, fimoOutDir)
fileList.append(fimoOutDir)
#parse the fimo results folder
strand = confDict['fimo']['strand']
#fimoDict between motif ID and seq hits in the foeground testing file
fimoDict = Fimo.parseFimo(fimoOutDir+'/fimo.txt', strand)
#write the motifs in the format for sequence coverge algorithms
motifSeqFileName = args.jid + '_fore_motif_hits_in_seqs'
general_utils.writeMotifSeqFile(fimoDict, motifSeqFileName)
fileList.append(motifSeqFileName)
#check if motif refinement requested to remove redundant motifs when applying the seq_cov algthms
#the tomtomDict will be used with the seq coverage algorithm. The dict is between all the motifs to check their similarity
tomtomDict = {}
if confDict['motif_refine']['app'] == 'tomtom_refine':
print 'Peform motif refine using tomtom'
#dict that stores parameters for the tomtom tool
tomParaDict = {}
tomParaDict['evalue'] = confDict['tomtom_refine']['evalue']
#tomtom output directory name
tomtomOutDir = args.jid + '_Tomtom_refine'
#we are comparing the motifs against each other, so use the same PWM file
Tomtom.callTomtom(confDict['input']['pwm_file'], confDict['input']['pwm_file'], tomParaDict, tomtomOutDir)
fileList.append(tomtomOutDir)
#parse the tomtom output file and return a dict between motif Name and list[] of motifs that match it
tomtomDict = Tomtom.parse(tomtomOutDir+'/tomtom.txt')
#process the seq cov file
motifIdDict, idMotifDict, seqIdDict, idSeqDict, Uset, Sdict = general_utils.processMotifHitFile_1(motifSeqFileName)
#call the seq cov algthm
minSetIdList = []
minSeqSet = set()
#set the filtering values for later
filterThresh = 0
filterMinNumSeq = 0
#this dict is between a depth value and a tuple of motif Ids, seq sets, and newly added seqs by each motif
depthDict = {}
#just a tmp check will be removed later
makeLogo = 'false'
#####
#ILP
####
if confDict['sequence.coverage']['app'] == 'ILP' and confDict['motif_refine']['app'] == 'none':
print 'ILP algorithm'
depth = int(confDict['ILP']['depth'])
#this dict is between a depth value and a list of motif IDs
depthDict = ILP.callILPDepth(args.jid, fimoDict, tomtomDict, depth, fileList)
print 'len:', len(depthDict)
#if there is a background file use it for comparison purposes
if confDict['input']['back_testing_file'] != 'none':
#run the FIMO tool for the background sequences
fimoParaDict = {}
fimoParaDict['thresh'] = confDict['fimo']['pvalue']
#fimo output directory name
fimoOutDir = args.jid + '_background_Fimo'
Fimo.callFimo(confDict['input']['back_testing_file'] , pwmFileName, fimoParaDict, fimoOutDir)
fileList.append(fimoOutDir)
#find number of sequences in the background file
backNumSeqs = general_utils.findNumSeqs(confDict['input']['back_testing_file'])
#parse the fimo results folder
strand = confDict['fimo']['strand']
#fimoDict between motif names and seq hits in the background file
backFimoDict = Fimo.parseFimo(fimoOutDir+'/fimo.txt', strand)
#write the motifs in the format for sequence coverge algorithms
motifSeqFileName = args.jid + '_back_motif_hits_in_seqs'
general_utils.writeMotifSeqFile(backFimoDict, motifSeqFileName)
fileList.append(motifSeqFileName)
#no filtering
filterMinNumSeq = 0
#output results
for depthVal in sorted(depthDict.iterkeys(), key=int):
depthOutFileName = args.jid + '_depth_results_'+ str(depthVal) + '.csv'
finalSelectMotifList = outputResults(depthOutFileName, depthDict, backFimoDict, filterMinNumSeq, idMotifDict,foreNumSeqs, backNumSeqs)
print 'final:', finalSelectMotifList
fileList.append(depthOutFileName)
####
#branch and cut
####
if confDict['sequence.coverage']['app'] == 'branch_cut' and confDict['motif_refine']['app'] == 'none':
print 'Branch and cut algorithm'
depth = int(confDict['branch_cut']['depth'])
#this dict is between a depth value and a list of motif IDs
depthDict = branch_cut.callBranchDepth(args.jid, fimoDict, tomtomDict, depth, fileList)
print 'len:', len(depthDict)
#if there is a background file use it for comparison purposes
if confDict['input']['back_testing_file'] != 'none':
#run the FIMO tool for the background sequences
fimoParaDict = {}
fimoParaDict['thresh'] = confDict['fimo']['pvalue']
#fimo output directory name
fimoOutDir = args.jid + '_background_Fimo'
Fimo.callFimo(confDict['input']['back_testing_file'] , pwmFileName, fimoParaDict, fimoOutDir)
fileList.append(fimoOutDir)
#find number of sequences in the background file
backNumSeqs = general_utils.findNumSeqs(confDict['input']['back_testing_file'])
#parse the fimo results folder
strand = confDict['fimo']['strand']
#fimoDict between motif names and seq hits in the background file
backFimoDict = Fimo.parseFimo(fimoOutDir+'/fimo.txt', strand)
#write the motifs in the format for sequence coverge algorithms
motifSeqFileName = args.jid + '_back_motif_hits_in_seqs'
general_utils.writeMotifSeqFile(backFimoDict, motifSeqFileName)
fileList.append(motifSeqFileName)
#no filtering
filterMinNumSeq = 0
#output results
for depthVal in sorted(depthDict.iterkeys(), key=int):
depthOutFileName = args.jid + '_depth_results_'+ str(depthVal) + '.csv'
finalSelectMotifList = outputResults(depthOutFileName, depthDict, backFimoDict, filterMinNumSeq, idMotifDict,foreNumSeqs, backNumSeqs)
print 'final:', finalSelectMotifList
fileList.append(depthOutFileName)
####
#required cover
####
if confDict['sequence.coverage']['app'] == 'required_cover' and confDict['motif_refine']['app'] == 'none':
print 'Required cover algorithm'
depth = int(confDict['required_cover']['depth'])
#this dict is between a depth value and a list of motif IDs
depthDict = required_cover.callRequiredDepth(args.jid, fimoDict, tomtomDict, depth, fileList)
print 'len:', len(depthDict)
#if there is a background file use it for comparison purposes
if confDict['input']['back_testing_file'] != 'none':
#run the FIMO tool for the background sequences
fimoParaDict = {}
fimoParaDict['thresh'] = confDict['fimo']['pvalue']
#fimo output directory name
fimoOutDir = args.jid + '_background_Fimo'
Fimo.callFimo(confDict['input']['back_testing_file'] , pwmFileName, fimoParaDict, fimoOutDir)
fileList.append(fimoOutDir)
#find number of sequences in the background file
backNumSeqs = general_utils.findNumSeqs(confDict['input']['back_testing_file'])
#parse the fimo results folder
strand = confDict['fimo']['strand']
#fimoDict between motif names and seq hits in the background file
backFimoDict = Fimo.parseFimo(fimoOutDir+'/fimo.txt', strand)
#write the motifs in the format for sequence coverge algorithms
motifSeqFileName = args.jid + '_back_motif_hits_in_seqs'
general_utils.writeMotifSeqFile(backFimoDict, motifSeqFileName)
fileList.append(motifSeqFileName)
#no filtering
filterMinNumSeq = 0
#output results
for depthVal in sorted(depthDict.iterkeys(), key=int):
depthOutFileName = args.jid + '_depth_results_'+ str(depthVal) + '.csv'
finalSelectMotifList = outputResults(depthOutFileName, depthDict, backFimoDict, filterMinNumSeq, idMotifDict,foreNumSeqs, backNumSeqs)
print 'final:', finalSelectMotifList
fileList.append(depthOutFileName)
####
#greedy filtered (greedyFilt) and refinement
####
if confDict['sequence.coverage']['app'] == 'greedyFilt' and confDict['motif_refine']['app'] == 'tomtom_refine':
depth = int(confDict['greedyFilt']['depth'])
if confDict['greedyFilt']['filter'] == 'true':
#get the threshold value from the configure file
filterThresh = float(confDict['greedyFilt']['filter_threshold'])
filterMinNumSeq = filterThresh * foreNumSeqs
print 'Perform filtered greedy sequence coverage with refinement and depth:', depth
#this dict is between a depth value and a tuple of motif Ids, seq sets, and newly added seqs by each motif
#depthDict = greedy.callGreedyDepthRefine(Uset, Sdict, depth, tomtomDict, motifIdDict, idMotifDict, filterMinNumSeq)
depthDict = greedy.callGreedyFiltRefine(Uset, Sdict, depth, tomtomDict, motifIdDict, idMotifDict, filterMinNumSeq)
#if there is a background file use it for comaprison purposes
if confDict['input']['back_testing_file'] != 'none':
#run the FIMO tool for the background sequences
fimoParaDict = {}
fimoParaDict['thresh'] = confDict['fimo']['pvalue']
#fimo output directory name
fimoOutDir = args.jid + '_background_Fimo'
Fimo.callFimo(confDict['input']['back_testing_file'] , pwmFileName, fimoParaDict, fimoOutDir)
fileList.append(fimoOutDir)
#find number of sequences in the background file
backNumSeqs = general_utils.findNumSeqs(confDict['input']['back_testing_file'])
#parse the fimo results folder
strand = confDict['fimo']['strand']
#fimoDict between motif names and seq hits in the background file
backFimoDict = Fimo.parseFimo(fimoOutDir+'/fimo.txt', strand)
#write the motifs in the format for sequence coverge algorithms
motifSeqFileName = args.jid + '_back_motif_hits_in_seqs'
general_utils.writeMotifSeqFile(backFimoDict, motifSeqFileName)
fileList.append(motifSeqFileName)
#output results
depthOutFileName = args.jid + '_depth_results.csv'
finalSelectMotifList = outputResults(depthOutFileName, depthDict, backFimoDict, filterMinNumSeq, idMotifDict, foreNumSeqs, backNumSeqs)
fileList.append(depthOutFileName)
####
#check if want to produce motif logos
####
if confDict['motif.logo']['opt'] == 'true':
if makeLogo == 'false':
#get all the motif IDs from all the depths
mIdList = []
for depthVal in sorted(depthDict.iterkeys(), key=int):
mIdList.extend(depthDict[depthVal][0])
seqCovPwmFileName = args.jid + '_seqCovMotifs.pwm'
logoDirName = args.jid + '_seqCovLogo'
fileList.append(seqCovPwmFileName)
fileList.append(logoDirName)
general_utils.makeMotifLogo(mIdList, motifDict, logoDirName, seqCovPwmFileName, idMotifDict, 'cov', finalSelectMotifList)
def main():
pass
if( __name__ == "__main__" ):
main(sys.argv)
else:
pass