/
lib.py
329 lines (286 loc) · 9.72 KB
/
lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
"""
This file defines useful functions
"""
import os,sys
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from scipy import stats
from itertools import combinations
def generateConfoldscript(confoldPath, contact_new, secondary_structure_file, output_path_new):
"""
Generate bash script for CONFOLD2
Parameters
----------
confoldPath: str
the path of confold2 software
contact_new: str
the updated contact filename
secondary_structure_file: str
the filename of the secondary structure provided in the input file (*.inp)
output_path_new: str
the output path for CONFOLD2 results
Returns
----------
sge_file: str
the submission script for CONFOLD2
"""
jobname=os.path.basename(output_path_new)
sge_file=os.path.dirname(output_path_new)+"/sge.confold."+jobname
f_sge=open(sge_file,"w")
f_sge.write("#$ -S /bin/bash"+"\n"
+"#$ -q analysis.q"+"\n"
+"#$ -pe orte 1"+"\n"
+"#$ -o "+os.path.dirname(output_path_new)+"/confold_"+jobname+".log"+"\n"
+"#$ -e "+os.path.dirname(output_path_new)+"/confold_"+jobname+".err"+"\n"
+"#$ -N confold_"+jobname+"\n"
+""+"\n"
+"cd "+os.path.dirname(output_path_new)+"\n"
+""+"\n"
+confoldPath+"/core.pl -mcount 20 -rr "+contact_new+" -ss "+secondary_structure_file+" -selectrr all -o "+output_path_new+"\n"
+""+"\n"
+"cd "+output_path_new+"\n"
+"cp "+output_path_new+"/stage2/*model1*pdb "+output_path_new+"/\n"
+"tar -czf stage1n2.tar.gz stage1 stage2 input"+"\n"
+"rm -r stage1 stage2 input"+"\n"
+"mv "+sge_file+" "+output_path_new+"/\n"
)
f_sge.close()
return sge_file
def submitConfoldscript(sge_file):
"""
Submit CONFOLD2 job
Parameters
----------
sge_file: str
the submission script for CONFOLD2
"""
cmd="chmod +x "+sge_file
os.system(cmd)
cmd="qsub "+sge_file
os.system(cmd)
def update_contact_file(contact_default, cluster, contact_new):
"""
Compute the new contact file based on the state vector
Parameters
----------
contact_default: str
the filename of the default contact set genearted by generateDefault.py
cluster: numpy.ndarray (int)
the cluster vector with the shape N*2, where N is the number of to be removed contacts
contact_new: str
the output filename for the new contact set
Returns
----------
contact_new: str
the updated contact file serves as input for CONFOLD2
contact_num: int
the updated number of contacts
"""
f_new=open(contact_new, "w")
f_default=open(contact_default, "r")
indx=0
contact_num=0
for line in f_default:
if indx==0:
# copy the first line: sequence info
f_new.write(line)
elif indx-1 not in cluster:
# copy the active contact info
f_new.write(line)
contact_num+=1
indx+=1
f_default.close()
f_new.close()
return contact_num
def fold(confoldPath, contact_new, secondary_structure_file, output_path_new):
"""
Compute the best 3D structure using CONFOLD2
Parameters
----------
confoldPath: str
the path of confold2 software
contact_new: str
the updated contact filename
secondary_structure_file: str
the filename of the secondary structure provided in the input file (*.inp)
output_path_new: str
the output path for CONFOLD2 results
Returns
----------
contact_new: str
the updated contact file serves as input for CONFOLD2
contact_num: int
the updated number of contacts
"""
sge_file=generateConfoldscript(confoldPath, contact_new, secondary_structure_file, output_path_new)
submitConfoldscript(sge_file)
def tm_score(confoldPath, structure_new, structure_default):
"""
Compute the TM score and RMSD between the new and default structure
Parameters
----------
confoldPath: str
the path of confold2 software
structure_new: str
the predicted structure based on the updated contact set
structure_default: str
the predicted structure based on the default contact set
Returns
----------
tmscore: float (range: (0, 1])
the TM score between the new and default structure
rmsd: float
the RMSD between the new and default structure
"""
output=structure_new.split(".")[0]+"-evaluation.txt"
cmd=confoldPath+"/third-party-programs/TMscore "+structure_new+" "+structure_default+" &> "+output
os.system(cmd)
# read TM output file
f=open(output,"r")
for line in f:
columns=line.strip().split()
if len(columns)>3 and columns[0]=="RMSD" and columns[3]=="common":
rmsd=float(columns[-1])
if len(columns)>1 and columns[0]=="TM-score" and columns[1]=="=":
tmscore=float(columns[2])
f.close()
return tmscore, rmsd
def get_atom_position(structure_new, residue_id):
"""
Compute the xyz position of CB atom (CA atom for GLY) for the given residue
Parameters
----------
structure_new: str
the predicted structure based on the updated contact set
residue_id: int
the 1-indexed residue index
Returns
----------
xyz: numpy.ndarray (float)
the atom position vector with the shape (3, )
"""
f=open(structure_new, "r")
for line in f:
columns=line.strip().split()
if len(columns)==11 and columns[0]=="ATOM" and columns[4]==str(residue_id):
if columns[3]=="GLY" and columns[2]=="CA":
# For GLY, return the position vector of CA atom
xyz=[float(columns[5]), float(columns[6]), float(columns[7])]
elif columns[2]=="CB":
xyz=[float(columns[5]), float(columns[6]), float(columns[7])]
if len(columns)==12 and columns[0]=="ATOM" and columns[4]=="A" and columns[5]==str(residue_id):
if columns[3]=="GLY" and columns[2]=="CA":
# For GLY, return the position vector of CA atom
xyz=[float(columns[6]), float(columns[7]), float(columns[8])]
elif columns[2]=="CB":
xyz=[float(columns[6]), float(columns[7]), float(columns[8])]
f.close()
xyz=np.array(xyz)
return xyz
def contact_center(structure_new, contact_pair):
"""
Compute the center of two vectors defining xyz positions
Parameters
----------
structure_new: str
the predicted structure based on the updated contact set
contact_pair: list (int)
the residue index vector with the shape (2, )
Returns
----------
center: numpy.ndarray (float)
the center of two atom position vectors with the shape (3, )
"""
r_1=get_atom_position(structure_new, contact_pair[0])
r_2=get_atom_position(structure_new, contact_pair[1])
center=0.5*(r_1+r_2)
return center
def contact_filter(structure_new, contact_pair):
"""
Check whether a contact pair is satified in the given structure
Definition of a satisfied contact pair: CB-CB distance <= 8 Angstorms (CA for Glycine)
Parameters
----------
structure_new: str
the predicted structure based on the updated contact set
contact_pair: list (int)
the residue index vector with the shape (2, )
Returns
----------
contact_fil: bool
1: contact pair is satisfied in the given structure
0: contact pair is not satisfied in the given structure
"""
r_1=get_atom_position(structure_new, contact_pair[0])
r_2=get_atom_position(structure_new, contact_pair[1])
dist=np.sqrt(np.sum((r_1-r_2)**2))
if dist <= 8:
contact_fil=1
else:
contact_fil=0
return contact_fil
def contact_satisfaction(structure_new, contact_default_info, Q_max, optimal_num):
"""
Compute the normalized contact satisfaction score for the given structure
Definition of the normalized contact satisfaction score: Q_s=Q/Q_max
Q: the sum of the weights of satisfied contacts in the given structure
Q_max (the maximum contact satisfication): the sum of all contact weights
Parameters
----------
structure_new: str
the predicted structure based on the updated contact set
contact_default_info: pandas.DataFrame (three columns: res_i, res_j, p)
the information of the default contacts containing the resiude index and weight of each contact
Q_max: float
the maximum contact satisfication (the sum of all contact weights)
optimal_num: int
the total number of the default contacts
Returns
----------
Q_s: float (range: (0, 1])
the normalized contact satisfaction score for the given structure
"""
Q=0
for i in range(optimal_num):
contact_pair=[int(contact_default_info.iloc[i]["res_i"]), int(contact_default_info.iloc[i]["res_j"])]
if contact_filter(structure_new, contact_pair)==1:
Q+=float(contact_default_info.iloc[i]["p"])
Q_s=Q/Q_max
return Q_s
def cal_reward(confoldPath, structure_new, structure_default, contact_default_info, Q_max, optimal_num, tm_thres, Q_s_thres):
"""
Compute the reward function
Reward=1/tmscore-1
Q_s: the normalized contact satisfaction score for the new structure
tmscore: the TM score between the new and default structure
Parameters
----------
confoldPath: str
the path of confold2 software
structure_new: str
the predicted structure based on the updated contact set
structure_default: str
the predicted structure based on the default contact set
contact_default_info: pandas.DataFrame (three columns: res_i, res_j, p)
the information of the default contacts containing the resiude index and weight of each contact
Q_max: float
the maximum contact satisfication (the sum of all contact weights)
optimal_num: int
the total number of the default contacts
tm_thres: float
the threshold value of tmscore
Q_s_thres: float
the threshold value of Q_s (the normalized contact satisfaction score for the given structure)
Returns
----------
reward: float
the reward of the new structure
"""
tmscore, rmsd=tm_score(confoldPath, structure_new, structure_default)
Q_s=contact_satisfaction(structure_new, contact_default_info, Q_max, optimal_num)
if tmscore>=tm_thres and Q_s>=Q_s_thres:
reward=1.0/tmscore-1
else:
reward=0
return reward, Q_s, tmscore