-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_target_mmseqs_database.py
237 lines (200 loc) · 11.8 KB
/
update_target_mmseqs_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import argparse
import gzip
import json
import logging
import multiprocessing
import pathlib
import traceback
from itertools import repeat
import numpy as np
from CONFIG.FOLDER_STRUCTURE import TARGET_MMSEQS_DB_NAME, ATOMS, SEQUENCES, STRUCTURE_FILES_PATH, \
DEFAULT_NAME, SEQ_ATOMS_DATASET_PATH, MMSEQS_DATABASES_PATH, MERGED_SEQUENCES, TARGET_DB_CONFIG
from CONFIG.RUNTIME_PARAMETERS import CPU_COUNT
from CONFIG.get_config_dict import target_db_config
from utils.bio_utils import PROTEIN_LETTERS
from structure_files_parsers.parse_pdb import parse_pdb
from structure_files_parsers.parse_mmcif import parse_mmcif
from CPP_lib.libAtomDistanceIO import save_atoms
from CPP_lib.libAtomDistanceIO import initialize as initialize_CPP_LIB
from utils.mmseqs_utils import mmseqs_createdb
from utils.mmseqs_utils import mmseqs_createindex
from utils.utils import create_unix_timestamp_folder, merge_files_binary, parse_input_paths
# need to parse different type of files? Add a pattern with a parser in this dict.
# read structure_files_parsers/README.md for more information about how to create new parser.
STRUCTURE_FILES_PARSERS = {
'.pdb': parse_pdb,
'.pdb.gz': parse_pdb,
'.cif': parse_mmcif,
'.cif.gz': parse_mmcif,
'.ent': parse_pdb,
'.ent.gz': parse_pdb
}
###################################################################################################################
# main:
# 1. iterates over --input searching for file extensions that match the STRUCTURE_FILES_PARSERS keys.
# 2. filter out protein_ids that already exists in SEQ_ATOMS_DATASET_PATH / target_db_name / ATOMS
# 3. process_structure_file in parallel
# 4. merges all sequences from SEQ_ATOMS_DATASET_PATH / target_db_name / SEQUENCES
# 5. creates and index a new mmseqs target database inside MMSEQS_DATABASES_PATH / target_db_name / timestamp
# 6. creates a new_structure_ids.json, so it is easier to remove unwanted ATOM and SEQUENCE after mistake
#
# process_structure_file:
# 1. reads the structure file extracting sequence and atom positions
# 2. skip short sequences and truncate ones that size is over MAX_TARGET_CHAIN_LENGTH inside target_db_config.json
# 3. save extracted data:
# sequence - protein_id.faa file containing single sequence f.write(f">{protein_id}\n{sequence}\n")
# SEQ_ATOMS_DATASET_PATH / target_db_name / SEQUENCES / (protein_id + ".faa")
# atom positions - binary file containing positions of all atoms and correlated amino acid chain index
# SEQ_ATOMS_DATASET_PATH / target_db_name / ATOMS / (protein_id + ".bin")
# For more information on how those binary are saved check out source code at CPP_lib/atoms_file_io.h
###################################################################################################################
def parse_args():
parser = argparse.ArgumentParser(description="Read structure files from folders --input to extract sequence and atom positions. "
"Create and index new --output MMSEQS2 database")
parser.add_argument("-p", "--project_name", required=False, default=DEFAULT_NAME, help="Name for the target database")
parser.add_argument("-i", "--input", nargs='+', required=False, default=None,
help=f"List of folder or file paths containing structure files. Both absolute and relative to {STRUCTURE_FILES_PATH} are accepted."
f"If not provided pipeline will search in {STRUCTURE_FILES_PATH}/--name. "
f"Use '--input .' to process all files within {STRUCTURE_FILES_PATH}") # logic described here is implemented in parse_input_paths
# todo add max_target_chain_length argument and inform user if there is difference between this arg and existing target_db_config.json
# parser.add_argument("-m", "--max_target_chain_length", required=False, default=None, help="If protein chain is longer than this value, it will be truncated")
parser.add_argument("--overwrite", action="store_true", help="Flag to override existing sequences and atom positions")
return parser.parse_args()
def process_structure_file(structure_file, save_path, max_target_chain_length):
protein_id = structure_file.name
# extract sequence and atom positions from structure file
try:
for pattern in STRUCTURE_FILES_PARSERS.keys():
if protein_id.endswith(pattern):
if protein_id.endswith('.gz'):
f = gzip.open(structure_file, 'rt')
else:
f = open(structure_file, 'r')
atom_amino_group, positions, groups = STRUCTURE_FILES_PARSERS[pattern](f)
f.close()
protein_id = structure_file.name.replace(pattern, '')
break
except Exception:
print("EXCEPTION WHILE READING FILE ", str(structure_file))
logging.error(traceback.format_exc())
return "file reading exceptions"
# process and store sequence and atom positions in SEQ_ATOMS_DATASET_PATH / output_name
sequence_path = save_path / SEQUENCES / (protein_id + ".faa")
atoms_path = save_path / ATOMS / (protein_id + ".bin")
try:
_, groups = np.unique(groups, return_index=True)
groups.sort()
if len(groups) < 9:
# print("Files containing protein chains shorter than 9 amino acids might be corrupted or contain DNA ", file)
return "sequences too short, probably DNA or corrupted"
truncated = False
if len(groups) > max_target_chain_length:
# print(f"Protein chains with more than {max_target_chain_length} are truncated. {str(structure_file)}\n"
# f"Change CONFIG/RUNTIME_PARAMETERS.py :max_target_chain_length and rerun this script with --overwrite to process this file again")
truncated = True
group_indexes = groups[:max_target_chain_length]
group_indexes = np.append(group_indexes, groups[max_target_chain_length]).astype(np.int32)
else:
group_indexes = np.append(groups, positions.shape[0]).astype(np.int32)
sequence = ''.join([PROTEIN_LETTERS[atom_amino_group[i]] for i in group_indexes[:-1]])
with open(sequence_path, "w") as f:
f.write(f">{protein_id}\n{sequence}\n")
save_atoms(positions, group_indexes, str(atoms_path))
if truncated:
return f"SUCCEED, but sequences and contact maps got truncated to {max_target_chain_length}"
else:
return "SUCCEED"
except Exception:
print("EXCEPTION DURING FILE PROCESSING ", str(structure_file))
logging.error(traceback.format_exc())
sequence_path.unlink(missing_ok=True)
atoms_path.unlink(missing_ok=True)
return "file processing exceptions"
def main(input_paths, project_name, overwrite):
seq_atoms_path = SEQ_ATOMS_DATASET_PATH / project_name
seq_atoms_path.mkdir(exist_ok=True, parents=True)
print("Sequences and Atoms positions will be stored in: ", seq_atoms_path)
(seq_atoms_path / SEQUENCES).mkdir(exist_ok=True)
(seq_atoms_path / ATOMS).mkdir(exist_ok=True)
# get MAX_TARGET_CHAIN_LENGTH from existing config or create one
target_db_path = MMSEQS_DATABASES_PATH / project_name
target_db_path.mkdir(exist_ok=True, parents=True)
if (target_db_path / TARGET_DB_CONFIG).exists():
print(f"Using existing TARGET_DB_CONFIG {target_db_path / TARGET_DB_CONFIG}")
max_target_chain_length = json.load(open(target_db_path / TARGET_DB_CONFIG))["MAX_TARGET_CHAIN_LENGTH"]
else:
print(f"Creating a new PROJECT_CONFIG {target_db_path / TARGET_DB_CONFIG}")
config = target_db_config()
json.dump(config, open(target_db_path / TARGET_DB_CONFIG, "w"), indent=4)
max_target_chain_length = config["MAX_TARGET_CHAIN_LENGTH"]
print(f"\t{max_target_chain_length} - MAX_TARGET_CHAIN_LENGTH")
print("Searching for structure files in paths:")
for input_path in input_paths:
print(f"\t{str(input_path)}")
# dict [ structure_file_id ] = pathlib.Path( structure_file_path )
structure_files = dict()
# Iterate input_paths and check if file extension matches any key in STRUCTURE_FILES_PARSERS
for input_path in input_paths:
print(f"{str(input_path)}")
if input_path.is_file():
for pattern in STRUCTURE_FILES_PARSERS.keys():
if str(input_path).endswith(pattern):
print(f"\tFile {input_path} is {pattern}")
structure_file_id = input_path.name[:-len(pattern)]
structure_files[structure_file_id] = input_path
continue
elif input_path.is_dir():
files_inside_input_path = list(input_path.glob("**/*"))
for pattern in STRUCTURE_FILES_PARSERS.keys():
structure_file_paths = list(filter(lambda x: str(x).endswith(pattern), files_inside_input_path))
if len(structure_file_paths) == 0:
continue
print(f"\tFound {len(structure_file_paths)} {pattern} files in {input_path}")
structure_file_ids = [x.name[:-len(pattern)] for x in structure_file_paths]
structure_files.update(zip(structure_file_ids, structure_file_paths))
else:
print(f"\tUnable to find {str(input_path)}")
if len(structure_files) == 0:
print("No structure files found")
return
print(f"Found {len(structure_files)} structure files to extract")
# search for already processed protein_ids to skip them
if not overwrite:
duplicated_ids_counter = 0
for structure_id in list(structure_files.keys()):
if (seq_atoms_path / ATOMS / (structure_id + ".bin")).exists():
structure_files.pop(structure_id)
duplicated_ids_counter += 1
print(f"Found {duplicated_ids_counter} duplicated IDs")
# actual processing of structure files takes place here
print("\nProcessing", len(structure_files), "files")
initialize_CPP_LIB()
with multiprocessing.Pool(processes=CPU_COUNT) as p:
results = p.starmap(process_structure_file, zip(structure_files.values(), repeat(seq_atoms_path.absolute()), repeat(max_target_chain_length)))
result_types, types_counts = np.unique(results, return_counts=True)
for i in range(len(result_types)):
print(f"\t{types_counts[i]} {result_types[i]}")
freshly_added_ids = []
structure_file_ids = list(structure_files.keys())
for i in range(len(results)):
if results[i].startswith("SUCCEED"):
freshly_added_ids.append(structure_file_ids[i])
if len(freshly_added_ids) == 0:
print(f"\nNo new protein structures added.\nNo new target database will be created.")
return
sequence_files = list((seq_atoms_path / SEQUENCES).glob("**/*.faa"))
print("\nMerging " + str(len(sequence_files)) + " sequence files for mmseqs2")
merge_files_binary(sequence_files, seq_atoms_path / MERGED_SEQUENCES)
mmseqs2_db_path = create_unix_timestamp_folder(MMSEQS_DATABASES_PATH / project_name)
print("Creating new target mmseqs2 database " + str(mmseqs2_db_path))
mmseqs_createdb(seq_atoms_path / MERGED_SEQUENCES, mmseqs2_db_path / TARGET_MMSEQS_DB_NAME)
print("Indexing new target mmseqs2 database " + str(mmseqs2_db_path))
mmseqs_createindex(mmseqs2_db_path / TARGET_MMSEQS_DB_NAME)
print("Saving freshly added sequence ids to " + str(mmseqs2_db_path / "new_structure_ids.json"))
json.dump(sorted(freshly_added_ids), open(mmseqs2_db_path / "new_structure_ids.json", "w"), indent=4)
if __name__ == '__main__':
args = parse_args()
project_name = pathlib.Path(args.project_name)
overwrite = args.overwrite
input_paths = parse_input_paths(args.input, project_name, STRUCTURE_FILES_PATH)
main(input_paths, project_name, overwrite)