-
Notifications
You must be signed in to change notification settings - Fork 28
/
protein.py
2369 lines (1990 loc) · 124 KB
/
protein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import logging
import requests
import shutil
import pandas as pd
import numpy as np
import os.path as op
import seaborn as sns
from slugify import Slugify
from collections import defaultdict
from six.moves.urllib.error import URLError
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.PDB.PDBExceptions import PDBException, PDBConstructionException
from msgpack.exceptions import ExtraData
from cobra.core import DictList
import ssbio.utils
import ssbio.databases.pdb
import ssbio.protein.sequence.utils.alignment
import ssbio.protein.sequence.utils.fasta
import ssbio.protein.structure.properties.quality
from ssbio.core.object import Object
from ssbio.protein.sequence.seqprop import SeqProp
from ssbio.databases.kegg import KEGGProp
from ssbio.databases.uniprot import UniProtProp
from ssbio.protein.structure.structprop import StructProp
from ssbio.databases.pdb import PDBProp
from ssbio.protein.structure.homology.itasser.itasserprop import ITASSERProp
from ssbio.protein.structure.homology.itasser.itasserprep import ITASSERPrep
custom_slugify = Slugify(safe_chars='-_.')
log = logging.getLogger(__name__)
class Protein(Object):
"""Store information on a protein that represents the translated unit of a gene.
The main utilities of this class are to:
#. Load, parse, and store the same (ie. from different database sources) or similar (ie. from different strains) \
protein sequences as ``SeqProp`` objects in the ``sequences`` attribute
#. Load, parse, and store multiple experimental or predicted protein structures as ``StructProp`` \
objects in the ``structures`` attribute
#. Set a single representative sequence and structure
#. Calculate, store, and access pairwise sequence alignments to the representative sequence or structure
#. Provide summaries of alignments and mutations seen
#. Map between residue numbers of sequences and structures
Args:
ident (str): Unique identifier for this protein
description (str): Optional description for this protein
root_dir (str): Path to where the folder named by this protein's ID will be created.
Default is current working directory.
pdb_file_type (str): ``pdb``, ``pdb.gz``, ``mmcif``, ``cif``, ``cif.gz``, ``xml.gz``, ``mmtf``, ``mmtf.gz`` -
choose a file type for files downloaded from the PDB
Todo:
- Implement structural alignment objects
"""
__representative_sequence_attributes = ['gene', 'uniprot', 'kegg', 'pdbs',
'sequence_path', 'metadata_path']
__representative_structure_attributes = ['is_experimental', 'reference_seq_top_coverage', 'date', 'description',
'resolution','taxonomy_name']
def __init__(self, ident, description=None, root_dir=None, pdb_file_type='mmtf'):
    """Initialize containers for sequences, structures, and alignments of this protein.

    Args:
        ident (str): Unique identifier for this protein.
        description (str): Optional free-text description.
        root_dir (str): If provided, the protein/sequence/structure folder tree is
            created here (via the ``root_dir`` property setter).
        pdb_file_type (str): File format to use for structures downloaded from the PDB.
    """
    Object.__init__(self, id=ident, description=description)
    self.pdb_file_type = pdb_file_type
    """str: ``pdb``, ``pdb.gz``, ``mmcif``, ``cif``, ``cif.gz``, ``xml.gz``, ``mmtf``, ``mmtf.gz`` - choose a file
    type for files downloaded from the PDB"""
    # Create directories -- assigning to the root_dir property also makes the
    # protein, sequence, and structure subfolders (see the setter)
    self._root_dir = None
    if root_dir:
        self.root_dir = root_dir
    # Sequences
    self.sequences = DictList()
    """DictList: Stored protein sequences which are related to this protein"""
    self.representative_sequence = None
    """SeqProp: Sequence set to represent this protein"""
    # Structures
    self.structures = DictList()
    """DictList: Stored protein structures which are related to this protein"""
    self.representative_structure = None
    """StructProp: Structure set to represent this protein, usually in monomeric form"""
    self.representative_chain = None
    """str: Chain ID in the representative structure which best represents a sequence"""
    self.representative_chain_seq_coverage = 0
    """float: Percent identity of sequence coverage for the representative chain"""
    # Alignments
    self.sequence_alignments = DictList()
    """DictList: Pairwise or multiple sequence alignments stored as ``Bio.Align.MultipleSeqAlignment`` objects"""
    self.structure_alignments = DictList()
    """DictList: Pairwise or multiple structure alignments - currently a placeholder"""
@property
def root_dir(self):
    """str: Path to where the folder named by this protein's ID will be created. Default is current working
    directory. ``None`` until explicitly set."""
    return self._root_dir
@root_dir.setter
def root_dir(self, path):
    """Set the root directory, create this protein's folder tree, and propagate the
    new directories to all stored SeqProp and StructProp children.

    Raises:
        ValueError: If ``path`` is empty or does not exist on disk.
        IOError: If the root is being *changed* and no existing protein folder
            (``<id>`` or ``<id>_protein``) is found under the new path.
    """
    if not path:
        raise ValueError('No path specified')
    if not op.exists(path):
        raise ValueError('{}: folder does not exist'.format(path))
    if self._root_dir:
        log.debug('Changing root directory of Protein "{}" from {} to {}'.format(self.id, self.root_dir, path))
        # When relocating an already-rooted protein, its folder must already exist
        # at the destination (possibly with the "_protein" suffix -- see protein_dir)
        if not op.exists(op.join(path, self.id)) and not op.exists(op.join(path, self.id) + '_protein'):
            raise IOError('Protein "{}" does not exist in folder {}'.format(self.id, path))
    self._root_dir = path
    # (Re)create the protein/sequence/structure directory tree
    for d in [self.protein_dir, self.sequence_dir, self.structure_dir]:
        ssbio.utils.make_dir(d)
    # Propagate changes to SeqProps
    # hasattr guards: the setter can run from __init__ before these attributes exist
    if hasattr(self, 'sequences'):
        for seq in self.sequences:
            seq.sequence_dir = self.sequence_dir
            seq.metadata_dir = self.sequence_dir
    if hasattr(self, 'representative_sequence'):
        if self.representative_sequence:
            self.representative_sequence.sequence_dir = self.sequence_dir
            self.representative_sequence.metadata_dir = self.sequence_dir
    # Propagate changes to StructProps
    if hasattr(self, 'structures'):
        for struct in self.structures:
            struct.structure_dir = self.structure_dir
    if hasattr(self, 'representative_structure'):
        if self.representative_structure:
            self.representative_structure.structure_dir = self.structure_dir
@property
def protein_dir(self):
    """str: Path to this protein's own folder, or ``None`` if no root directory is set."""
    if not self.root_dir:
        log.warning('Root directory not set')
        return None
    # Avoid a child folder shadowing its parent: if the protein ID matches the root
    # folder's name, suffix the protein folder with "_protein"
    folder_name = self.id
    if op.basename(self.root_dir) == folder_name:
        folder_name += '_protein'
    return op.join(self.root_dir, folder_name)
@property
def sequence_dir(self):
    """str: Directory where sequence related files are stored, or ``None`` if no root is set."""
    if not self.root_dir:
        log.debug('Root directory not set')
        return None
    return op.join(self.protein_dir, 'sequences')
@property
def structure_dir(self):
    """str: Directory where structure related files are stored, or ``None`` if no root is set."""
    if not self.root_dir:
        log.debug('Root directory not set')
        return None
    return op.join(self.protein_dir, 'structures')
@property
def protein_statistics(self):
    """dict: Basic counts and identifiers summarizing this protein's sequences,
    structures, and alignments."""
    stats = {'id': self.id,
             'sequences': [s.id for s in self.sequences],
             'num_sequences': self.num_sequences}
    if self.representative_sequence:
        stats['representative_sequence'] = self.representative_sequence.id
    stats['num_structures'] = self.num_structures
    stats['experimental_structures'] = [s.id for s in self.get_experimental_structures()]
    stats['num_experimental_structures'] = self.num_structures_experimental
    stats['homology_models'] = [s.id for s in self.get_homology_models()]
    stats['num_homology_models'] = self.num_structures_homology
    if self.representative_structure:
        stats['representative_structure'] = self.representative_structure.id
        stats['representative_chain'] = self.representative_chain
        stats['representative_chain_seq_coverage'] = self.representative_chain_seq_coverage
    stats['num_sequence_alignments'] = len(self.sequence_alignments)
    stats['num_structure_alignments'] = len(self.structure_alignments)
    return stats
@property
def num_sequences(self):
    """int: Total count of stored sequences."""
    return len(self.sequences)
@property
def num_structures(self):
    """int: Total count of stored structures (experimental plus homology models)."""
    return len(self.structures)
@property
def num_structures_experimental(self):
    """int: Count of stored structures flagged as experimental."""
    return len(self.get_experimental_structures())
@property
def num_structures_homology(self):
    """int: Count of stored structures that are homology models."""
    return len(self.get_homology_models())
def get_experimental_structures(self):
    """Return a DictList of the stored structures whose ``is_experimental`` flag is set."""
    experimental = [s for s in self.structures if s.is_experimental]
    return DictList(experimental)
def get_homology_models(self):
    """Return a DictList of the stored structures that are homology models
    (``is_experimental`` is falsy)."""
    models = [s for s in self.structures if not s.is_experimental]
    return DictList(models)
def filter_sequences(self, seq_type):
    """Return only the stored sequences that are instances of ``seq_type``.

    Args:
        seq_type (SeqProp): Object type (class) to filter on, e.g. ``KEGGProp``.

    Returns:
        DictList: A filtered DictList containing sequences of the requested type only.
    """
    matching = [s for s in self.sequences if isinstance(s, seq_type)]
    return DictList(matching)
def load_kegg(self, kegg_id, kegg_organism_code=None, kegg_seq_file=None, kegg_metadata_file=None,
              set_as_representative=False, download=False, outdir=None, force_rerun=False):
    """Load a KEGG ID, sequence, and metadata files into the sequences attribute.

    Args:
        kegg_id (str): KEGG ID
        kegg_organism_code (str): KEGG organism code to prepend to the kegg_id if not part of it already.
            Example: ``eco:b1244``, ``eco`` is the organism code
        kegg_seq_file (str): Path to KEGG FASTA file
        kegg_metadata_file (str): Path to KEGG metadata file (raw KEGG format)
        set_as_representative (bool): If this KEGG ID should be set as the representative sequence
        download (bool): If the KEGG sequence and metadata files should be downloaded if not provided
        outdir (str): Where the sequence and metadata files should be downloaded to
        force_rerun (bool): If ID should be reloaded and files redownloaded

    Returns:
        KEGGProp: object contained in the sequences attribute

    Raises:
        ValueError: If ``download`` is True but neither ``outdir`` nor a sequence
            directory is available.
    """
    if download:
        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')
    if kegg_organism_code:
        kegg_id = kegg_organism_code + ':' + kegg_id
    # If we have already loaded the KEGG ID
    if self.sequences.has_id(kegg_id):
        # Remove it if we want to force rerun things
        if force_rerun:
            existing = self.sequences.get_by_id(kegg_id)
            self.sequences.remove(existing)
        # Otherwise just get that KEGG object
        else:
            log.debug('{}: KEGG ID already present in list of sequences'.format(kegg_id))
            kegg_prop = self.sequences.get_by_id(kegg_id)
    # Check again (instead of else) in case we removed it if force rerun
    if not self.sequences.has_id(kegg_id):
        kegg_prop = KEGGProp(id=kegg_id, seq=None, fasta_path=kegg_seq_file, txt_path=kegg_metadata_file)
        if download:
            kegg_prop.download_seq_file(outdir, force_rerun)
            kegg_prop.download_metadata_file(outdir, force_rerun)
        # Check if KEGG sequence matches a potentially set representative sequence
        # Do not add any info if a UniProt ID was already mapped though, we want to use that
        if self.representative_sequence:
            if not self.representative_sequence.uniprot:
                if kegg_prop.equal_to(self.representative_sequence):
                    # Update the representative sequence field with KEGG metadata
                    self.representative_sequence.update(kegg_prop.get_dict(), only_keys=['sequence_path',
                                                                                        'metadata_path',
                                                                                        'kegg',
                                                                                        'description',
                                                                                        'taxonomy',
                                                                                        'id',
                                                                                        'pdbs',
                                                                                        'uniprot',
                                                                                        'seq_record',
                                                                                        'gene_name',
                                                                                        'refseq'])
                else:
                    # TODO: add option to use manual or kegg sequence if things do not match
                    log.warning('{}: representative sequence does not match mapped KEGG sequence.'.format(self.id))
        self.sequences.append(kegg_prop)
    if set_as_representative:
        self.representative_sequence = kegg_prop
    return self.sequences.get_by_id(kegg_id)
def load_uniprot(self, uniprot_id, uniprot_seq_file=None, uniprot_xml_file=None, download=False, outdir=None,
                 set_as_representative=False, force_rerun=False):
    """Load a UniProt ID and associated sequence/metadata files into the sequences attribute.

    Sequence and metadata files can be provided, or alternatively downloaded with the download flag set to True.
    Metadata files will be downloaded as XML files.

    Args:
        uniprot_id (str): UniProt ID/ACC
        uniprot_seq_file (str): Path to FASTA file
        uniprot_xml_file (str): Path to UniProt XML file
        download (bool): If sequence and metadata files should be downloaded
        outdir (str): Output directory for sequence and metadata files
        set_as_representative (bool): If this sequence should be set as the representative one
        force_rerun (bool): If files should be redownloaded and metadata reloaded

    Returns:
        UniProtProp: Sequence that was loaded into the ``sequences`` attribute

    Raises:
        ValueError: If ``download`` is True but neither ``outdir`` nor a sequence
            directory is available.
    """
    if download:
        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')
    # If we have already loaded the UniProt ID
    if self.sequences.has_id(uniprot_id):
        # Remove it if we want to force rerun things
        if force_rerun:
            existing = self.sequences.get_by_id(uniprot_id)
            self.sequences.remove(existing)
        # Otherwise just get that UniProt object
        else:
            log.debug('{}: UniProt ID already present in list of sequences'.format(uniprot_id))
            uniprot_prop = self.sequences.get_by_id(uniprot_id)
    # Re-check (instead of else) in case the entry was removed by force_rerun above
    if not self.sequences.has_id(uniprot_id):
        uniprot_prop = UniProtProp(id=uniprot_id,
                                   seq=None,
                                   fasta_path=uniprot_seq_file,
                                   xml_path=uniprot_xml_file)
        if download:
            uniprot_prop.download_metadata_file(outdir=outdir,
                                                force_rerun=force_rerun)
            uniprot_prop.download_seq_file(outdir=outdir, force_rerun=force_rerun)
        # Also check if UniProt sequence matches a potentially set representative sequence
        if self.representative_sequence:
            # Test equality
            if uniprot_prop.equal_to(self.representative_sequence):
                # Update the representative sequence field with UniProt metadata if it is the same
                self.representative_sequence.update(uniprot_prop.get_dict(), only_keys=['sequence_path',
                                                                                       'metadata_path',
                                                                                       'kegg',
                                                                                       'description',
                                                                                       'taxonomy',
                                                                                       'id',
                                                                                       'pdbs',
                                                                                       'uniprot',
                                                                                       'seq_record',
                                                                                       'gene_name',
                                                                                       'refseq'])
            else:
                # TODO: add option to use manual or uniprot sequence if things do not match
                log.warning('{}: representative sequence does not match mapped UniProt sequence'.format(self.id))
        self.sequences.append(uniprot_prop)
    if set_as_representative:
        self.representative_sequence = uniprot_prop
    return self.sequences.get_by_id(uniprot_id)
def load_manual_sequence_file(self, ident, seq_file, copy_file=False, outdir=None, set_as_representative=False):
    """Load a manual sequence, given as a FASTA file and optionally set it as the representative sequence.

    Also store it in the sequences attribute.

    Args:
        ident (str): Sequence ID
        seq_file (str): Path to sequence FASTA file
        copy_file (bool): If the FASTA file should be copied to the protein's sequences folder or the ``outdir``, if
            protein folder has not been set
        outdir (str): Path to output directory
        set_as_representative (bool): If this sequence should be set as the representative one

    Returns:
        SeqProp: Sequence that was loaded into the ``sequences`` attribute

    Raises:
        ValueError: If ``copy_file`` is True but no output directory can be determined.
    """
    if copy_file:
        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')
        shutil.copy(seq_file, outdir)
        # Bug fix: point at the *copied* file. shutil.copy places the file in outdir
        # under its basename; joining outdir with the original path was wrong for
        # nested paths and was silently ignored entirely for absolute ones
        # (op.join discards outdir when the second argument is absolute).
        seq_file = op.join(outdir, op.basename(seq_file))
    manual_sequence = SeqProp(id=ident, sequence_path=seq_file, seq=None)
    self.sequences.append(manual_sequence)
    if set_as_representative:
        self.representative_sequence = manual_sequence
    return self.sequences.get_by_id(ident)
def load_manual_sequence(self, seq, ident=None, write_fasta_file=False, outdir=None,
                         set_as_representative=False, force_rewrite=False):
    """Load a manual sequence given as a string and optionally set it as the representative sequence.

    Also store it in the sequences attribute.

    Args:
        seq (str, Seq, SeqRecord): Sequence string, Biopython Seq or SeqRecord object
        ident (str): Optional identifier for the sequence, required if seq is a string. Also will override existing
            IDs in Seq or SeqRecord objects if set.
        write_fasta_file (bool): If this sequence should be written out to a FASTA file
        outdir (str): Path to output directory
        set_as_representative (bool): If this sequence should be set as the representative one
        force_rewrite (bool): If the FASTA file should be overwritten if it already exists

    Returns:
        SeqProp: Sequence that was loaded into the ``sequences`` attribute

    Raises:
        ValueError: If ``seq`` is a plain string/Seq and no ``ident`` is given, or if
            ``write_fasta_file`` is True but no output directory can be determined.
    """
    # Resolve the identifier FIRST -- it may come from a SeqRecord, and it is needed
    # to name the FASTA output file. (Bug fix: the outfile name was previously built
    # before this step, producing a file literally named "None.faa" when ident was
    # taken from the SeqRecord.)
    if isinstance(seq, str) or isinstance(seq, Seq):
        if not ident:
            raise ValueError('ID must be specified if sequence is a string or Seq object')
    else:
        if not ident:
            # Use ID from SeqRecord ID if new ID not provided
            ident = seq.id
        else:
            # Overwrite SeqRecord ID with new ID if provided
            seq.id = ident
    if write_fasta_file:
        if not outdir:
            outdir = self.sequence_dir
            if not outdir:
                raise ValueError('Output directory must be specified')
        outfile = op.join(outdir, '{}.faa'.format(ident))
    else:
        outfile = None
    manual_sequence = SeqProp(id=ident, seq=seq)
    if write_fasta_file:
        manual_sequence.write_fasta_file(outfile=outfile, force_rerun=force_rewrite)
    self.sequences.append(manual_sequence)
    if set_as_representative:
        self.representative_sequence = manual_sequence
    return self.sequences.get_by_id(ident)
def set_representative_sequence(self, force_rerun=False):
    """Automatically consolidate loaded sequences (manual, UniProt, or KEGG) and set a single representative
    sequence.

    Manually set representative sequences override all existing mappings. UniProt mappings override KEGG mappings
    except when KEGG mappings have PDBs associated with them and UniProt doesn't.

    Args:
        force_rerun (bool): Set to True to recheck stored sequences

    Returns:
        SeqProp: Which sequence was set as representative
    """
    if len(self.sequences) == 0:
        log.error('{}: no sequences mapped'.format(self.id))
        return self.representative_sequence

    def _best_uniprot(mappings):
        # Rank UniProts by the sum of reviewed (bool) + num_pdbs: entries with PDBs
        # sort to the top, or if no PDBs, reviewed entries. sorted() is stable, so
        # ties keep their original order (first mapping wins).
        ranked = sorted(((u.id, u.ranking_score()) for u in mappings),
                        key=lambda tup: tup[1], reverse=True)
        return mappings.get_by_id(ranked[0][0])

    kegg_mappings = self.filter_sequences(KEGGProp)
    if len(kegg_mappings) > 0:
        kegg_to_use = kegg_mappings[0]
        if len(kegg_mappings) > 1:
            log.warning('{}: multiple KEGG mappings found, using the first entry {}'.format(self.id, kegg_to_use.id))
    uniprot_mappings = self.filter_sequences(UniProtProp)

    # If a representative sequence has already been set, nothing needs to be done
    if self.representative_sequence and not force_rerun:
        log.debug('{}: representative sequence already set'.format(self.id))
    # If there is a KEGG annotation and no UniProt annotations, set KEGG as representative
    elif len(kegg_mappings) > 0 and len(uniprot_mappings) == 0:
        self.representative_sequence = kegg_to_use
        log.debug('{}: representative sequence set from KEGG ID {}'.format(self.id, kegg_to_use.id))
    # If there are UniProt annotations and no KEGG annotations, set UniProt as representative
    elif len(kegg_mappings) == 0 and len(uniprot_mappings) > 0:
        best_u = _best_uniprot(uniprot_mappings)
        self.representative_sequence = best_u
        log.debug('{}: representative sequence set from UniProt ID {}'.format(self.id, best_u.id))
    # If there are both UniProt and KEGG annotations...
    elif len(kegg_mappings) > 0 and len(uniprot_mappings) > 0:
        # Use KEGG if the mapped UniProt is unique, and it has PDBs
        if kegg_to_use.num_pdbs > 0 and not uniprot_mappings.has_id(kegg_to_use.uniprot):
            self.representative_sequence = kegg_to_use
            log.debug('{}: representative sequence set from KEGG ID {}'.format(self.id, kegg_to_use.id))
        else:
            best_u = _best_uniprot(uniprot_mappings)
            self.representative_sequence = best_u
            log.debug('{}: representative sequence set from UniProt ID {}'.format(self.id, best_u.id))
    return self.representative_sequence
def pairwise_align_sequences_to_representative(self, gapopen=10, gapextend=0.5, outdir=None,
                                               engine='needle', parse=True, force_rerun=False):
    """Pairwise align all sequences in the sequences attribute to the representative sequence.

    Stores the alignments in the ``sequence_alignments`` DictList attribute.

    Args:
        gapopen (int): Only for ``engine='needle'`` - Gap open penalty is the score taken away when a gap is created
        gapextend (float): Only for ``engine='needle'`` - Gap extension penalty is added to the standard gap penalty
            for each base or residue in the gap
        outdir (str): Only for ``engine='needle'`` - Path to output directory. Default is the protein sequence
            directory.
        engine (str): ``biopython`` or ``needle`` - which pairwise alignment program to use.
            ``needle`` is the standard EMBOSS tool to run pairwise alignments.
            ``biopython`` is Biopython's implementation of needle. Results can differ!
        parse (bool): Store locations of mutations, insertions, and deletions in the alignment object (as an
            annotation)
        force_rerun (bool): Only for ``engine='needle'`` - Default False, set to True if you want to rerun the
            alignment if outfile exists.

    Raises:
        ValueError: If no output directory can be determined.
    """
    if not outdir:
        outdir = self.sequence_dir
        if not outdir:
            raise ValueError('Output directory must be specified')
    # Hoisted out of the loop: without a representative sequence no alignment can run,
    # so log the error once and return instead of repeating it for every stored sequence
    if not self.representative_sequence:
        log.error('{}: no representative sequence set, skipping gene'.format(self.id))
        return
    for seq in self.sequences:
        aln_id = '{}_{}'.format(self.id, seq.id)
        outfile = '{}.needle'.format(aln_id)
        if self.sequence_alignments.has_id(aln_id):
            log.debug('{}: alignment already completed'.format(seq.id))
            continue
        if not seq.seq_str:
            log.error('{}: no sequence stored, skipping alignment'.format(seq.id))
            continue
        # Don't need to compare sequence to itself
        if seq.id == self.representative_sequence.id:
            continue
        aln = ssbio.protein.sequence.utils.alignment.pairwise_sequence_alignment(a_seq=self.representative_sequence.seq_str,
                                                                                 a_seq_id=self.id,
                                                                                 b_seq=seq.seq_str,
                                                                                 b_seq_id=seq.id,
                                                                                 gapopen=gapopen, gapextend=gapextend,
                                                                                 engine=engine,
                                                                                 outdir=outdir,
                                                                                 outfile=outfile,
                                                                                 force_rerun=force_rerun)
        # Add an identifier to the MultipleSeqAlignment object for storage in a DictList
        aln.id = aln_id
        aln.annotations['a_seq'] = self.representative_sequence.id
        aln.annotations['b_seq'] = seq.id
        if parse:
            aln_df = ssbio.protein.sequence.utils.alignment.get_alignment_df(a_aln_seq=str(list(aln)[0].seq),
                                                                             b_aln_seq=str(list(aln)[1].seq))
            aln.annotations['ssbio_type'] = 'seqalign'
            aln.annotations['mutations'] = ssbio.protein.sequence.utils.alignment.get_mutations(aln_df)
            aln.annotations['deletions'] = ssbio.protein.sequence.utils.alignment.get_deletions(aln_df)
            aln.annotations['insertions'] = ssbio.protein.sequence.utils.alignment.get_insertions(aln_df)
        self.sequence_alignments.append(aln)
def get_sequence_properties(self, representative_only=True):
    """Run Biopython ProteinAnalysis and EMBOSS pepstats to summarize basic statistics of the protein sequences.

    Results are stored in the protein's respective SeqProp objects at ``.annotations``.

    Args:
        representative_only (bool): If analysis should only be run on the representative sequence
    """
    if representative_only:
        repseq = self.representative_sequence
        # Nothing to analyze without a representative sequence and its residues
        if not repseq:
            log.warning('{}: no representative sequence set, cannot get sequence properties'.format(self.id))
            return
        if not repseq.seq:
            log.warning('{}: representative sequence {} set, but no sequence stored. '
                        'Cannot get sequence properties.'.format(self.id, repseq.id))
            return
        repseq.get_biopython_pepstats()
        repseq.get_emboss_pepstats()
    else:
        for seqprop in self.sequences:
            # Skip entries with no residues stored
            if not seqprop.seq:
                log.warning('{}: no sequence stored. '
                            'Cannot get sequence properties.'.format(seqprop.id))
                continue
            seqprop.get_biopython_pepstats()
            seqprop.get_emboss_pepstats()
def prep_itasser_modeling(self, itasser_installation, itlib_folder, runtype, create_in_dir=None,
                          execute_from_dir=None, print_exec=False, **kwargs):
    """Prepare to run I-TASSER homology modeling for the representative sequence.

    Args:
        itasser_installation (str): Path to I-TASSER folder, i.e. ``~/software/I-TASSER4.4``
        itlib_folder (str): Path to ITLIB folder, i.e. ``~/software/ITLIB``
        runtype: How you will be running I-TASSER - local, slurm, or torque
        create_in_dir (str): Local directory where folders will be created
        execute_from_dir (str): Optional path to execution directory - use this if you are copying the homology
            models to another location such as a supercomputer for running
        print_exec (bool): If the execution statement should be printed to run modelling
        **kwargs: Extra options passed through to ITASSERPrep (see defaults below)

    Raises:
        ValueError: If neither ``create_in_dir`` nor a structure directory is available.

    Todo:
        * Document kwargs - extra options for I-TASSER, SLURM or Torque execution
        * Allow modeling of any sequence in sequences attribute, select by ID or provide SeqProp?
    """
    if create_in_dir:
        self.homology_models_dir = create_in_dir
    else:
        if not self.structure_dir:
            raise ValueError('Output directory must be specified')
        self.homology_models_dir = op.join(self.structure_dir, 'homology_models')
    ssbio.utils.make_dir(self.homology_models_dir)
    if not execute_from_dir:
        execute_from_dir = self.homology_models_dir
    repseq = self.representative_sequence
    # Defaults for I-TASSER options; any supplied kwargs override them
    itasser_options = {'light': True,
                       'java_home': None,
                       'binding_site_pred': False,
                       'ec_pred': False,
                       'go_pred': False,
                       'job_scheduler_header': None,
                       'additional_options': None}
    itasser_options.update(kwargs)
    ITASSERPrep(ident=self.id, seq_str=repseq.seq_str, root_dir=self.homology_models_dir,
                itasser_path=itasser_installation, itlib_path=itlib_folder,
                runtype=runtype, print_exec=print_exec, execute_dir=execute_from_dir,
                java_home=itasser_options['java_home'],
                light=itasser_options['light'],
                binding_site_pred=itasser_options['binding_site_pred'],
                ec_pred=itasser_options['ec_pred'],
                go_pred=itasser_options['go_pred'],
                job_scheduler_header=itasser_options['job_scheduler_header'],
                additional_options=itasser_options['additional_options'])
    log.debug('Prepared I-TASSER modeling folder {}'.format(self.homology_models_dir))
def blast_representative_sequence_to_pdb(self, seq_ident_cutoff=0, evalue=0.0001, display_link=False,
                                         outdir=None, force_rerun=False):
    """BLAST the representative protein sequence to the PDB. Saves a raw BLAST result file (XML file).

    Args:
        seq_ident_cutoff (float, optional): Cutoff results based on percent coverage (in decimal form)
        evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal,
            0.0001 is stringent (default).
        display_link (bool, optional): Set to True if links to the HTML results should be displayed
        outdir (str): Path to output directory of downloaded XML files, must be set if protein directory
            was not initialized
        force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False.

    Returns:
        list: List of new ``PDBProp`` objects added to the ``structures`` attribute
    """
    # Guard clauses: BLAST needs a representative sequence with residues stored
    if not self.representative_sequence:
        log.warning('{}: no representative sequence set, cannot BLAST'.format(self.id))
        return None
    if not self.representative_sequence.seq:
        log.warning('{}: no representative sequence loaded, cannot BLAST'.format(self.id))
        return None
    # BLAST the sequence to the PDB
    blast_results = self.representative_sequence.blast_pdb(seq_ident_cutoff=seq_ident_cutoff,
                                                           evalue=evalue,
                                                           display_link=display_link,
                                                           outdir=outdir,
                                                           force_rerun=force_rerun)
    new_pdbs = []
    if blast_results:
        # Keep only hits whose PDB ID is not already a stored structure
        hit_ids = [hit['hit_pdb'].lower() for hit in blast_results]
        new_pdbs = [pdb_id for pdb_id in hit_ids if not self.structures.has_id(pdb_id)]
        if new_pdbs:
            log.debug('{}: adding {} PDBs from BLAST results'.format(self.id, len(new_pdbs)))
            for hit in blast_results:
                pdb_id = hit['hit_pdb'].lower()
                if pdb_id not in new_pdbs:
                    continue
                for chain_id in hit['hit_pdb_chains']:
                    # load_pdb appends this structure to the structures attribute
                    new_pdb = self.load_pdb(pdb_id=pdb_id, mapped_chains=chain_id)
                    new_pdb.add_chain_ids(chain_id)
                    # Store the BLAST hit within the chain object
                    new_pdb.chains.get_by_id(chain_id).blast_results = hit
    return new_pdbs
@property
def df_pdb_blast(self):
    """DataFrame: Summary of PDB BLAST results across all experimental structures,
    indexed by PDB ID."""
    records = []
    for struct in self.get_experimental_structures():
        for chain in struct.chains:
            if not hasattr(chain, 'blast_results'):
                continue
            # One summary row per (structure, chain) pair carrying a BLAST hit
            info = struct.get_dict_with_chain(chain=chain.id)['blast_results']
            info['pdb_id'] = struct.id
            info['pdb_chain_id'] = chain.id
            records.append(info)
    columns = ['pdb_id', 'pdb_chain_id', 'hit_score', 'hit_evalue', 'hit_percent_similar',
               'hit_percent_ident', 'hit_percent_gaps', 'hit_num_ident', 'hit_num_similar', 'hit_num_gaps']
    df = pd.DataFrame.from_records(records, columns=columns).set_index('pdb_id')
    return ssbio.utils.clean_df(df)
def map_uniprot_to_pdb(self, seq_ident_cutoff=0.0, outdir=None, force_rerun=False):
    """Map the representative sequence's UniProt ID to PDB IDs using the PDBe "Best Structures" API.

    A JSON file of the results is cached in the protein sequences folder.
    The "Best structures" API is documented at https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
    and returns PDB structures mapping to a UniProt accession sorted by coverage
    of the protein and, if tied, by resolution.

    Args:
        seq_ident_cutoff (float): Sequence identity cutoff in decimal form
        outdir (str): Output directory to cache JSON results of search
        force_rerun (bool): Force re-downloading of JSON results if they already exist

    Returns:
        list: Rank-ordered list of mapped PDB ID strings (one entry per
            PDB/chain pair, so a PDB ID may appear more than once), or
            ``None`` if no representative sequence or UniProt ID is set.

    Raises:
        ValueError: If no output directory was given and none is set on the protein.
    """
    if not self.representative_sequence:
        log.error('{}: no representative sequence set, cannot use best structures API'.format(self.id))
        return None

    uniprot_id = self.representative_sequence.uniprot
    if not uniprot_id:
        log.error('{}: no representative UniProt ID set, cannot use best structures API'.format(self.id))
        return None

    # Isoform accessions (e.g. "P12345-2") are not accepted by the API; use the base ID
    if '-' in uniprot_id:
        log.debug('{}: "-" detected in UniProt ID, isoform specific sequences are ignored with best structures API'.format(self.id))
        uniprot_id = uniprot_id.split('-')[0]

    if not outdir:
        outdir = self.sequence_dir
    if not outdir:
        raise ValueError('Output directory must be specified')

    best_structures = ssbio.databases.pdb.best_structures(uniprot_id,
                                                          outname='{}_best_structures'.format(custom_slugify(uniprot_id)),
                                                          outdir=outdir,
                                                          seq_ident_cutoff=seq_ident_cutoff,
                                                          force_rerun=force_rerun)

    mapped_pdbs = []
    if not best_structures:
        log.debug('{}, {}: no PDB/chain pairs mapped'.format(self.id, uniprot_id))
        return mapped_pdbs

    pdb_only_keys = ['experimental_method', 'resolution']
    chain_only_keys = ['coverage', 'start', 'end', 'unp_start', 'unp_end']
    for rank, hit in enumerate(best_structures, start=1):
        hit_pdb = str(hit['pdb_id'].lower())
        hit_chain = str(hit['chain_id'])
        mapped_pdbs.append(hit_pdb)
        # load_pdb appends this structure to the protein's list of structures
        pdb_prop = self.load_pdb(pdb_id=hit_pdb, mapped_chains=hit_chain)
        # Track the chain as well so per-chain info from best_structures can be stored on it
        pdb_prop.add_chain_ids(hit_chain)
        pdb_prop.update(hit, only_keys=pdb_only_keys)
        chain_prop = pdb_prop.chains.get_by_id(hit_chain)
        chain_prop.update(hit, only_keys=chain_only_keys)
        chain_prop.update({'rank': rank})
    log.debug('{}, {}: {} PDB/chain pairs mapped'.format(self.id, uniprot_id, len(best_structures)))
    return mapped_pdbs
@property
def df_pdb_ranking(self):
    """DataFrame: Summary of the UniProt -> best PDB structure mapping.

    Only chains annotated with a ``rank`` attribute (stored by
    ``map_uniprot_to_pdb``) contribute a row. The frame is indexed by
    (PDB ID, chain ID).
    """
    rows = []
    per_chain_keys = ['coverage', 'start', 'end', 'unp_start', 'unp_end', 'rank']
    for struct in self.get_experimental_structures():
        for chain in struct.chains:
            if not hasattr(chain, 'rank'):
                continue
            info = struct.get_dict_with_chain(chain=chain.id, df_format=True, chain_keys=per_chain_keys)
            info['pdb_id'] = struct.id
            info['pdb_chain_id'] = chain.id
            info['uniprot'] = self.representative_sequence.uniprot
            rows.append(info)
    ordered_cols = ['uniprot', 'pdb_id', 'pdb_chain_id', 'experimental_method', 'resolution', 'coverage',
                    'taxonomy_name', 'start', 'end', 'unp_start', 'unp_end', 'rank']
    ranking = pd.DataFrame.from_records(rows, columns=ordered_cols).set_index(['pdb_id', 'pdb_chain_id'])
    return ssbio.utils.clean_df(ranking)
def load_pdb(self, pdb_id, mapped_chains=None, pdb_file=None, file_type=None, is_experimental=True,
             set_as_representative=False, representative_chain=None, force_rerun=False):
    """Load a structure ID and optional structure file into the structures attribute.

    Args:
        pdb_id (str): PDB ID
        mapped_chains (str, list): Chain ID or list of IDs which you are interested in
        pdb_file (str): Path to PDB file
        file_type (str): Type of PDB file
        is_experimental (bool): If this structure file is experimental
        set_as_representative (bool): If this structure should be set as the representative structure
        representative_chain (str): If ``set_as_representative`` is ``True``, provide the representative chain ID
        force_rerun (bool): If the PDB should be reloaded if it is already in the list of structures

    Returns:
        PDBProp: The object that is now contained in the structures attribute
    """
    if self.structures.has_id(pdb_id):
        if force_rerun:
            # Drop the stale entry so a fresh one is created below
            self.structures.remove(self.structures.get_by_id(pdb_id))
        else:
            # Reuse the existing entry, merging in any newly supplied file/chain info
            log.debug('{}: PDB ID already present in list of structures'.format(pdb_id))
            pdb = self.structures.get_by_id(pdb_id)
            if pdb_file:
                pdb.load_structure_path(pdb_file, file_type)
            if mapped_chains:
                pdb.add_mapped_chain_ids(mapped_chains)

    if not self.structures.has_id(pdb_id):
        # New entry: experimental structures get a PDBProp, models a plain StructProp
        prop_cls = PDBProp if is_experimental else StructProp
        pdb = prop_cls(ident=pdb_id, mapped_chains=mapped_chains, structure_path=pdb_file, file_type=file_type)
        self.structures.append(pdb)

    if set_as_representative:
        # Parse first so chain information is available before setting the representative
        pdb.parse_structure()
        self._representative_structure_setter(structprop=pdb, keep_chain=representative_chain, force_rerun=force_rerun)

    return self.structures.get_by_id(pdb_id)
def load_itasser_folder(self, ident, itasser_folder, organize=False, outdir=None, organize_name=None,
                        set_as_representative=False, representative_chain='X', force_rerun=False):
    """Load the results folder from a local I-TASSER run and optionally copy relevant files
    over to the protein structures directory.

    Args:
        ident (str): I-TASSER ID
        itasser_folder (str): Path to results folder
        organize (bool): If select files from modeling should be copied to the Protein directory
        outdir (str): Path to directory where files will be copied and organized to
        organize_name (str): Basename of files to rename results to. If not provided, will use id attribute.
        set_as_representative: If this structure should be set as the representative structure
        representative_chain (str): If ``set_as_representative`` is ``True``, provide the representative chain ID
        force_rerun (bool): If the PDB should be reloaded if it is already in the list of structures

    Returns:
        ITASSERProp: The object that is now contained in the structures attribute

    Raises:
        ValueError: If ``organize`` is True but no output directory is available.
    """
    # Validate the destination up front so we fail before touching the structures list
    if organize:
        if not outdir:
            outdir = self.structure_dir
        if not outdir:
            raise ValueError('Directory to copy results to must be specified')

    if self.structures.has_id(ident):
        if force_rerun:
            # Drop the stale entry so a fresh one is created below
            self.structures.remove(self.structures.get_by_id(ident))
        else:
            log.debug('{}: already present in list of structures'.format(ident))
            itasser = self.structures.get_by_id(ident)

    if not self.structures.has_id(ident):
        itasser = ITASSERProp(ident, itasser_folder)
        self.structures.append(itasser)

    if set_as_representative:
        self._representative_structure_setter(structprop=itasser, keep_chain=representative_chain, force_rerun=force_rerun)

    if organize and itasser.structure_file:
        # The model file itself will be named $GENEID_model1.pdb unless a name is given
        basename = organize_name if organize_name else self.id + '_model1'
        # Additional modeling outputs are kept in a sibling subdirectory
        extras_dir = op.join(outdir, '{}_itasser'.format(basename))
        ssbio.utils.make_dir(extras_dir)
        # Copies model1.pdb over and creates summary dataframes
        itasser.copy_results(copy_to_dir=outdir, rename_model_to=basename, force_rerun=force_rerun)

    return self.structures.get_by_id(ident)
@property
def df_homology_models(self):
"""DataFrame: Get a dataframe of I-TASSER homology model results"""
itasser_pre_df = []
df_cols = ['id', 'structure_file', 'model_date', 'difficulty',
'top_template_pdb', 'top_template_chain', 'c_score',
'tm_score', 'tm_score_err', 'rmsd', 'rmsd_err',
'top_bsite_site_num', 'top_bsite_c_score', 'top_bsite_cluster_size', 'top_bsite_algorithm',
'top_bsite_pdb_template_id', 'top_bsite_pdb_template_chain', 'top_bsite_pdb_ligand',
'top_bsite_binding_location_coords', 'top_bsite_c_score_method', 'top_bsite_binding_residues',
'top_bsite_ligand_cluster_counts',
'top_ec_pdb_template_id', 'top_ec_pdb_template_chain', 'top_ec_tm_score', 'top_ec_rmsd',
'top_ec_seq_ident', 'top_ec_seq_coverage', 'top_ec_c_score', 'top_ec_ec_number',
'top_ec_binding_residues',
'top_go_mf_go_id', 'top_go_mf_go_term', 'top_go_mf_c_score', 'top_go_bp_go_id', 'top_go_bp_go_term',
'top_go_bp_c_score', 'top_go_cc_go_id', 'top_go_cc_go_term', 'top_go_cc_c_score']