# README

This notebook gives an example of how to run the GEM-PRO pipeline on **a single gene ID** for simplicity.

### Installation

See: https://github.com/nmih/ssbio/blob/master/README.md
- If something isn't working, update the repository first (`git pull`) before trying anything else.

In [1]:
import os

from ssbio.pipeline.gempro import GEMPRO

  warn("No LP solvers found")


In [2]:
# Optional: uncomment the lines below to configure the root logger
# (DEBUG level as written). Left disabled by default.
# # Create logger
# import logging
# logger = logging.getLogger()

# ############# SET YOUR LOGGING LEVEL HERE #############
# logger.setLevel(logging.DEBUG)
# #######################################################

In [3]:
# Identifier of the single gene used throughout this notebook.
gene_id2 = 'SRR1753782_00918'
# Protein sequence for that gene (one-letter residue codes); it is the
# reference that the structure and the mutated sequence are aligned to below.
gene_seq2 = 'MSKQQIGVVGMAVMGRNLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE'

In [4]:
# Mapping of gene ID -> sequence string handed to GEMPRO (one entry here).
GENES_AND_SEQUENCES2 = {
    gene_id2: gene_seq2,
}

In [5]:
# Root folder for the project. Resolved from the current user's home directory
# rather than a path hard-coded to one machine; change ROOT_DIR to put the
# project elsewhere. The recorded outputs below show files landing under
# <root_dir>/<gem_name>/ (e.g. .../Downloads/test3/structures/...).
ROOT_DIR = os.path.expanduser('~/Downloads/')

my_gempro = GEMPRO(gem_name='test3', root_dir=ROOT_DIR, genes_and_sequences=GENES_AND_SEQUENCES2)


In [6]:
# BLAST options gathered in one place so they are easy to find and tweak.
blast_settings = {
    'all_genes': True,
    'seq_ident_cutoff': 0.05,
    'evalue': 1e-5,
}
my_gempro.blast_seqs_to_pdb(**blast_settings)




In [7]:
# Ask GEM-PRO to set the representative structure; later cells read the result
# through `protein.representative_structure`.
# NOTE(review): the selection criteria are not visible in this notebook.
my_gempro.set_representative_structure()




In [8]:
# Sequence record of the first chain of the first structure.
first_structure = my_gempro.genes[0].protein.structures[0]
first_structure.chains[0].seq_record

SeqRecord(seq=Seq('XXKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAENPGKKLVPY...TEW', IUPACProtein()), id='A', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [9]:
# Sequence record of the second chain of the first structure.
first_structure = my_gempro.genes[0].protein.structures[0]
first_structure.chains[1].seq_record

SeqRecord(seq=Seq('XSKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAENPGKKLVPY...EWL', IUPACProtein()), id='B', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [10]:
# Reference sequence object attached to the first structure.
first_structure = my_gempro.genes[0].protein.structures[0]
first_structure.reference_seq

<SeqProp SRR1753782_00918 at 0x7f25eb649780>

In [11]:
# Sequence record of the chain marked as representative for the first structure.
first_structure = my_gempro.genes[0].protein.structures[0]
first_structure.representative_chain.seq_record

SeqRecord(seq=Seq('XXKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAENPGKKLVPY...TEW', IUPACProtein()), id='A', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [12]:
# Dump every attribute stored on the representative structure object.
representative_structure = my_gempro.genes[0].protein.representative_structure
vars(representative_structure)

{'chains': [<ChainProp A at 0x7f25e9b02f28>, <ChainProp B at 0x7f25e9fbf908>],
 'date': ['2009-09-01', '2011-07-13', '2014-01-22'],
 'description': '6-phosphogluconate dehydrogenase, decarboxylating (E.C.1.1.1.44)',
 'file_type': 'pdb',
 'id': '2zyd-A',
 'is_experimental': True,
 'mapped_chains': ['A'],
 'reference_seq': <SeqProp SRR1753782_00918 at 0x7f25eb649780>,
 'reference_seq_top_coverage': 95.7,
 'representative_chain': <ChainProp A at 0x7f25e9b02f28>,
 'resolution': 1.5,
 'structure_path': '/home/nathan/Downloads/test3/structures/by_gene/SRR1753782_00918/2zyd-A_clean.pdb',
 'taxonomy_name': 'Escherichia coli'}

In [13]:
# Print both rows of the first structure alignment stored on the reference
# sequence of structure '2zyd', one row per line.
structure_alignment = my_gempro.genes[0].protein.structures.get_by_id('2zyd').reference_seq.structure_alignments[0]
for row_index in (0, 1):
    print(structure_alignment[row_index].seq)

MSKQQIGVVGMAVMGRNLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE
XXKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLDKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKEAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLTNEELAQTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKDQRVAASKVLSGPQAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASEEYNWDLNYGEIAKIFRAGCIIRAQFLQKITDACAENPQIANLLLAPYFKQIADDYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRIDKEGVFHTEW--


In [14]:
# Shorthand for the protein of the first (and only) gene in this project.
first_gene = my_gempro.genes[0]
my_protein = first_gene.protein

In [15]:
# Build the mutant by applying a single point substitution, N17P, to the
# reference sequence instead of pasting a second hand-edited copy of the whole
# sequence. The resulting string is the same as the previously pasted literal.
MUTATION_POSITION = 17  # 1-based residue number
WILD_TYPE_RESIDUE = 'N'
MUTANT_RESIDUE = 'P'

# Guard against the reference sequence changing underneath the mutation.
assert gene_seq2[MUTATION_POSITION - 1] == WILD_TYPE_RESIDUE, 'unexpected wild-type residue'

mutated_seq = gene_seq2[:MUTATION_POSITION - 1] + MUTANT_RESIDUE + gene_seq2[MUTATION_POSITION:]

In [16]:
# Attach the mutated sequence to the protein under the ID 'mutated'; the
# returned sequence object is shown as the cell output.
# NOTE(review): re-running this cell may register the sequence again — confirm.
my_protein.load_manual_sequence_str(ident='mutated', seq_str=mutated_seq)

<SeqProp mutated at 0x7f25ebb14c88>

In [17]:
# List the sequences currently attached to the protein (the original gene
# sequence plus the 'mutated' one added above).
my_protein.sequences

[<SeqProp SRR1753782_00918 at 0x7f262b25a4a8>,
 <SeqProp mutated at 0x7f25ebb14c88>]

In [18]:
# Align the protein's sequences to its representative sequence; the results are
# read back in the following cells from
# `representative_sequence.sequence_alignments`.
my_protein.align_sequences_to_representative()

In [19]:
# First alignment stored on the representative sequence.
representative_alignments = my_protein.representative_sequence.sequence_alignments
representative_alignments[0]

<<class 'Bio.Align.MultipleSeqAlignment'> instance (2 records of length 468, SingleLetterAlphabet()) at 7f25e964d0f0>

In [20]:
# Print both rows of the first sequence alignment, one row per line.
sequence_alignment = my_gempro.genes[0].protein.representative_sequence.sequence_alignments[0]
for row_index in (0, 1):
    print(sequence_alignment[row_index].seq)

MSKQQIGVVGMAVMGRNLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE
MSKQQIGVVGMAVMGRPLALNIESRGYTVSVFNRSREKTEEVIAENPGKKLVPYYTVKEFVESLETPRRILLMVKAGAGTDAAIDSLKPYLEKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKDAYELVAPILTKIAAVAEDGEPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLSNEELANTFTEWNNGELSSYLIDITKDIFTKKDEDGNYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKAQRVAASKVLSGPKAQPAGDKAEFIEKVRRALYLGKIVSYAQGFSQLRAASDEYHWDLNYGEIAKIFRAGCIIRAQFLQKITDAYAENADIANLLLAPYFKKIADEYQQALRDVVAYAVQNGIPVPTFSAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRTDKEGIFHTEWLE


In [21]:
# Summary annotations (mutations, identity, score, ...) of the first alignment.
first_alignment = my_protein.representative_sequence.sequence_alignments[0]
first_alignment.annotations

{'a_seq': 'SRR1753782_00918',
 'b_seq': 'mutated',
 'deletions': [],
 'insertions': [],
 'mutations': [('N', 17, 'P')],
 'percent_gaps': 0.0,
 'percent_identity': 99.8,
 'percent_similarity': 99.8,
 'score': 2381.0}

In [22]:
# File path of the representative structure on disk.
my_protein.representative_structure.structure_path

'/home/nathan/Downloads/test3/structures/by_gene/SRR1753782_00918/2zyd-A_clean.pdb'

In [23]:
import nglview as nv

# View the representative structure file directly with nglview.
representative_pdb_path = my_protein.representative_structure.structure_path
nv.show_structure_file(representative_pdb_path)

In [24]:
# Same structure as the previous cell, shown through the structure object's
# own viewing helper.
# NOTE(review): presumably renders with nglview as well — confirm.
my_protein.representative_structure.view_structure()

In [25]:
# View all mutations for this protein with the GUI switched on.
# NOTE(review): scale_range presumably bounds the marker size — confirm.
mutation_view_options = dict(gui=True, scale_range=(5, 7))
my_protein.view_all_mutations(**mutation_view_options)