# README

This notebook rebuilds the GEM-PRO for the iNJ661 model, which I originally created for the 212 class.

In [1]:
import os.path as op
import pandas as pd
from ssbio.gempro import GEMPRO

In [2]:
# Logging and display setup for the notebook.
import logging
import sys

from IPython.core.interactiveshell import InteractiveShell

# Record layout: "<logger name> - <level> - <message>"
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')

# STDERR handler that passes INFO and above, rendered with the formatter
handler = logging.StreamHandler(sys.stderr)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

# Root logger: INFO threshold, with the STDERR handler replacing any existing handlers
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Name of the folder which will be created, and the directory where it will be made
GEM_NAME = 'iNJ661'
ROOT_DIR = '/home/nathan/projects/'
# The model file lives at <ROOT_DIR>/<GEM_NAME>/model/<GEM_NAME>.json; deriving the path
# from the two settings above keeps the three values from drifting apart.
GEM_FILE = op.join(ROOT_DIR, GEM_NAME, 'model', GEM_NAME + '.json')

In [4]:
# Create the GEM-PRO project object from the JSON model file configured above
my_gempro = GEMPRO(
    gem_name=GEM_NAME,
    root_dir=ROOT_DIR,
    gem_file_path=GEM_FILE,
    gem_file_type='json'
)

ssbio.gempro.pipeline - INFO - Loaded model: /home/nathan/projects/iNJ661/model/iNJ661.json
ssbio.gempro.pipeline - INFO - Number of reactions: 1025
ssbio.gempro.pipeline - INFO - Number of reactions linked to a gene: 720
ssbio.gempro.pipeline - INFO - Number of genes (excluding spontaneous): 661
ssbio.gempro.pipeline - INFO - Number of metabolites: 826


In [5]:
# Run the KEGG mapping/metadata step for organism code 'mtu'; per the log output below,
# the result is exposed as the "df_kegg_metadata" attribute.
my_gempro.kegg_mapping_and_metadata(kegg_organism_code='mtu')

100%|██████████| 661/661 [00:02<00:00, 229.82it/s]
ssbio.gempro.pipeline - INFO - Created KEGG metadata dataframe. See the "df_kegg_metadata" attribute.


In [6]:
# Run the UniProt mapping/metadata step with the model gene IDs declared as 'TUBERCULIST_ID'
# (presumably a UniProt ID-mapping source name — confirm against ssbio); per the log output
# below, the result is exposed as the "df_uniprot_metadata" attribute.
my_gempro.uniprot_mapping_and_metadata(model_gene_source='TUBERCULIST_ID')

root - INFO - getUserAgent: Begin
root - INFO - getUserAgent: user_agent: EBI-Sample-Client/ (services.py; Python 3.5.2; Linux) Python-requests/2.11.1
root - INFO - getUserAgent: End
100%|██████████| 661/661 [00:00<00:00, 1096.54it/s]
ssbio.gempro.pipeline - INFO - Created UniProt metadata dataframe. See the "df_uniprot_metadata" attribute.


In [7]:
# Load the manually curated mapping file from the GEM-PRO data directory and apply it
# on top of the automatic UniProt mapping.
manual_uniprot = pd.read_csv(op.join(my_gempro.data_dir, '161019-gene_to_uniprot.in'))
# Build {first column: second column} (presumably gene ID -> UniProt accession, judging by
# the file name — verify). Columns are selected by position with .iloc rather than by
# integer keys on each row, which pandas no longer treats as positions without a warning.
manual_uniprot_dict = dict(zip(manual_uniprot.iloc[:, 0], manual_uniprot.iloc[:, 1]))
my_gempro.manual_uniprot_mapping(manual_uniprot_dict)

ssbio.gempro.pipeline - INFO - Updated existing UniProt dataframe.


In [8]:
# Set the representative sequence annotations; per the log output below, this creates a
# sequence mapping dataframe available as the "df_sequence_mapping" attribute.
my_gempro.set_representative_sequence()

ssbio.gempro.pipeline - INFO - Created sequence mapping dataframe. Inspect the "df_sequence_mapping" attribute for more info.


In [9]:
# Display missing_mapping — presumably the genes left without a sequence mapping
# (verify against ssbio); a single gene ID is listed in the output below.
my_gempro.missing_mapping

['Rv2233']

In [11]:
# Run the UniProt -> best PDB mapping; per the log output below, the ranking is exposed
# as the "df_pdb_ranking" attribute.
my_gempro.map_uniprot_to_pdb()

100%|██████████| 661/661 [00:06<00:00, 104.92it/s]
ssbio.gempro.pipeline - INFO - Completed UniProt -> best PDB mapping. See the "df_pdb_ranking" attribute.


In [14]:
# Inspect the full annotation of the gene at index 3 (Rv1295 in the output below)
my_gempro.genes[3].annotation

{'sequence': {'kegg': {'kegg_id': 'mtu:Rv1295',
   'metadata_file': 'mtu-Rv1295.kegg',
   'pdbs': ['2D1F'],
   'seq_file': 'mtu-Rv1295.faa',
   'seq_len': 360,
   'uniprot_acc': 'P9WG59'},
  'representative': {'kegg_id': ['mtu:Rv1295'],
   'metadata_file': 'P9WG59.txt',
   'pdbs': ['2D1F'],
   'seq_file': 'P9WG59.fasta',
   'seq_len': 360,
   'uniprot_acc': 'P9WG59'},
  'uniprot': {'P9WG59': {'description': ['TS', 'Threonine synthase'],
    'ec_number': ['4.2.3.1'],
    'entry_version': '2016-10-05',
    'gene': 'Rv1295',
    'gene_name': 'thrC',
    'kegg_id': ['mtu:Rv1295'],
    'metadata_file': 'P9WG59.txt',
    'pdbs': ['2D1F'],
    'pfam': ['PF00291'],
    'refseq': ['NP_215811.1',
     'NC_000962.3',
     'WP_003406652.1',
     'NZ_KK339370.1'],
    'reviewed': True,
    'seq_file': 'P9WG59.fasta',
    'seq_len': 360,
    'seq_version': '2014-04-16',
    'uniprot_acc': 'P9WG59'}}},
 'structure': {'homology': {},
  'pdb': OrderedDict([('2d1f_A',
                {'experimental_meth

In [13]:
# Inspect only the 'structure' part of the annotation for the gene at index 3
my_gempro.genes[3].annotation['structure']

{'homology': {},
 'pdb': OrderedDict([('2d1f_A',
               {'experimental_method': 'X-ray diffraction',
                'pdb_chain_id': 'A',
                'pdb_end': 360,
                'pdb_id': '2d1f',
                'pdb_start': 1,
                'rank': 1,
                'release_date': '2006-09-05',
                'resolution': 2.5,
                'seq_coverage': 1,
                'taxonomy_id': 1773,
                'uniprot_acc': 'P9WG59',
                'unp_end': 360,
                'unp_start': 1}),
              ('2d1f_B',
               {'experimental_method': 'X-ray diffraction',
                'pdb_chain_id': 'B',
                'pdb_end': 360,
                'pdb_id': '2d1f',
                'pdb_start': 1,
                'rank': 2,
                'release_date': '2006-09-05',
                'resolution': 2.5,
                'seq_coverage': 1,
                'taxonomy_id': 1773,
                'uniprot_acc': 'P9WG59',
                'unp_end': 3

In [11]:
# BLAST the gene sequences against the PDB for all genes with a 0.99 identity cutoff;
# per the log output below, hits are added as PDBs for the matching genes.
my_gempro.blast_seqs_to_pdb(
    seq_ident_cutoff=0.99,
    all_genes=True
)

  0%|          | 0/661 [00:00<?, ?it/s]ssbio.gempro.pipeline - INFO - Rv3846: Adding 20 PDBs from BLAST results.
  5%|▌         | 35/661 [00:00<00:01, 329.24it/s]ssbio.gempro.pipeline - INFO - Rv2539c: Adding 4 PDBs from BLAST results.
 10%|█         | 69/661 [00:00<00:01, 332.24it/s]ssbio.gempro.pipeline - INFO - Rv1603: Adding 2 PDBs from BLAST results.
 15%|█▌        | 100/661 [00:00<00:01, 315.63it/s]ssbio.gempro.pipeline - INFO - Rv3628: Adding 1 PDBs from BLAST results.
 36%|███▋      | 240/661 [00:00<00:01, 397.91it/s]ssbio.gempro.pipeline - INFO - Rv2764c: Adding 1 PDBs from BLAST results.
ssbio.gempro.pipeline - INFO - Rv0467: Adding 4 PDBs from BLAST results.
 43%|████▎     | 285/661 [00:00<00:00, 411.27it/s]ssbio.gempro.pipeline - INFO - Rv1415: Adding 2 PDBs from BLAST results.
 50%|████▉     | 329/661 [00:00<00:00, 411.19it/s]ssbio.gempro.pipeline - INFO - Rv2220: Adding 24 PDBs from BLAST results.
 58%|█████▊    | 384/661 [00:00<00:00, 444.40it/s]ssbio.gempro.pipeline - I

In [12]:
# Load the earlier reaction/gene/UniProt/PDB table and reduce it to unique
# (m_gene, u_uniprot_acc) pairs that actually have a UniProt accession.
old_gene_to_homology = (
    pd.read_csv('~/projects/iNJ661/data_frames/DF_01_RXN_GENE_UNIPROT_PDB.csv')
    [['m_gene', 'u_uniprot_acc']]
    .drop_duplicates()
    .reset_index(drop=True)
    .dropna(subset=['u_uniprot_acc'])
)

# Keep a copy of the reduced table in the GEM-PRO data directory
old_gene_to_homology.to_csv(op.join(my_gempro.data_dir,'161031-old_gene_to_uniprot_mapping.csv'))

# {m_gene: u_uniprot_acc} lookup built from the reduced table
gene_to_uniprot = old_gene_to_homology.set_index('m_gene')['u_uniprot_acc'].to_dict()

In [13]:
# Pull the I-TASSER homology models from the raw directory, passing the gene -> UniProt
# lookup as the custom name mapping; per the log output below, models are copied into the
# GEM-PRO directory and summarised in the "df_itasser" attribute.
my_gempro.get_itasser_models(
    homology_raw_dir='/home/nathan/projects_unsynced/iNJ661/structure_files/homology_models_prep',
    custom_itasser_name_mapping=gene_to_uniprot
)

100%|██████████| 661/661 [00:00<00:00, 5759.53it/s]
ssbio.gempro.pipeline - INFO - Completed copying of I-TASSER models to GEM-PRO directory. See the "df_itasser" attribute.


In [47]:
# Set the 'representative' structure entry for each gene — presumably chosen from the PDB
# and homology annotations of that gene (confirm against ssbio).
my_gempro.set_representative_structure()

100%|██████████| 661/661 [00:00<00:00, 340510.31it/s]


In [49]:
# Inspect the structure annotation of the gene at index 33; in the output below it has a
# homology model but no PDB entries.
my_gempro.genes[33].annotation['structure']

{'homology': {'Rv1731': {'c_score': -0.22,
   'difficulty': 'easy',
   'model_date': '2016-01-08',
   'model_file': 'Rv1731.pdb',
   'rmsd': 7.9,
   'rmsd_err': 4.4,
   'seq_coverage': 1,
   'tm_score': 0.69,
   'tm_score_err': 0.12,
   'top_template_chain': 'A',
   'top_template_pdb': '4h73'}},
 'pdb': OrderedDict(),
 'representative': {'clean_pdb_file': None,
  'original_pdb_file': None,
  'seq_coverage': 0,
  'structure_id': None}}

In [45]:
# Inspect the PDB entries recorded for the gene at index 3
my_gempro.genes[3].annotation['structure']['pdb']

OrderedDict([('2d1f_A',
              {'experimental_method': 'X-ray diffraction',
               'pdb_chain_id': 'A',
               'pdb_end': 360,
               'pdb_id': '2d1f',
               'pdb_start': 1,
               'rank': 1,
               'release_date': '2006-09-05',
               'resolution': 2.5,
               'seq_coverage': 1,
               'taxonomy_id': 1773,
               'uniprot_acc': 'P9WG59',
               'unp_end': 360,
               'unp_start': 1}),
             ('2d1f_B',
              {'experimental_method': 'X-ray diffraction',
               'pdb_chain_id': 'B',
               'pdb_end': 360,
               'pdb_id': '2d1f',
               'pdb_start': 1,
               'rank': 2,
               'release_date': '2006-09-05',
               'resolution': 2.5,
               'seq_coverage': 1,
               'taxonomy_id': 1773,
               'uniprot_acc': 'P9WG59',
               'unp_end': 360,
               'unp_start': 1})])

In [21]:
import time
import os


'20160104'

In [18]:
# Take a shallow copy of the homology annotations for Rv2043c, so that keys added to `d`
# while experimenting are not written into the annotation stored on my_gempro.
d = dict(my_gempro.genes.get_by_id('Rv2043c').annotation['structure']['homology'])

In [40]:
# Add a made-up second homology entry ('rvvv') with a later model_date than the real one,
# to exercise ordering by model_date.
d['rvvv'] = dict(
    c_score=26,
    difficulty='easy',
    model_date='2016-12-29',
    model_file='Rv2043c.pdb',
    rmsd=1.7,
    rmsd_err=1.5,
    seq_coverage=1,
    tm_score=0,
    tm_score_err=0.05,
    top_template_chain='A',
    top_template_pdb='3gbc'
)

In [41]:
# Show the current contents of d
d

{'Rv2043c': {'c_score': 1.86,
  'difficulty': 'easy',
  'model_date': '2015-12-29',
  'model_file': 'Rv2043c.pdb',
  'rmsd': 1.7,
  'rmsd_err': 1.5,
  'seq_coverage': 1,
  'tm_score': 0.98,
  'tm_score_err': 0.05,
  'top_template_chain': 'A',
  'top_template_pdb': '3gbc'},
 'rvvv': {'c_score': 26,
  'difficulty': 'easy',
  'model_date': '2016-12-29',
  'model_file': 'Rv2043c.pdb',
  'rmsd': 1.7,
  'rmsd_err': 1.5,
  'seq_coverage': 1,
  'tm_score': 0,
  'tm_score_err': 0.05,
  'top_template_chain': 'A',
  'top_template_pdb': '3gbc'}}

In [42]:
# Order the keys of d from newest to oldest 'model_date'. The dates are 'YYYY-MM-DD'
# strings, so plain string comparison orders them chronologically.
sorted(d.keys(), key=lambda model_id: d[model_id]['model_date'], reverse=True)

['rvvv', 'Rv2043c']