# Table of Contents
 <p><div class="lev1"><a href="#Load-functions-(ssbio-package-on-212-machine)"><span class="toc-item-num">1 - </span>Load functions (ssbio package on 212 machine)</a></div><div class="lev1"><a href="#Load-the-GEM-PRO"><span class="toc-item-num">2 - </span>Load the GEM-PRO</a></div><div class="lev1"><a href="#Keep-only-the-representative-structures"><span class="toc-item-num">3 - </span>Keep only the representative structures</a></div><div class="lev2"><a href="#For-testing"><span class="toc-item-num">3.1 - </span>For testing</a></div><div class="lev1"><a href="#Calculate-the-number-of-potential-sulfide-bridges-(Biopython)"><span class="toc-item-num">4 - </span>Calculate the number of potential sulfide bridges (Biopython)</a></div><div class="lev1"><a href="#Calculate-SASA,-%-surface/buried,-%-secondary-structure-(DSSP)"><span class="toc-item-num">5 - </span>Calculate SASA, % surface/buried, % secondary structure (DSSP)</a></div><div class="lev1"><a href="#Calculate-surface-&amp;-residue-depth-using-MSMS"><span class="toc-item-num">6 - </span>Calculate surface &amp; residue depth using MSMS</a></div><div class="lev1"><a href="#Calculating-%-polar,-nonpolar,-etc.-residues"><span class="toc-item-num">7 - </span>Calculating % polar, nonpolar, etc. residues</a></div><div class="lev1"><a href="#Merge-all-information"><span class="toc-item-num">8 - </span>Merge all information</a></div>

In [1]:
import os
import pandas as pd

# tqdm is a nice progress bar
from tqdm import tqdm

pd.set_option('display.max_columns', 500)

In [2]:
GEM_NAME = 'iJO1366'
ROOT_DIR = os.getcwd()

In [3]:
MODEL_DIR = os.path.join(ROOT_DIR, GEM_NAME)

# data_frames - directory where all data frames will be stored (all stages)
DATA_FRAMES = os.path.join(MODEL_DIR, 'data_frames')

# structure_files - directory where structure related files will be downloaded/are located
STRUCT_FILES = os.path.join(MODEL_DIR, 'structure_files')
STRUCT_EXP_FILES = os.path.join(STRUCT_FILES, 'experimental')
STRUCT_HOMOLOGY_FILES = os.path.join(STRUCT_FILES, 'homology_models')

STRUCT_BEST_SET = os.path.join(STRUCT_FILES, 'best_set')

# Load functions (ssbio package on 212 machine)

In [4]:
# ssbio contains functions to calculate properties
import sys
sys.path.append(os.path.join(ROOT_DIR, 'ssbio'))

# Load the GEM-PRO

In [5]:
DF_GEM_PRO_EC = pd.read_pickle(os.path.join(DATA_FRAMES, 'DF_03_GEMPRO_pub.pckl'))
DF_GEM_PRO_EC.head()

Unnamed: 0,m_reaction,m_subsystem,m_formula,m_metabolites,m_gene_reaction_rule,m_gene,u_uniprot_acc,u_reviewed,u_gene_name,u_ec_number,u_description,u_seq,u_seq_len,u_pfam,u_kegg_id,u_refseq,u_go,u_pdb_count,u_pdb,p_experiment,p_resolution,p_chemicals,p_chains,p_chain_uniprot_map,p_ec_numbers,p_deposition_date,p_doi,p_pmid,p_space_group,i_entry_name,i_length,i_native,i_tm_helix,i_tm_score,i_url,i_label,i_model_type,ssb_p_aln_score,ssb_p_aln_coverage,ssb_p_percent_seq_ident,ssb_p_no_deletions_in_pdb,ssb_p_aln_coverage_sim,normalized,ssb_si_score,ssb_p_chains_from_org,ssb_p_alpha_beta_comp,ssb_alpha_beta_diff_abs,ssb_alpha_beta_diff,ssb_alpha_beta_res_sim,ssb_alpha_beta_res_sim_score,p_resolution_scaled,ssb_rez_score,ssb_raw_score,ssb_above_cutoffs,ssb_rank,ssb_i_alpha_comp,ssb_i_beta_comp,ssb_i_alpha_beta_comp,ssb_best_file
0,12PPDRtex,"Transport, Outer Membrane Porin",12ppd_DASH_R_e <=> 12ppd_DASH_R_p,"['12ppd_DASH_R_e', '12ppd_DASH_R_p']",(b0241 or b0929 or b1377 or b2215),b0241,P02932,True,phoE,,['Outer membrane pore protein E'],MKKSTLALVVMGIVASASVQAAEIYNKDGNKLDVYGKVKAMHYMSD...,351,['PF00267'],"['ecj:Y75_p0232', 'eco:b0241']","['NP_414776.1', 'NC_000913.3', 'YP_488536.1', ...",['GO:0009279; C:cell outer membrane; IEA:UniPr...,1,1PHO,X-RAY DIFFRACTION,3,,['A'],{'A': ['P02932']},,15-JAN-93,10.1038/358727A0,1380671,P 3 2 1,PHOE_ECOLI,351,1phoA,7,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12496,template-based,1710,330,0.940171,True,330,0.889829,1.125009,True,0.575758,0.020202,0.020202,0.350195,0.698345,0.922169,1,2.823354,False,,0.019943,0.535613,0.555556,PHOE_ECOLI_model1.pdb
1,12PPDStex,"Transport, Outer Membrane Porin",12ppd_DASH_S_e <=> 12ppd_DASH_S_p,"['12ppd_DASH_S_e', '12ppd_DASH_S_p']",(b0241 or b0929 or b1377 or b2215),b0241,P02932,True,phoE,,['Outer membrane pore protein E'],MKKSTLALVVMGIVASASVQAAEIYNKDGNKLDVYGKVKAMHYMSD...,351,['PF00267'],"['ecj:Y75_p0232', 'eco:b0241']","['NP_414776.1', 'NC_000913.3', 'YP_488536.1', ...",['GO:0009279; C:cell outer membrane; IEA:UniPr...,1,1PHO,X-RAY DIFFRACTION,3,,['A'],{'A': ['P02932']},,15-JAN-93,10.1038/358727A0,1380671,P 3 2 1,PHOE_ECOLI,351,1phoA,7,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12496,template-based,1710,330,0.940171,True,330,0.889829,1.125009,True,0.575758,0.020202,0.020202,0.350195,0.698345,0.922169,1,2.823354,False,,0.019943,0.535613,0.555556,PHOE_ECOLI_model1.pdb
2,23CAMPtex,"Transport, Outer Membrane Porin",23camp_e <=> 23camp_p,"['23camp_e', '23camp_p']",(b0241 or b0929 or b1377 or b2215),b0241,P02932,True,phoE,,['Outer membrane pore protein E'],MKKSTLALVVMGIVASASVQAAEIYNKDGNKLDVYGKVKAMHYMSD...,351,['PF00267'],"['ecj:Y75_p0232', 'eco:b0241']","['NP_414776.1', 'NC_000913.3', 'YP_488536.1', ...",['GO:0009279; C:cell outer membrane; IEA:UniPr...,1,1PHO,X-RAY DIFFRACTION,3,,['A'],{'A': ['P02932']},,15-JAN-93,10.1038/358727A0,1380671,P 3 2 1,PHOE_ECOLI,351,1phoA,7,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12496,template-based,1710,330,0.940171,True,330,0.889829,1.125009,True,0.575758,0.020202,0.020202,0.350195,0.698345,0.922169,1,2.823354,False,,0.019943,0.535613,0.555556,PHOE_ECOLI_model1.pdb
3,23CCMPtex,"Transport, Outer Membrane Porin",23ccmp_e <=> 23ccmp_p,"['23ccmp_e', '23ccmp_p']",(b0241 or b0929 or b1377 or b2215),b0241,P02932,True,phoE,,['Outer membrane pore protein E'],MKKSTLALVVMGIVASASVQAAEIYNKDGNKLDVYGKVKAMHYMSD...,351,['PF00267'],"['ecj:Y75_p0232', 'eco:b0241']","['NP_414776.1', 'NC_000913.3', 'YP_488536.1', ...",['GO:0009279; C:cell outer membrane; IEA:UniPr...,1,1PHO,X-RAY DIFFRACTION,3,,['A'],{'A': ['P02932']},,15-JAN-93,10.1038/358727A0,1380671,P 3 2 1,PHOE_ECOLI,351,1phoA,7,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12496,template-based,1710,330,0.940171,True,330,0.889829,1.125009,True,0.575758,0.020202,0.020202,0.350195,0.698345,0.922169,1,2.823354,False,,0.019943,0.535613,0.555556,PHOE_ECOLI_model1.pdb
4,23CGMPtex,"Transport, Outer Membrane Porin",23cgmp_e <=> 23cgmp_p,"['23cgmp_e', '23cgmp_p']",(b0241 or b0929 or b1377 or b2215),b0241,P02932,True,phoE,,['Outer membrane pore protein E'],MKKSTLALVVMGIVASASVQAAEIYNKDGNKLDVYGKVKAMHYMSD...,351,['PF00267'],"['ecj:Y75_p0232', 'eco:b0241']","['NP_414776.1', 'NC_000913.3', 'YP_488536.1', ...",['GO:0009279; C:cell outer membrane; IEA:UniPr...,1,1PHO,X-RAY DIFFRACTION,3,,['A'],{'A': ['P02932']},,15-JAN-93,10.1038/358727A0,1380671,P 3 2 1,PHOE_ECOLI,351,1phoA,7,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12496,template-based,1710,330,0.940171,True,330,0.889829,1.125009,True,0.575758,0.020202,0.020202,0.350195,0.698345,0.922169,1,2.823354,False,,0.019943,0.535613,0.555556,PHOE_ECOLI_model1.pdb


# Keep only the representative structures

In [6]:
best_files = DF_GEM_PRO_EC[['m_gene','ssb_best_file']].drop_duplicates().reset_index(drop=True)
best_files.head()

Unnamed: 0,m_gene,ssb_best_file
0,b0241,PHOE_ECOLI_model1.pdb
1,b0929,2zfg.pdb
2,b1377,OMPN_ECOLI_model1.pdb
3,b2215,2j1n.pdb
4,b4032,3pux.pdb


## For testing

In [7]:
STRUCT_BEST_SET = 'ssbio/properties/test_structures'
list_of_pdbs = os.listdir(STRUCT_BEST_SET)

# Calculate the number of potential sulfide bridges (Biopython)

In [8]:
from ssbio.properties.ssbond import count_ss_bond

In [9]:
?count_ss_bond

In [10]:
ss_bond_list = []
for f in tqdm(list_of_pdbs, leave=True):
    ss_bond = count_ss_bond(os.path.join(STRUCT_BEST_SET, f))
    ss_bond_list.append({'ssb_file':f, 'ssb_cys_bridge':ss_bond})

DF_PROP_SSBOND = pd.DataFrame(ss_bond_list)
DF_PROP_SSBOND.to_csv(os.path.join(DATA_FRAMES, 'DF_PROP_SSBOND.csv'))
DF_PROP_SSBOND.head()

100%|██████████| 11/11 [00:01<00:00,  5.30it/s]


Unnamed: 0,ssb_cys_bridge,ssb_file
0,0,PHOE_ECOLI_model1.pdb
1,0,OMPC_ECOLI_model1.pdb
2,3,NAGK_ECOLI_model1.pdb
3,1,OMPA_ECOLI_model1.pdb
4,0,OMPG_ECOLI_model1.pdb


# Calculate SASA, % surface/buried, % secondary structure (DSSP)

In [11]:
from ssbio.properties.dsspprops import all_dssp_props

In [12]:
?all_dssp_props

In [28]:
dssp_props_list = []
for f in tqdm(list_of_pdbs, leave=True):
    dssp_props = all_dssp_props(os.path.join(STRUCT_BEST_SET, f))
    dssp_props['ssb_file'] = f
    dssp_props_list.append(dssp_props)

DF_PROP_DSSP = pd.DataFrame(dssp_props_list)
DF_PROP_DSSP.to_csv(os.path.join(DATA_FRAMES, 'DF_PROP_DSSP.csv'))
DF_PROP_DSSP.head()

100%|██████████| 11/11 [00:02<00:00,  4.30it/s]


Unnamed: 0,ssb_file,ssb_mean_rel_exposed,ssb_per_310_helix,ssb_per_5_helix,ssb_per_B,ssb_per_B_NP,ssb_per_B_P,ssb_per_B_neg,ssb_per_B_pos,ssb_per_S,ssb_per_S_NP,ssb_per_S_P,ssb_per_S_neg,ssb_per_S_pos,ssb_per_alpha,ssb_per_bent,ssb_per_beta_bridge,ssb_per_ext_beta,ssb_per_hbond_turn,ssb_per_irr,ssb_sasa,ssb_size
0,PHOE_ECOLI_model1.pdb,0.289705,0.022792,0,0.182336,0.094017,0.068376,0.017094,0.002849,0.817664,0.353276,0.247863,0.111111,0.105413,0.011396,0.125356,0.011396,0.524217,0.094017,0.210826,16901,351
1,OMPC_ECOLI_model1.pdb,0.297066,0.024523,0,0.163488,0.087193,0.06267,0.010899,0.002725,0.836512,0.376022,0.27248,0.106267,0.081744,0.043597,0.106267,0.019074,0.52861,0.095368,0.182561,17732,367
2,NAGK_ECOLI_model1.pdb,0.276746,0.052805,0,0.254125,0.211221,0.033003,0.006601,0.0033,0.745875,0.356436,0.161716,0.115512,0.112211,0.359736,0.112211,0.019802,0.188119,0.09901,0.168317,13819,303
3,OMPA_ECOLI_model1.pdb,0.535638,0.0,0,0.014451,0.008671,0.00289,0.00289,0.0,0.985549,0.511561,0.268786,0.098266,0.106936,0.16763,0.138728,0.00289,0.150289,0.078035,0.462428,29514,346
4,OMPG_ECOLI_model1.pdb,0.474059,0.0,0,0.023256,0.019934,0.003322,0.0,0.0,0.976744,0.435216,0.272425,0.166113,0.10299,0.0,0.189369,0.0,0.421927,0.013289,0.375415,24231,301


# Calculate surface & residue depth using MSMS

In [14]:
from ssbio.properties.msmsprops import msms_output, residue_depth

In [15]:
?msms_output

In [16]:
?residue_depth

In [17]:
msmsinfo = []
redinfo = []

msms_errors = []

for i in tqdm(list_of_pdbs, leave=True):
    try:
        msms_stuff = msms_output(os.path.join(STRUCT_BEST_SET, i))
    except:
        msms_errors.append(i)
        continue
        
    msmsinfo.append([i, msms_stuff])
    
    red_dict = residue_depth(msms_stuff)
    red_dict['ssb_file'] = i
    redinfo.append(red_dict)
    
DF_PROP_MSMS = pd.DataFrame(msmsinfo)
DF_PROP_MSMS.columns=['ssb_file','ssb_msms']
DF_PROP_MSMS.to_csv('DF_PROP_MSMS.csv')
DF_PROP_MSMS.head()

100%|██████████| 11/11 [01:33<00:00, 12.35s/it]


Unnamed: 0,ssb_file,ssb_msms
0,PHOE_ECOLI_model1.pdb,"[[X, 1, 1.82956377516, 1.99945048291], [X, 2, ..."
1,OMPC_ECOLI_model1.pdb,"[[X, 1, 1.82965439368, 1.9996068297], [X, 2, 2..."
2,NAGK_ECOLI_model1.pdb,"[[X, 1, 1.96540883302, 2.00019860426], [X, 2, ..."
3,OMPA_ECOLI_model1.pdb,"[[X, 1, 1.82960163044, 1.99979346159], [X, 2, ..."
4,OMPG_ECOLI_model1.pdb,"[[X, 1, 1.82956501513, 1.99973236858], [X, 2, ..."


In [18]:
DF_PROP_DEPTH = pd.DataFrame(redinfo)
DF_PROP_DEPTH.to_csv(os.path.join(DATA_FRAMES, 'DF_PROP_DEPTH.csv'))
DF_PROP_DEPTH.head()

Unnamed: 0,ssb_avg_res_depth,ssb_ca_depth,ssb_file
0,2.466982,2.641946,PHOE_ECOLI_model1.pdb
1,2.414181,2.627883,OMPC_ECOLI_model1.pdb
2,2.731797,2.815295,NAGK_ECOLI_model1.pdb
3,1.98012,2.195048,OMPA_ECOLI_model1.pdb
4,1.982505,2.280696,OMPG_ECOLI_model1.pdb


In [19]:
print(msms_errors)

['1u8f.cif']


# Calculating % polar, nonpolar, etc. residues

In [20]:
from ssbio.properties.resprops import residue_props

In [21]:
?residue_props

In [22]:
nlist = []

for i in tqdm(list_of_pdbs, leave=True):
    res_dict = residue_props(os.path.join(STRUCT_BEST_SET, i))
    res_dict['ssb_file'] = i
    nlist.append(res_dict)
    
DF_PROP_RES = pd.DataFrame(nlist)
DF_PROP_RES.to_csv(os.path.join(DATA_FRAMES, 'DF_PROP_RES.csv'))
DF_PROP_RES.head()

100%|██████████| 11/11 [00:01<00:00,  5.25it/s]


Unnamed: 0,ssb_file,ssb_per_NP,ssb_per_P,ssb_per_neg,ssb_per_pos
0,PHOE_ECOLI_model1.pdb,0.447293,0.316239,0.128205,0.108262
1,OMPC_ECOLI_model1.pdb,0.463215,0.33515,0.117166,0.084469
2,NAGK_ECOLI_model1.pdb,0.567657,0.194719,0.122112,0.115512
3,OMPA_ECOLI_model1.pdb,0.520231,0.271676,0.101156,0.106936
4,OMPG_ECOLI_model1.pdb,0.45515,0.275748,0.166113,0.10299


# Merge all information

In [25]:
DF_PROP_ALL = DF_PROP_SSBOND.merge(DF_PROP_DEPTH).merge(DF_PROP_RES).merge(DF_PROP_DSSP)
DF_PROP_ALL.head()

Unnamed: 0,ssb_cys_bridge,ssb_file,ssb_avg_res_depth,ssb_ca_depth,ssb_per_NP,ssb_per_P,ssb_per_neg,ssb_per_pos,ssb_mean_rel_exposed,ssb_per_310_helix,ssb_per_5_helix,ssb_per_B,ssb_per_B_NP,ssb_per_B_P,ssb_per_B_neg,ssb_per_B_pos,ssb_per_S,ssb_per_S_NP,ssb_per_S_P,ssb_per_S_neg,ssb_per_S_pos,ssb_per_alpha,ssb_per_bent,ssb_per_beta_bridge,ssb_per_ext_beta,ssb_per_hbond_turn,ssb_per_irr,ssb_sasa,ssb_size
0,0,PHOE_ECOLI_model1.pdb,2.466982,2.641946,0.447293,0.316239,0.128205,0.108262,0.289705,0.022792,0,0.182336,0.094017,0.068376,0.017094,0.002849,0.817664,0.353276,0.247863,0.111111,0.105413,0.011396,0.125356,0.011396,0.524217,0.094017,0.210826,16901,351
1,0,OMPC_ECOLI_model1.pdb,2.414181,2.627883,0.463215,0.33515,0.117166,0.084469,0.297066,0.024523,0,0.163488,0.087193,0.06267,0.010899,0.002725,0.836512,0.376022,0.27248,0.106267,0.081744,0.043597,0.106267,0.019074,0.52861,0.095368,0.182561,17732,367
2,3,NAGK_ECOLI_model1.pdb,2.731797,2.815295,0.567657,0.194719,0.122112,0.115512,0.276746,0.052805,0,0.254125,0.211221,0.033003,0.006601,0.0033,0.745875,0.356436,0.161716,0.115512,0.112211,0.359736,0.112211,0.019802,0.188119,0.09901,0.168317,13819,303
3,1,OMPA_ECOLI_model1.pdb,1.98012,2.195048,0.520231,0.271676,0.101156,0.106936,0.535638,0.0,0,0.014451,0.008671,0.00289,0.00289,0.0,0.985549,0.511561,0.268786,0.098266,0.106936,0.16763,0.138728,0.00289,0.150289,0.078035,0.462428,29514,346
4,0,OMPG_ECOLI_model1.pdb,1.982505,2.280696,0.45515,0.275748,0.166113,0.10299,0.474059,0.0,0,0.023256,0.019934,0.003322,0.0,0.0,0.976744,0.435216,0.272425,0.166113,0.10299,0.0,0.189369,0.0,0.421927,0.013289,0.375415,24231,301


In [27]:
DF_PROP_ALL.to_pickle(os.path.join(DATA_FRAMES, 'DF_PROP_ALL.pckl'))