# Table of Contents
 <p><div class="lev1"><a href="#Introduction"><span class="toc-item-num">1 - </span>Introduction</a></div><div class="lev1"><a href="#Folder-initialization-&amp;-imports"><span class="toc-item-num">2 - </span>Folder initialization &amp; imports</a></div><div class="lev1"><a href="#Loading-provided-dataframes"><span class="toc-item-num">3 - </span>Loading provided dataframes</a></div><div class="lev2"><a href="#Loading-the-GEM-PRO"><span class="toc-item-num">3.1 - </span>Loading the GEM-PRO</a></div><div class="lev2"><a href="#Loading-orthologous-genes-and-sequences"><span class="toc-item-num">3.2 - </span>Loading orthologous genes and sequences</a></div><div class="lev3"><a href="#Various-cleaning-operations"><span class="toc-item-num">3.2.1 - </span>Various cleaning operations</a></div><div class="lev3"><a href="#Functions-to-map-between-strain/model-name,-and-strain-genes-to-k12-genes"><span class="toc-item-num">3.2.2 - </span>Functions to map between strain/model name, and strain genes to k12 genes</a></div><div class="lev1"><a href="#Writing-FASTA-files-for-alignments"><span class="toc-item-num">4 - </span>Writing FASTA files for alignments</a></div><div class="lev1"><a href="#Running-alignments"><span class="toc-item-num">5 - </span>Running alignments</a></div>

# Introduction

<span style="background-color:#66FF99">**This IPython notebook will guide you through running sequence alignments across the different strains of *E. coli***</span>

There are 2 main stages to this workflow.

1. Loading dataframes which include the representative structures, orthologous genes, and gene sequences
2. Running all alignments

# Folder initialization & imports

<span style="background-color:#66FF99">**Before we begin, we are going to set up the folder structure for our files.**</span>

In [1]:
# Execute the companion notebook inside this kernel so that the names it defines
# become available here. The cells below use `os` and `pd` without importing them,
# so they are presumably provided by that notebook - TODO confirm.
%run 'ssbio_01.ipynb'

@> ProDy is configured: verbosity='none'
INFO:.prody:ProDy is configured: verbosity='none'


In [2]:
# Identifier of the genome-scale model; it is also used as the name of the
# per-model folder when MODEL_DIR is built.
GEM_NAME = 'iJO1366'
# NOTE(review): not referenced in the visible cells - presumably names the gene ID
# namespace used by the model; confirm against the later stages.
GEM_GENE_SOURCE = 'ENSEMBLGENOME_ID'
# NOTE(review): not referenced in the visible cells - presumably restricts UniProt
# lookups to reviewed entries; confirm against the later stages.
UNIPROT_REVIEWED_ONLY = True
# Every path below is rooted at the working directory at the time this cell runs.
ROOT_DIR = os.getcwd()

In [3]:
# Root folder for everything belonging to this model: <ROOT_DIR>/<GEM_NAME>
MODEL_DIR = os.path.join(ROOT_DIR, GEM_NAME)

# data_frames - directory where all data frames will be stored (all stages)
DATA_FRAMES = os.path.join(MODEL_DIR, 'data_frames')

# model_files - directory where original GEMs and GEM-related files are stored
MODEL_FILES = os.path.join(MODEL_DIR, 'model_files')

# sequence_files - directory where all sequence related files are stored
SEQ_FILES = os.path.join(MODEL_DIR, 'sequence_files')
SEQ_UNIPROT_FILES = os.path.join(SEQ_FILES, 'uniprot_sequences')
SEQ_PDB_FILES = os.path.join(SEQ_FILES, 'pdb_sequences')
SEQ_ALIGN_FILES = os.path.join(SEQ_FILES, 'alignment')

# structure_files - directory where structure related files will be downloaded/are located
STRUCT_FILES = os.path.join(MODEL_DIR, 'structure_files')
STRUCT_EXP_FILES = os.path.join(STRUCT_FILES, 'experimental')
STRUCT_HOMOLOGY_FILES = os.path.join(STRUCT_FILES, 'homology_models')

# structure_files/best_set - sub-folders for the selected structures.
# These are joined with os.path.join rather than string concatenation because
# MODEL_DIR carries no trailing separator; plain concatenation would produce
# '<...>/iJO1366structure_files/...'. The empty final component keeps a trailing
# separator on each path, so code that builds file names as `PATH + filename`
# keeps working.
STRUCT_BEST_EXP_PERFECT = os.path.join(STRUCT_FILES, 'best_set', 'experimental_perfect', '')
STRUCT_BEST_EXP_MUTATED = os.path.join(STRUCT_FILES, 'best_set', 'experimental_mutated', '')
STRUCT_BEST_HOMOLOGY = os.path.join(STRUCT_FILES, 'best_set', 'homology_models', '')

# Loading provided dataframes

## Loading the GEM-PRO

In [4]:
# Location of the pickled GEM-PRO data frame inside the data_frames folder.
iJO1366_GEMPRO_path = os.path.join(DATA_FRAMES, 'DF_03_GEMPRO_pub.pckl')

In [5]:
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
iJO1366_GEMPRO = pd.read_pickle(iJO1366_GEMPRO_path)

# Model-level (m_*) columns that are left out of the condensed table.
droplist = [
    'm_reaction',
    'm_gene_reaction_rule',
    'm_metabolites',
    'm_subsystem',
    'm_formula',
]

# Remove those columns, collapse the rows that became identical, and order by index.
iJO1366_GEMPRO_min = (
    iJO1366_GEMPRO
    .drop(droplist, axis=1)
    .drop_duplicates()
    .sort_index()
)
iJO1366_GEMPRO_min.head()

Unnamed: 0,m_gene,u_uniprot_acc,u_reviewed,u_gene_name,u_ec_number,u_description,u_seq,u_seq_len,u_pfam,u_kegg_id,u_refseq,u_go,u_pdb_count,u_pdb,p_experiment,p_resolution,p_chemicals,p_chains,p_chain_uniprot_map,p_ec_numbers,p_deposition_date,p_doi,p_pmid,p_space_group,i_entry_name,i_length,i_native,i_tm_helix,i_tm_score,i_url,i_label,i_model_type,ssb_p_aln_score,ssb_p_aln_coverage,ssb_p_percent_seq_ident,ssb_p_no_deletions_in_pdb,ssb_p_aln_coverage_sim,normalized,ssb_si_score,ssb_p_chains_from_org,ssb_p_alpha_beta_comp,ssb_alpha_beta_diff_abs,ssb_alpha_beta_diff,ssb_alpha_beta_res_sim,ssb_alpha_beta_res_sim_score,p_resolution_scaled,ssb_rez_score,ssb_raw_score,ssb_above_cutoffs,ssb_rank,ssb_i_alpha_comp,ssb_i_beta_comp,ssb_i_alpha_beta_comp,ssb_best_file
0,b0241,P02932,True,phoE,,['Outer membrane pore protein E'],MKKSTLALVVMGIVASASVQAAEIYNKDGNKLDVYGKVKAMHYMSD...,351,['PF00267'],"['ecj:Y75_p0232', 'eco:b0241']","['NP_414776.1', 'NC_000913.3', 'YP_488536.1', ...",['GO:0009279; C:cell outer membrane; IEA:UniPr...,1,1PHO,X-RAY DIFFRACTION,3.0,,['A'],{'A': ['P02932']},,15-JAN-93,10.1038/358727A0,1380671.0,P 3 2 1,PHOE_ECOLI,351,1phoA,7,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12496,template-based,1710,330,0.940171,True,330,0.889829,1.125009,True,0.575758,0.020202,0.020202,0.350195,0.698345,0.922169,1.0,2.823354,False,,0.019943,0.535613,0.555556,PHOE_ECOLI_model1.pdb
270,b0929,P02931,True,ompF,,"['Porin OmpF', 'Outer membrane protein IA', 'O...",MMKRNILAVIVPALLVAGTANAAEIYNKDGNKVDLYGKAVGLHYFS...,362,['PF00267'],"['ecj:Y75_p0901', 'eco:b0929']","['NP_415449.1', 'NC_000913.3', 'YP_489201.1', ...",['GO:0009279; C:cell outer membrane; IDA:EcoCy...,30,1BT9,X-RAY DIFFRACTION,3.0,,['A'],{'A': ['P02931']},,01-SEP-98,10.1021/BI981215C,9843370.0,P 3 2 1,OMPF_ECOLI,362,1opfA,13,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12394,template-based,1797,339,0.936464,True,339,0.914097,1.120574,True,0.623529,0.037894,0.037894,0.792453,1.58028,0.922169,1.0,3.700854,True,10.0,0.030387,0.555249,0.585635,2zfg.pdb
541,b0929,P02931,True,ompF,,"['Porin OmpF', 'Outer membrane protein IA', 'O...",MMKRNILAVIVPALLVAGTANAAEIYNKDGNKVDLYGKAVGLHYFS...,362,['PF00267'],"['ecj:Y75_p0901', 'eco:b0929']","['NP_415449.1', 'NC_000913.3', 'YP_489201.1', ...",['GO:0009279; C:cell outer membrane; IDA:EcoCy...,30,4JFB,X-RAY DIFFRACTION,3.8,,"['A', 'C', 'B', 'F', 'D', 'E']","{'A': ['P02931'], 'C': ['P02931'], 'B': ['P029...",,28-FEB-13,,,C 1 2 1,OMPF_ECOLI,362,1opfA,13,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12394,template-based,1805,340,0.939227,True,340,0.916793,1.123879,True,0.640196,0.054561,0.054561,0.788991,1.573377,0.89266,0.968,3.665256,False,,0.030387,0.555249,0.585635,2zfg.pdb
812,b0929,P02931,True,ompF,,"['Porin OmpF', 'Outer membrane protein IA', 'O...",MMKRNILAVIVPALLVAGTANAAEIYNKDGNKVDLYGKAVGLHYFS...,362,['PF00267'],"['ecj:Y75_p0901', 'eco:b0929']","['NP_415449.1', 'NC_000913.3', 'YP_489201.1', ...",['GO:0009279; C:cell outer membrane; IDA:EcoCy...,30,3FYX,X-RAY DIFFRACTION,3.4,"['EPE', '451']",['A'],{'A': ['P02931']},,23-JAN-09,10.1002/ANIE.200900457,19322865.0,P 3 2 1,OMPF_ECOLI,362,1opfA,13,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12394,template-based,1797,339,0.936464,True,339,0.914097,1.120574,True,0.623529,0.037894,0.037894,0.763889,1.523319,0.907414,0.984,3.627893,False,,0.030387,0.555249,0.585635,2zfg.pdb
1083,b0929,P02931,True,ompF,,"['Porin OmpF', 'Outer membrane protein IA', 'O...",MMKRNILAVIVPALLVAGTANAAEIYNKDGNKVDLYGKAVGLHYFS...,362,['PF00267'],"['ecj:Y75_p0901', 'eco:b0929']","['NP_415449.1', 'NC_000913.3', 'YP_489201.1', ...",['GO:0009279; C:cell outer membrane; IDA:EcoCy...,30,3K1B,X-RAY DIFFRACTION,4.39,,"['A', 'C', 'B', 'D']","{'A': ['P02931'], 'C': ['P02931'], 'B': ['P029...",,26-SEP-09,10.1002/PRO.369,20196071.0,P 3 2 1,OMPF_ECOLI,362,1opfA,13,,http://zhanglab.ccmb.med.umich.edu/QUARK/ecoli...,E12394,template-based,1805,340,0.939227,True,340,0.916793,1.123879,True,0.574265,0.011371,-0.011371,0.695238,1.386418,0.870896,0.9444,3.454698,False,,0.030387,0.555249,0.585635,2zfg.pdb


## Loading orthologous genes and sequences

In [6]:
# Both supplement tables are sheets of the same workbook.
monk_supplement_path = os.path.join(DATA_FRAMES, '55strains_monk.xlsx')

# `sheet_name` and `skipfooter` are the current spellings of the keywords that were
# formerly called `sheetname` and `skip_footer`; recent pandas releases reject the
# old names with a TypeError.
monk_supplement_t1 = pd.read_excel(monk_supplement_path, sheet_name='Table 1',
                                   skiprows=2, index_col=0, skipfooter=1)
monk_supplement_t8 = pd.read_excel(monk_supplement_path, sheet_name='Table 8', skiprows=1)

# Orthologous-gene table and per-strain sequence table (first sheet of each workbook).
monk_gene_consv = pd.read_excel(os.path.join(DATA_FRAMES, '55strains_orthologous.xlsx'))
monk_gene_seqs = pd.read_excel(os.path.join(DATA_FRAMES, '55strains_sequences.xlsx'))

### Various cleaning operations

In [7]:
# Keep only the columns whose label ends with '_LOCI'.
# The unwanted labels are collected first and removed in one call, so the frame is
# never modified while its columns are being iterated and is rewritten only once.
non_loci_columns = [column for column in monk_gene_consv.columns
                    if not column.endswith('_LOCI')]
monk_gene_consv.drop(non_loci_columns, axis=1, inplace=True)

In [8]:
# Three column labels contain '(DE3)' inside the strain name itself. They are
# rewritten without those parentheses so that a later split on the first '('
# does not cut all three names down to the same 'Escherichia coli BL21' prefix.
bl21_de3_labels = {
    'Escherichia coli BL21(DE3) AM946981(469008.14)_LOCI':
        'Escherichia coli BL21 DE3 AM946981(469008.14)_LOCI',
    'Escherichia coli BL21(DE3) BL21-Gold(DE3)pLysS AG(469008.4)_LOCI':
        'Escherichia coli BL21 DE3 BL21-Gold DE3 pLysS AG(469008.4)_LOCI',
    'Escherichia coli BL21(DE3) CP001509(469008.13)_LOCI':
        'Escherichia coli BL21 DE3 CP001509(469008.13)_LOCI',
}
monk_gene_consv.rename(columns=bl21_de3_labels, inplace=True)

In [9]:
# Keep only the text before the first '(' in every column label, e.g.
# 'Escherichia coli 042(216592.3)_LOCI' -> 'Escherichia coli 042'.
monk_gene_consv.rename(columns=lambda label: label.partition('(')[0], inplace=True)

In [10]:
# Preview of the orthologous gene table: one column per strain, holding the locus
# identifier recorded for that strain in each row (empty where none is listed).
monk_gene_consv.head()

Unnamed: 0,Escherichia coli 042,Escherichia coli 536,Escherichia coli 55989,Escherichia coli ABU 83972,Escherichia coli APEC O1,Escherichia coli ATCC 8739,Escherichia coli B str. REL606,Escherichia coli BL21 DE3 AM946981,Escherichia coli BL21 DE3 BL21-Gold DE3 pLysS AG,Escherichia coli BL21 DE3 CP001509,Escherichia coli BW2952,Escherichia coli CFT073,Escherichia coli DH1 ME8569,Escherichia coli DH1,Escherichia coli E24377A,Escherichia coli ED1a,Escherichia coli ETEC H10407,Escherichia coli HS,Escherichia coli IAI1,Escherichia coli IAI39,Escherichia coli IHE3034,Escherichia coli KO11FL,Escherichia coli LF82,Escherichia coli NA114,Escherichia coli O103:H2 str. 12009,Escherichia coli O111:H- str. 11128,Escherichia coli O127:H6 str. E2348/69,Escherichia coli O157:H7 EDL933,Escherichia coli O157:H7 str. EC4115,Escherichia coli O157:H7 str. Sakai,Escherichia coli O157:H7 str. TW14359,Escherichia coli O26:H11 str. 11368,Escherichia coli O55:H7 str. CB9615,Escherichia coli O83:H1 str. NRG 857C,Escherichia coli S88,Escherichia coli SE11,Escherichia coli SE15,Escherichia coli SMS-3-5,Escherichia coli UM146,Escherichia coli UMN026,Escherichia coli UMNK88,Escherichia coli UTI89,Escherichia coli W CP002185,Escherichia coli W,Escherichia coli str. K-12 substr. DH10B,Escherichia coli str. K-12 substr. MG1655,Escherichia coli str. K-12 substr. W3110,Shigella boydii CDC 3083-94,Shigella boydii Sb227,Shigella dysenteriae Sd197,Shigella flexneri 2002017,Shigella flexneri 2a str. 2457T,Shigella flexneri 2a str. 301,Shigella flexneri 5 str. 8401,Shigella sonnei Ss046
b0002,EC042_0001,ECP_0002,EC55989_0002,ECABU_c00010,APECO1_1976,EcolC_3653,ECB_00002,ECBD_3616,ECBD_3616,ECBD_3616,BWG_0002,c0003,EcDH1_3594,EcDH1_3594,EcE24377A_0001,,ETEC_0002,,ECIAI1_0002,ECIAI39_0001,ECOK1_0002,EKO11DRAFT_3995,LF82_2259,ECNA114_4646,ECO103_0002,ECO111_0002,E2348C_0002,,ECH74115_0003,ECs0002,ECSP_0002,ECO26_0002,G2583_0002,NRG857_00015,ECS88_0002,ECSE_0002,ECSF_0002,EcSMS35_0001,,ECUMN_0002,UMNK88_1,UTI89_C0002,EschWDRAFT_3337,EschWDRAFT_3337,ECDH10B_0002,b0002,,SbBS512_E0002,SBO_0001,SDY_0002,SFxv_0001,S0002,,SFV_0001,SSON_0002
b0003,EC042_0002,ECP_0003,EC55989_0003,ECABU_c00020,APECO1_1975,EcolC_3652,ECB_00003,ECBD_3615,ECBD_3615,ECBD_3615,BWG_0003,c0004,EcDH1_3593,EcDH1_3593,EcE24377A_0002,,ETEC_0003,,ECIAI1_0003,ECIAI39_0002,ECOK1_0003,EKO11DRAFT_3994,LF82_2260,ECNA114_4647,ECO103_0003,ECO111_0003,E2348C_0003,,ECH74115_0004,ECs0003,ECSP_0003,ECO26_0003,G2583_0003,NRG857_00020,ECS88_0003,ECSE_0003,ECSF_0003,EcSMS35_0002,,ECUMN_0003,UMNK88_2,UTI89_C0003,EschWDRAFT_3336,EschWDRAFT_3336,ECDH10B_0003,b0003,,SbBS512_E0003,SBO_0002,SDY_0003,SFxv_0002,S0003,,SFV_0002,SSON_0003
b0004,EC042_0003,ECP_0004,EC55989_0004,ECABU_c00030,APECO1_1974,EcolC_3651,ECB_00004,ECBD_3614,ECBD_3614,ECBD_3614,BWG_0004,c0005,EcDH1_3592,EcDH1_3592,EcE24377A_0003,,ETEC_0004,,ECIAI1_0004,ECIAI39_0003,ECOK1_0004,EKO11DRAFT_3993,LF82_2261,ECNA114_4648,ECO103_0004,ECO111_0004,E2348C_0004,,ECH74115_0005,ECs0004,ECSP_0004,ECO26_0004,G2583_0004,NRG857_00025,ECS88_0004,ECSE_0004,ECSF_0004,EcSMS35_0003,,ECUMN_0004,UMNK88_3,UTI89_C0004,EschWDRAFT_3335,EschWDRAFT_3335,ECDH10B_0004,b0004,,SbBS512_E0004,SBO_0003,SDY_0004,SFxv_0003,S0004,,SFV_0003,SSON_0004
b0006,EC042_0007,ECP_0007,EC55989_0006,ECABU_c00070,APECO1_1972,EcolC_3649,ECB_00006,ECBD_3612,ECBD_3612,ECBD_3612,BWG_0006,c0010,EcDH1_3590,EcDH1_3590,EcE24377A_0006,,ETEC_0006,,ECIAI1_0006,ECIAI39_0005,ECOK1_0006,EKO11DRAFT_1579,LF82_2459,ECNA114_4650,ECO103_0007,ECO111_0006,E2348C_0006,,ECH74115_0007,ECs0006,ECSP_0006,,G2583_0006,NRG857_00035,ECS88_0006,ECSE_0006,ECSF_0006,EcSMS35_0005,,ECUMN_0006,UMNK88_5,UTI89_C0007,,,ECDH10B_0006,b0006,,SbBS512_E0006,SBO_0005,SDY_0006,SFxv_0005,S0006,,SFV_0005,SSON_0006
b0007,EC042_0008,ECP_0008,EC55989_0007,ECABU_c00080,APECO1_1971,EcolC_3648,ECB_00007,ECBD_3611,ECBD_3611,ECBD_3611,BWG_0007,c0011,EcDH1_3589,EcDH1_3589,EcE24377A_0007,,ETEC_0007,,ECIAI1_0007,ECIAI39_0006,ECOK1_0007,EKO11DRAFT_1578,LF82_2462,ECNA114_4651,ECO103_0008,ECO111_0007,E2348C_0007,,ECH74115_0008,ECs0007,ECSP_0007,ECO26_0007,G2583_0007,NRG857_00040,ECS88_0007,ECSE_0007,ECSF_0007,EcSMS35_0006,,ECUMN_0007,UMNK88_6,UTI89_C0008,,,ECDH10B_0007,b0007,,SbBS512_E0007,,SDY_0007,,,,,


In [11]:
# Preview of the per-strain sequence table: each strain contributes four columns
# (_FIGFAM, _LOCI, _PID, _SEQ). The _SEQ values shown are amino-acid strings,
# i.e. protein sequences rather than nucleotide sequences.
monk_gene_seqs.head()

Unnamed: 0,Escherichia coli 042(216592.3)_FIGFAM,Escherichia coli 042(216592.3)_LOCI,Escherichia coli 042(216592.3)_PID,Escherichia coli 042(216592.3)_SEQ,Escherichia coli 536(362663.9)_FIGFAM,Escherichia coli 536(362663.9)_LOCI,Escherichia coli 536(362663.9)_PID,Escherichia coli 536(362663.9)_SEQ,Escherichia coli 55989(585055.6)_FIGFAM,Escherichia coli 55989(585055.6)_LOCI,Escherichia coli 55989(585055.6)_PID,Escherichia coli 55989(585055.6)_SEQ,Escherichia coli ABU 83972(655817.3)_FIGFAM,Escherichia coli ABU 83972(655817.3)_LOCI,Escherichia coli ABU 83972(655817.3)_PID,Escherichia coli ABU 83972(655817.3)_SEQ,Escherichia coli APEC O1(405955.13)_FIGFAM,Escherichia coli APEC O1(405955.13)_LOCI,Escherichia coli APEC O1(405955.13)_PID,Escherichia coli APEC O1(405955.13)_SEQ,Escherichia coli ATCC 8739(481805.6)_FIGFAM,Escherichia coli ATCC 8739(481805.6)_LOCI,Escherichia coli ATCC 8739(481805.6)_PID,Escherichia coli ATCC 8739(481805.6)_SEQ,Escherichia coli B str. REL606(413997.3)_FIGFAM,Escherichia coli B str. REL606(413997.3)_LOCI,Escherichia coli B str. REL606(413997.3)_PID,Escherichia coli B str. 
REL606(413997.3)_SEQ,Escherichia coli BL21(DE3) AM946981(469008.14)_FIGFAM,Escherichia coli BL21(DE3) AM946981(469008.14)_LOCI,Escherichia coli BL21(DE3) AM946981(469008.14)_PID,Escherichia coli BL21(DE3) AM946981(469008.14)_SEQ,Escherichia coli BL21(DE3) BL21-Gold(DE3)pLysS AG(469008.4)_FIGFAM,Escherichia coli BL21(DE3) BL21-Gold(DE3)pLysS AG(469008.4)_LOCI,Escherichia coli BL21(DE3) BL21-Gold(DE3)pLysS AG(469008.4)_PID,Escherichia coli BL21(DE3) BL21-Gold(DE3)pLysS AG(469008.4)_SEQ,Escherichia coli BL21(DE3) CP001509(469008.13)_FIGFAM,Escherichia coli BL21(DE3) CP001509(469008.13)_LOCI,Escherichia coli BL21(DE3) CP001509(469008.13)_PID,Escherichia coli BL21(DE3) CP001509(469008.13)_SEQ,Escherichia coli BW2952(595496.3)_FIGFAM,Escherichia coli BW2952(595496.3)_LOCI,Escherichia coli BW2952(595496.3)_PID,Escherichia coli BW2952(595496.3)_SEQ,Escherichia coli CFT073(199310.4)_FIGFAM,Escherichia coli CFT073(199310.4)_LOCI,Escherichia coli CFT073(199310.4)_PID,Escherichia coli CFT073(199310.4)_SEQ,Escherichia coli DH1 ME8569(536056.4)_FIGFAM,Escherichia coli DH1 ME8569(536056.4)_LOCI,Escherichia coli DH1 ME8569(536056.4)_PID,Escherichia coli DH1 ME8569(536056.4)_SEQ,Escherichia coli DH1(536056.3)_FIGFAM,Escherichia coli DH1(536056.3)_LOCI,Escherichia coli DH1(536056.3)_PID,Escherichia coli DH1(536056.3)_SEQ,Escherichia coli E24377A(331111.12)_FIGFAM,Escherichia coli E24377A(331111.12)_LOCI,Escherichia coli E24377A(331111.12)_PID,Escherichia coli E24377A(331111.12)_SEQ,Escherichia coli ED1a(585397.7)_FIGFAM,Escherichia coli ED1a(585397.7)_LOCI,Escherichia coli ED1a(585397.7)_PID,Escherichia coli ED1a(585397.7)_SEQ,Escherichia coli ETEC H10407(316401.4)_FIGFAM,Escherichia coli ETEC H10407(316401.4)_LOCI,Escherichia coli ETEC H10407(316401.4)_PID,Escherichia coli ETEC H10407(316401.4)_SEQ,Escherichia coli HS(331112.3)_FIGFAM,Escherichia coli HS(331112.3)_LOCI,Escherichia coli HS(331112.3)_PID,Escherichia coli HS(331112.3)_SEQ,Escherichia coli 
IAI1(585034.4)_FIGFAM,Escherichia coli IAI1(585034.4)_LOCI,Escherichia coli IAI1(585034.4)_PID,Escherichia coli IAI1(585034.4)_SEQ,Escherichia coli IAI39(585057.6)_FIGFAM,Escherichia coli IAI39(585057.6)_LOCI,Escherichia coli IAI39(585057.6)_PID,Escherichia coli IAI39(585057.6)_SEQ,Escherichia coli IHE3034(714962.3)_FIGFAM,Escherichia coli IHE3034(714962.3)_LOCI,Escherichia coli IHE3034(714962.3)_PID,Escherichia coli IHE3034(714962.3)_SEQ,Escherichia coli KO11FL(595495.4)_FIGFAM,Escherichia coli KO11FL(595495.4)_LOCI,Escherichia coli KO11FL(595495.4)_PID,Escherichia coli KO11FL(595495.4)_SEQ,Escherichia coli LF82(591946.4)_FIGFAM,Escherichia coli LF82(591946.4)_LOCI,Escherichia coli LF82(591946.4)_PID,Escherichia coli LF82(591946.4)_SEQ,Escherichia coli NA114(1033813.3)_FIGFAM,Escherichia coli NA114(1033813.3)_LOCI,Escherichia coli NA114(1033813.3)_PID,Escherichia coli NA114(1033813.3)_SEQ,Escherichia coli O103:H2 str. 12009(585395.4)_FIGFAM,Escherichia coli O103:H2 str. 12009(585395.4)_LOCI,Escherichia coli O103:H2 str. 12009(585395.4)_PID,Escherichia coli O103:H2 str. 12009(585395.4)_SEQ,Escherichia coli O111:H- str. 11128(585396.4)_FIGFAM,Escherichia coli O111:H- str. 11128(585396.4)_LOCI,Escherichia coli O111:H- str. 11128(585396.4)_PID,Escherichia coli O111:H- str. 11128(585396.4)_SEQ,Escherichia coli O127:H6 str. E2348/69(574521.7)_FIGFAM,Escherichia coli O127:H6 str. E2348/69(574521.7)_LOCI,Escherichia coli O127:H6 str. E2348/69(574521.7)_PID,Escherichia coli O127:H6 str. E2348/69(574521.7)_SEQ,Escherichia coli O157:H7 EDL933(155864.8)_FIGFAM,Escherichia coli O157:H7 EDL933(155864.8)_LOCI,Escherichia coli O157:H7 EDL933(155864.8)_PID,Escherichia coli O157:H7 EDL933(155864.8)_SEQ,Escherichia coli O157:H7 str. EC4115(444450.8)_FIGFAM,Escherichia coli O157:H7 str. EC4115(444450.8)_LOCI,Escherichia coli O157:H7 str. EC4115(444450.8)_PID,Escherichia coli O157:H7 str. EC4115(444450.8)_SEQ,Escherichia coli O157:H7 str. 
Sakai(386585.9)_FIGFAM,Escherichia coli O157:H7 str. Sakai(386585.9)_LOCI,Escherichia coli O157:H7 str. Sakai(386585.9)_PID,Escherichia coli O157:H7 str. Sakai(386585.9)_SEQ,Escherichia coli O157:H7 str. TW14359(544404.4)_FIGFAM,Escherichia coli O157:H7 str. TW14359(544404.4)_LOCI,Escherichia coli O157:H7 str. TW14359(544404.4)_PID,Escherichia coli O157:H7 str. TW14359(544404.4)_SEQ,Escherichia coli O26:H11 str. 11368(573235.3)_FIGFAM,Escherichia coli O26:H11 str. 11368(573235.3)_LOCI,Escherichia coli O26:H11 str. 11368(573235.3)_PID,Escherichia coli O26:H11 str. 11368(573235.3)_SEQ,Escherichia coli O55:H7 str. CB9615(701177.3)_FIGFAM,Escherichia coli O55:H7 str. CB9615(701177.3)_LOCI,Escherichia coli O55:H7 str. CB9615(701177.3)_PID,Escherichia coli O55:H7 str. CB9615(701177.3)_SEQ,Escherichia coli O83:H1 str. NRG 857C(685038.3)_FIGFAM,Escherichia coli O83:H1 str. NRG 857C(685038.3)_LOCI,Escherichia coli O83:H1 str. NRG 857C(685038.3)_PID,Escherichia coli O83:H1 str. NRG 857C(685038.3)_SEQ,Escherichia coli S88(585035.6)_FIGFAM,Escherichia coli S88(585035.6)_LOCI,Escherichia coli S88(585035.6)_PID,Escherichia coli S88(585035.6)_SEQ,Escherichia coli SE11(409438.11)_FIGFAM,Escherichia coli SE11(409438.11)_LOCI,Escherichia coli SE11(409438.11)_PID,Escherichia coli SE11(409438.11)_SEQ,Escherichia coli SE15(431946.3)_FIGFAM,Escherichia coli SE15(431946.3)_LOCI,Escherichia coli SE15(431946.3)_PID,Escherichia coli SE15(431946.3)_SEQ,Escherichia coli SMS-3-5(439855.10)_FIGFAM,Escherichia coli SMS-3-5(439855.10)_LOCI,Escherichia coli SMS-3-5(439855.10)_PID,Escherichia coli SMS-3-5(439855.10)_SEQ,Escherichia coli UM146(869729.3)_FIGFAM,Escherichia coli UM146(869729.3)_LOCI,Escherichia coli UM146(869729.3)_PID,Escherichia coli UM146(869729.3)_SEQ,Escherichia coli UMN026(585056.7)_FIGFAM,Escherichia coli UMN026(585056.7)_LOCI,Escherichia coli UMN026(585056.7)_PID,Escherichia coli UMN026(585056.7)_SEQ,Escherichia coli UMNK88(696406.3)_FIGFAM,Escherichia coli 
UMNK88(696406.3)_LOCI,Escherichia coli UMNK88(696406.3)_PID,Escherichia coli UMNK88(696406.3)_SEQ,Escherichia coli UTI89(364106.7)_FIGFAM,Escherichia coli UTI89(364106.7)_LOCI,Escherichia coli UTI89(364106.7)_PID,Escherichia coli UTI89(364106.7)_SEQ,Escherichia coli W CP002185(566546.4)_FIGFAM,Escherichia coli W CP002185(566546.4)_LOCI,Escherichia coli W CP002185(566546.4)_PID,Escherichia coli W CP002185(566546.4)_SEQ,Escherichia coli W(566546.3)_FIGFAM,Escherichia coli W(566546.3)_LOCI,Escherichia coli W(566546.3)_PID,Escherichia coli W(566546.3)_SEQ,Escherichia coli str. K-12 substr. DH10B(316385.7)_FIGFAM,Escherichia coli str. K-12 substr. DH10B(316385.7)_LOCI,Escherichia coli str. K-12 substr. DH10B(316385.7)_PID,Escherichia coli str. K-12 substr. DH10B(316385.7)_SEQ,Escherichia coli str. K-12 substr. MG1655(511145.6)_FIGFAM,Escherichia coli str. K-12 substr. MG1655(511145.6)_LOCI,Escherichia coli str. K-12 substr. MG1655(511145.6)_PID,Escherichia coli str. K-12 substr. MG1655(511145.6)_SEQ,Escherichia coli str. K-12 substr. W3110(316407.9)_FIGFAM,Escherichia coli str. K-12 substr. W3110(316407.9)_LOCI,Escherichia coli str. K-12 substr. W3110(316407.9)_PID,Escherichia coli str. K-12 substr. W3110(316407.9)_SEQ,Shigella boydii CDC 3083-94(344609.11)_FIGFAM,Shigella boydii CDC 3083-94(344609.11)_LOCI,Shigella boydii CDC 3083-94(344609.11)_PID,Shigella boydii CDC 3083-94(344609.11)_SEQ,Shigella boydii Sb227(300268.10)_FIGFAM,Shigella boydii Sb227(300268.10)_LOCI,Shigella boydii Sb227(300268.10)_PID,Shigella boydii Sb227(300268.10)_SEQ,Shigella dysenteriae Sd197(300267.13)_FIGFAM,Shigella dysenteriae Sd197(300267.13)_LOCI,Shigella dysenteriae Sd197(300267.13)_PID,Shigella dysenteriae Sd197(300267.13)_SEQ,Shigella flexneri 2002017(591020.3)_FIGFAM,Shigella flexneri 2002017(591020.3)_LOCI,Shigella flexneri 2002017(591020.3)_PID,Shigella flexneri 2002017(591020.3)_SEQ,Shigella flexneri 2a str. 2457T(198215.6)_FIGFAM,Shigella flexneri 2a str. 
2457T(198215.6)_LOCI,Shigella flexneri 2a str. 2457T(198215.6)_PID,Shigella flexneri 2a str. 2457T(198215.6)_SEQ,Shigella flexneri 2a str. 301(198214.7)_FIGFAM,Shigella flexneri 2a str. 301(198214.7)_LOCI,Shigella flexneri 2a str. 301(198214.7)_PID,Shigella flexneri 2a str. 301(198214.7)_SEQ,Shigella flexneri 5 str. 8401(373384.11)_FIGFAM,Shigella flexneri 5 str. 8401(373384.11)_LOCI,Shigella flexneri 5 str. 8401(373384.11)_PID,Shigella flexneri 5 str. 8401(373384.11)_SEQ,Shigella sonnei Ss046(300269.11)_FIGFAM,Shigella sonnei Ss046(300269.11)_LOCI,Shigella sonnei Ss046(300269.11)_PID,Shigella sonnei Ss046(300269.11)_SEQ,Gene,E. coli Locus ID,Protein Function,Unnamed: 3
0,fig|216592.3.peg.14,EC042_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|362663.9.peg.14,ECP_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585055.6.peg.12,EC55989_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|655817.3.peg.15,ECABU_c00150,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|405955.13.peg.13,APECO1_1965,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|481805.6.peg.3896,EcolC_3642,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|413997.3.peg.12,ECB_00014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|469008.14.peg.3733,ECBD_3605,100.0,,fig|469008.4.peg.3733,ECBD_3605,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|469008.13.peg.3733,ECBD_3605,100.0,,fig|595496.3.peg.12,BWG_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|199310.4.peg.15,c0019,99.84,MGKIIGXDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|536056.4.peg.3793,EcDH1_3583,100,,fig|536056.3.peg.3793,EcDH1_3583,100,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|331111.12.peg.343,EcE24377A_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585397.7.peg.13,,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|316401.4.peg.13,ETEC_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|331112.3.peg.12,,99.84,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585034.4.peg.12,ECIAI1_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585057.6.peg.12,ECIAI39_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|714962.3.peg.13,ECOK1_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|595495.4.peg.1719,EKO11DRAFT_1572,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|591946.4.peg.12,LF82_0507,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|1033813.3.peg.4704,ECNA114_4657,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585395.4.peg.13,ECO103_0014,100.0,MGKIIGIDLGTTNSCVA
IMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585396.4.peg.12,ECO111_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|574521.7.peg.15,E2348C_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|155864.8.peg.12,,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|444450.8.peg.151,ECH74115_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|386585.9.peg.110,ECs0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|544404.4.peg.13,ECSP_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|573235.3.peg.12,ECO26_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|701177.3.peg.12,G2583_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|685038.3.peg.12,NRG857_00075,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585035.6.peg.13,ECS88_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|409438.11.peg.128,ECSE_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|431946.3.peg.12,ECSF_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|439855.10.peg.184,EcSMS35_0012,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|869729.3.peg.4909,,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|585056.7.peg.197,ECUMN_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|696406.3.peg.12,UMNK88_12,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|364106.7.peg.145,UTI89_C0016,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|566546.4.peg.1724,,100.0,MQKREPVIIAPDYTDDELYEWMRQKINAAQDLKWANEARAKQAENL...,fig|566546.3.peg.1724,,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|316385.7.peg.12,ECDH10B_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|511145.6.peg.12,b0014,100,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|316407.9.peg.12,,100,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|344609.11.peg.59,SbBS512_E0017,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fi
g|300268.10.peg.204,SBO_0015,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|300267.13.peg.13,SDY_0013,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,fig|591020.3.peg.14,SFxv_0013,99.84,MGKIIGIDLGTTNSCVAIMDGTIPRVLENAEGDRTTPSIIAYTQDG...,fig|198215.6.peg.14,S0014,99.84,MGKIIGIDLGTTNSCVAIMDGTIPRVLENAEGDRTTPSIIAYTQDG...,fig|198214.7.peg.13,,99.84,MGKIIGIDLGTTNSCVAIMDGTIPRVLENAEGDRTTPSIIAYTQDG...,fig|373384.11.peg.13,SFV_0012,99.84,MGKIIGIDLGTTNSCVAIMDGTIPRVLENAEGDRTTPSIIAYTQDG...,fig|300269.11.peg.15,SSON_0014,100.0,MGKIIGIDLGTTNSCVAIMDGTTPRVLENAEGDRTTPSIIAYTQDG...,dnaK,b0014,Chaperone Hsp70,"intersect(union(setdiff(ME,iJO), intersect(IE,..."
1,fig|216592.3.peg.15,EC042_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|362663.9.peg.15,ECP_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585055.6.peg.13,EC55989_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|655817.3.peg.16,ECABU_c00160,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|405955.13.peg.14,APECO1_1964,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|481805.6.peg.3895,EcolC_3641,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|413997.3.peg.13,ECB_00015,99.2,MAKQDYYEILGVSKTAEEHEIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|469008.14.peg.3732,ECBD_3604,99.2,,fig|469008.4.peg.3732,ECBD_3604,99.2,MAKQDYYEILGVSKTAEEHEIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|469008.13.peg.3732,ECBD_3604,99.2,,fig|595496.3.peg.13,BWG_0014,99.73,MVKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|199310.4.peg.16,c0020,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|536056.4.peg.3792,EcDH1_3582,100,,fig|536056.3.peg.3792,EcDH1_3582,100,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|331111.12.peg.344,EcE24377A_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585397.7.peg.14,,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|316401.4.peg.14,ETEC_0014,99.2,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEVK...,fig|331112.3.peg.14,,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585034.4.peg.13,ECIAI1_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585057.6.peg.13,ECIAI39_0014,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|714962.3.peg.14,ECOK1_0014,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|595495.4.peg.1718,EKO11DRAFT_1571,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|591946.4.peg.13,LF82_0506,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|1033813.3.peg.4705,ECNA114_4658,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585395.4.peg.14,ECO103_0015,99.47,MAKQDYYEILGVSKTAEEREIK
KAYKRLAMKYHPDRNQGDKEAEAK...,fig|585396.4.peg.13,ECO111_0014,98.14,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|574521.7.peg.16,E2348C_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|155864.8.peg.13,,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|444450.8.peg.152,ECH74115_0015,99.2,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAETK...,fig|386585.9.peg.111,ECs0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|544404.4.peg.14,ECSP_0015,99.2,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAETK...,fig|573235.3.peg.13,ECO26_0014,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|701177.3.peg.13,G2583_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|685038.3.peg.13,NRG857_00080,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585035.6.peg.14,ECS88_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|409438.11.peg.129,ECSE_0014,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|431946.3.peg.13,ECSF_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|439855.10.peg.185,EcSMS35_0013,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|869729.3.peg.4910,,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|585056.7.peg.198,ECUMN_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|696406.3.peg.13,UMNK88_13,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|364106.7.peg.146,UTI89_C0017,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|566546.4.peg.1723,,99.47,MDFDTIMEKAYEEYFEGLAEGEEALSFSEFKQALSSSAKSNG,fig|566546.3.peg.1723,,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|316385.7.peg.13,ECDH10B_0015,100.0,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|511145.6.peg.13,b0015,100,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|316407.9.peg.13,,100,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|344609.11.peg.60,SbBS512_E0018,99.2,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|300268.10.peg
.205,SBO_0016,99.2,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|300267.13.peg.14,SDY_0014,99.2,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,fig|591020.3.peg.15,SFxv_0014,99.73,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|198215.6.peg.15,S0015,99.73,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|198214.7.peg.14,,97.87,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|373384.11.peg.14,SFV_0013,99.73,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,fig|300269.11.peg.16,SSON_0015,99.47,MAKQDYYEILGVSKTAEEREIKKAYKRLAMKYHPDRNQGDKEAEAK...,dnaJ,b0015,Hsp70 cochaperone,these are the genes in the model that are not ...
2,fig|216592.3.peg.27,EC042_0026,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|362663.9.peg.22,ECP_0022,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|585055.6.peg.21,EC55989_0022,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|655817.3.peg.23,ECABU_c00230,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|405955.13.peg.21,,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|481805.6.peg.3883,EcolC_3631,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|413997.3.peg.24,ECB_00027,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|469008.14.peg.3721,ECBD_3593,100.0,,fig|469008.4.peg.3721,ECBD_3593,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|469008.13.peg.3721,ECBD_3593,100.0,,fig|595496.3.peg.20,BWG_0021,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|199310.4.peg.23,c0027,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|536056.4.peg.3785,EcDH1_3575,100,,fig|536056.3.peg.3785,EcDH1_3575,100,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|331111.12.peg.351,EcE24377A_0023,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|585397.7.peg.21,,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|316401.4.peg.24,ETEC_0023,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|331112.3.peg.22,,100.0,MANIKSAKKRAIQSEKARKHNASRRSMMRTFIKKVYAAIEAGDKAA...,fig|585034.4.peg.21,ECIAI1_0024,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|585057.6.peg.25,ECIAI39_0025,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|714962.3.peg.20,ECOK1_0020,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|595495.4.peg.1707,EKO11DRAFT_1563,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|591946.4.peg.20,LF82_1987,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|1033813.3.peg.8,ECNA114_0008,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|585395.4.peg.20,ECO103_0024,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPI
VDRQAAKGLIHKNKA...,fig|585396.4.peg.20,ECO111_0023,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|574521.7.peg.23,E2348C_0023,100.0,MANIKSAKKRAIQSEKARKHNASRRSMMRTFIKKVYAAIEAGDKAA...,fig|155864.8.peg.22,,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|444450.8.peg.163,ECH74115_0026,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|386585.9.peg.121,ECs0026,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|544404.4.peg.25,ECSP_0024,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|573235.3.peg.20,ECO26_0023,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|701177.3.peg.22,G2583_0024,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|685038.3.peg.20,NRG857_00115,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|585035.6.peg.21,ECS88_0022,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|409438.11.peg.136,ECSE_0021,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|431946.3.peg.21,ECSF_0024,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|439855.10.peg.196,EcSMS35_0022,100.0,MANIKSAKKRAIQSEKARKHNASRRSMMRTFIKKVYAAIEAGDKAA...,fig|869729.3.peg.4917,,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|585056.7.peg.207,ECUMN_0023,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|696406.3.peg.20,UMNK88_22,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|364106.7.peg.153,UTI89_C0025,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|566546.4.peg.1714,,100.0,MIKETVTMSHKELHRLQIIQEQAAARIGISIRQVKRLVQRYRNEGP...,fig|566546.3.peg.1714,,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|316385.7.peg.20,ECDH10B_0024,98.36,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRXAAKGLIHKNKA...,fig|511145.6.peg.20,b0023,100,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|316407.9.peg.19,,100,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|344609.11.peg.71,SbBS512_E0027,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|300268.10.pe
g.213,SBO_0022,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|300267.13.peg.49,SDY_0045,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|591020.3.peg.23,SFxv_0021,95.56,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIPKTKL...,fig|198215.6.peg.23,S0022,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|198214.7.peg.22,,95.08,MMRPFIKKGDAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|373384.11.peg.22,SFV_0018,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,fig|300269.11.peg.30,SSON_0028,100.0,MMRTFIKKVYAAIEAGDKAAAQKAFNEMQPIVDRQAAKGLIHKNKA...,rpsT,b0023,30S ribosomal protein S20,
3,fig|216592.3.peg.30,EC042_0028,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|362663.9.peg.25,ECP_0024,99.15,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585055.6.peg.24,EC55989_0025,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|655817.3.peg.26,ECABU_c00260,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|405955.13.peg.24,APECO1_1957,99.15,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|481805.6.peg.3880,EcolC_3629,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|413997.3.peg.27,ECB_00030,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|469008.14.peg.3718,ECBD_3590,99.68,,fig|469008.4.peg.3718,ECBD_3590,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|469008.13.peg.3718,ECBD_3590,99.68,,fig|595496.3.peg.23,BWG_0024,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|199310.4.peg.26,c0030,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|536056.4.peg.3782,EcDH1_3573,100,,fig|536056.3.peg.3782,EcDH1_3573,100,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|331111.12.peg.354,EcE24377A_0026,99.89,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585397.7.peg.24,,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|316401.4.peg.27,ETEC_0026,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|331112.3.peg.24,,99.89,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585034.4.peg.24,ECIAI1_0027,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585057.6.peg.28,ECIAI39_0027,99.79,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|714962.3.peg.22,ECOK1_0022,99.15,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|595495.4.peg.1704,EKO11DRAFT_1561,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|591946.4.peg.23,LF82_1096,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|1033813.3.peg.10,ECNA114_0010,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585395.4.peg.22,ECO103_0027,100.0,MSDYKSTLNLPETGFPMRG
DLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585396.4.peg.23,ECO111_0026,99.79,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|574521.7.peg.26,E2348C_0026,99.13,MRGDLAKREPGMLARWTDDDLYGIIRAAKKGKKTFILHDGPPYANG...,fig|155864.8.peg.25,,99.36,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|444450.8.peg.166,ECH74115_0028,99.47,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|386585.9.peg.124,ECs0029,99.47,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|544404.4.peg.28,ECSP_0027,99.47,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|573235.3.peg.23,ECO26_0026,99.89,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|701177.3.peg.25,G2583_0027,99.57,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|685038.3.peg.23,NRG857_00125,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585035.6.peg.24,ECS88_0025,99.15,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|409438.11.peg.139,ECSE_0024,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|431946.3.peg.24,ECSF_0027,99.25,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|439855.10.peg.199,EcSMS35_0024,99.79,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|869729.3.peg.4920,,99.15,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|585056.7.peg.210,ECUMN_0026,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|696406.3.peg.22,UMNK88_24,99.57,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|364106.7.peg.156,UTI89_C0028,99.15,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|566546.4.peg.1711,,100.0,MAGILNAPALNPLRRVFVFIFNAFEVLDGAGIESKILK,fig|566546.3.peg.1711,,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|316385.7.peg.23,ECDH10B_0027,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|511145.6.peg.23,b0026,100,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|316407.9.peg.21,,100,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|344609.11.peg.74,SbBS512_E0030,99.79,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|300268.10.p
eg.216,SBO_0025,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|300267.13.peg.52,SDY_0048,99.57,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|591020.3.peg.26,SFxv_0024,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|198215.6.peg.26,S0025,99.57,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|198214.7.peg.25,,99.68,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|373384.11.peg.25,SFV_0020,99.79,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,fig|300269.11.peg.33,SSON_0031,100.0,MSDYKSTLNLPETGFPMRGDLAKREPGMLARWTDDDLYGIIRAAKK...,ileS,b0026,Isoleucyl-tRNA synthase,
4,fig|216592.3.peg.61,EC042_0055,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|362663.9.peg.56,ECP_0053,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585055.6.peg.53,EC55989_0051,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|655817.3.peg.59,ECABU_c00590,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|405955.13.peg.59,APECO1_1931,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|481805.6.peg.3849,EcolC_3604,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|413997.3.peg.58,ECB_00055,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|469008.14.peg.3686,ECBD_3564,100.0,,fig|469008.4.peg.3686,ECBD_3564,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|469008.13.peg.3686,ECBD_3564,100.0,,fig|595496.3.peg.53,BWG_0049,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|199310.4.peg.60,c0064,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|536056.4.peg.3752,EcDH1_3548,100,,fig|536056.3.peg.3752,EcDH1_3548,100,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|331111.12.peg.384,EcE24377A_0055,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585397.7.peg.55,,99.27,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|316401.4.peg.57,ETEC_0051,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|331112.3.peg.53,,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585034.4.peg.53,ECIAI1_0053,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585057.6.peg.58,ECIAI39_0054,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|714962.3.peg.57,ECOK1_0052,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|595495.4.peg.1675,EKO11DRAFT_1535,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|591946.4.peg.53,LF82_1163,99.27,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|1033813.3.peg.41,ECNA114_0041,99.27,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585395.4.peg.53,ECO103_0054,100.0,MNNRVHQGHLARKRFGQNF
LNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585396.4.peg.54,ECO111_0055,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|574521.7.peg.55,E2348C_0054,99.2,MIDSIVSAINPQKGQAMVEIGPGLAALTEPVGERLDQLTVIELDRD...,fig|155864.8.peg.56,,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|444450.8.peg.197,ECH74115_0057,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|386585.9.peg.155,ECs0056,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|544404.4.peg.59,ECSP_0056,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|573235.3.peg.55,ECO26_0055,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|701177.3.peg.56,G2583_0055,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|685038.3.peg.53,NRG857_00275,99.27,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585035.6.peg.59,ECS88_0056,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|409438.11.peg.169,ECSE_0052,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|431946.3.peg.56,ECSF_0058,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|439855.10.peg.232,EcSMS35_0055,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|869729.3.peg.4957,,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|585056.7.peg.240,ECUMN_0053,99.27,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|696406.3.peg.49,UMNK88_51,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|364106.7.peg.191,UTI89_C0058,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|566546.4.peg.1682,,100.0,MQDIRQETLNECTRAEQSASVVLWEIDLTEVGGERYFFCNEQNEKG...,fig|566546.3.peg.1682,,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|316385.7.peg.53,ECDH10B_0052,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|511145.6.peg.53,b0051,100,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|316407.9.peg.47,,100,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|344609.11.peg.93,SbBS512_E0045,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|3
00268.10.peg.234,SBO_0040,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|300267.13.peg.86,SDY_0076,98.8,MIDSIVSAINPQKGQAMVEIGPGLAALTEPVGERLDQLTVIELDRD...,fig|591020.3.peg.57,SFxv_0050,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|198215.6.peg.57,S0050,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|198214.7.peg.57,,99.63,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|373384.11.peg.56,SFV_0045,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,fig|300269.11.peg.67,SSON_0059,100.0,MNNRVHQGHLARKRFGQNFLNDQFVIDSIVSAINPQKGQAMVEIGP...,ksgA,b0051,Dimethyladenosine transferase,


### Functions to map between strain/model name, and strain genes to k12 genes

In [12]:
def strain_name_to_model_name(strain_name):
    '''
    Look up the model name recorded for a strain in monk_supplement_t1.

    Selects the rows whose 'Strain' column equals strain_name and returns the first
    distinct 'Model Name' value, passed through de_unicodeify.
    Raises IndexError when no row matches.
    '''
    is_strain = monk_supplement_t1['Strain'] == strain_name
    model_names = monk_supplement_t1[is_strain]['Model Name'].unique()
    return de_unicodeify(model_names[0])

def model_name_to_strain_name(model_name):
    '''
    Look up the strain name recorded for a model in monk_supplement_t1.

    Selects the rows whose 'Model Name' column equals model_name and returns the first
    distinct 'Strain' value, passed through de_unicodeify.
    Raises IndexError when no row matches.
    '''
    is_model = monk_supplement_t1['Model Name'] == model_name
    strain_names = monk_supplement_t1[is_model]['Strain'].unique()
    return de_unicodeify(strain_names[0])

def get_k12_locus_new_genes(strain_name, homolog_gene_name):
    '''
    Look up a homolog locus in monk_supplement_t8.

    Selects the rows where 'Organism' equals strain_name and 'Homolog gene' equals
    homolog_gene_name, and returns the first distinct 'Homolog locus' value, passed
    through de_unicodeify. Raises IndexError when no row matches.
    '''
    same_organism = monk_supplement_t8['Organism'] == strain_name
    same_gene = monk_supplement_t8['Homolog gene'] == homolog_gene_name
    homolog_loci = monk_supplement_t8[same_organism & same_gene]['Homolog locus'].unique()
    return de_unicodeify(homolog_loci[0])

def get_k12_locus(strain_name, strain_locus):
    '''
    Map a strain locus to the index label of its row in monk_gene_consv.

    strain_name selects the column of monk_gene_consv; the index label of the first
    row whose value in that column equals strain_locus is returned, passed through
    de_unicodeify. Raises IndexError when no row matches.
    '''
    matching_rows = monk_gene_consv[monk_gene_consv[strain_name] == strain_locus]
    k12_loci = matching_rows.index.unique().tolist()
    return de_unicodeify(k12_loci[0])

def gene_to_k12_homolog(model_name, locus):
    '''
    Convenience wrapper: translate model_name to its strain name with
    model_name_to_strain_name, then resolve locus with get_k12_locus.
    Any exception raised by either lookup propagates to the caller.
    '''
    return get_k12_locus(model_name_to_strain_name(model_name), locus)

# Writing FASTA files for alignments

In [14]:
# Parent folder that holds one sub-folder per strain model (see strain_folder_locations).
# ROOT_DIR is not defined in this section - presumably set by the earlier %run of ssbio_01.ipynb; TODO confirm.
FF_STRAINS = os.path.join(ROOT_DIR, '55_STRAINS')

In [13]:
# Model names of the 54 strains to process. The order matters: it is the order in
# which the strains are looped over and in which the per-strain columns are added
# to strain_alignment_mapping_df.
strains = [
    'iAPECO1_1312', 'iB21_1397', 'iBWG_1329', 'ic_1306', 'iE2348C_1286', 'iEC042_1314',
    'iEC55989_1330', 'iECABU_c1320', 'iECB_1328', 'iECBD_1354', 'iECD_1391', 'iECDH10B_1368',
    'iEcDH1_1363', 'iECDH1ME8569_1439', 'iEcE24377_1341', 'iECED1_1282', 'iECH74115_1262', 'iEcHS_1320',
    'iECIAI1_1343', 'iECIAI39_1322', 'iECNA114_1301', 'iECO103_1326', 'iECO111_1330', 'iECO26_1355',
    'iECOK1_1307', 'iEcolC_1368', 'iECP_1309', 'iECs_1301', 'iECS88_1305', 'iECSE_1348',
    'iECSF_1327', 'iEcSMS35_1347', 'iECSP_1301', 'iECUMN_1333', 'iECW_1372', 'iEKO11_1354',
    'iETEC_1333', 'iG2583_1286', 'iLF82_1304', 'iNRG857_1313', 'iS_1188', 'iSbBS512_1146',
    'iSBO_1134', 'iSDY_1059', 'iSF_1195', 'iSFV_1184', 'iSFxv_1172', 'iSSON_1240',
    'iUMN146_1321', 'iUMNK88_1353', 'iUTI89_1310', 'iWFL_1372', 'iY75_1357', 'iZ_1308',
]

In [31]:
def strain_folder_locations(strain):
    '''
    Build the folder paths used for one strain model (nothing is created on disk here).

    Returns a four-element list, in the order callers unpack it:
        [strain root (FF_STRAINS/<strain>),
         data_frames              - where the strain's data frames are stored,
         model_files              - where GEMs and GEM-related files are stored,
         sequence_files/alignment - where alignment files are stored]
    The returned paths carry no trailing separator.
    '''
    strain_root = os.path.join(FF_STRAINS, strain)
    # sub-folders of the strain root, listed in return order
    subfolders = ['data_frames', 'model_files', 'sequence_files/alignment']
    return [strain_root] + [os.path.join(strain_root, sub) for sub in subfolders]

<span style="background-color:#66FF99">**Write one FASTA file per gene for every strain model (uses `write_fasta_file`)**</span>

In [None]:
for strain in strains:

    # make folder structure
    [STRAIN_DIR, STRAIN_DATA_FRAMES, STRAIN_MODEL_FILES, STRAIN_SEQ_ALIGN_FILES] = strain_folder_locations(strain)
    for directory in [STRAIN_DIR, STRAIN_DATA_FRAMES, STRAIN_MODEL_FILES, STRAIN_SEQ_ALIGN_FILES]:
        if not os.path.exists(directory):
            os.makedirs(directory)

    # load the strain specific model
    # (os.path.join is required: STRAIN_DATA_FRAMES has no trailing separator, so plain
    #  string concatenation pointed at '<strain>/data_framesDF_01_...csv')
    DF_01_STRAIN = pd.read_csv(os.path.join(STRAIN_DATA_FRAMES, 'DF_01_RXN_GENE_UNIPROT_PDB.csv'), index_col=0)

    ## write the fasta file for each gene in this model
    # distinct (gene id, sequence) pairs, so each pair is handled once
    gene_id_and_sequence = list(set(zip(DF_01_STRAIN.m_gene, DF_01_STRAIN.u_seq)))
    # NOTE(review): STRAIN_SEQ_UNIPROT_FILES and strain_written_fasta_files are not defined
    # anywhere in this section and are not returned by strain_folder_locations - they must
    # exist in the kernel before this cell runs; confirm where they come from.
    os.chdir(STRAIN_SEQ_UNIPROT_FILES)

    for m_gene, u_seq in gene_id_and_sequence:
        # genes without a sequence cannot be written
        if pd.isnull(u_seq):
            continue

        # skip genes whose FASTA file is already listed as written
        if m_gene + '.faa' not in strain_written_fasta_files:
            uniprot_seq = SeqRecord(Seq(u_seq, IUPAC.protein),id=m_gene,description='uniprot sequence')
            write_fasta_file(uniprot_seq, m_gene)

# Running alignments

### Additional functions

In [21]:
# loading dataframes that note where PDB starts are to adjust alignment files
DF_03A_PDB_STARTS = pd.read_csv(os.path.join(DATA_FRAMES, 'DF_03A_PDB_STARTS.csv'), index_col = 0)
# lookup table: (pdb_id, chain_id) -> start_res
PDB_CHAINS_STARTS = DF_03A_PDB_STARTS.set_index(['pdb_id','chain_id'])['start_res'].to_dict()

# SIFTS re-indexed (and sorted) by PDB / CHAIN / SP_PRIMARY.
# sort_values replaces the removed DataFrame.sort and matches the call used further down.
SIFTS_INDEXED = SIFTS.reset_index().sort_values(by=['PDB','CHAIN','SP_PRIMARY']).set_index(['PDB','CHAIN','SP_PRIMARY'])
SIFTS_INDEXED_LIST = SIFTS_INDEXED.index.tolist()

# function to adjust the alignment files
def add_pdb_resnums_to_alignment_df(alignment_df, pdb_id, k12_uniprot_id):
    '''
    Add PDB residue numbering (columns id_b_start / id_b_stop) to an alignment table.

    Input:  alignment_df   - alignment table with the columns 'chain', 'type', 'count',
                             'id_a_start' and 'id_a_stop'; it is not modified
            pdb_id         - ID of the aligned structure. IDs ending in '_modified_min' or
                             '_model1' are numbered from residue 1; for any other ID the first
                             residue of each chain is read from PDB_CHAINS_STARTS under the
                             key (pdb_id.lower(), chain)
            k12_uniprot_id - the associated uniprot ID. Kept for interface compatibility only:
                             it is not used, so no chain is filtered out by uniprot ID
    Output: a new dataframe with the rows grouped chain by chain (in order of first
            appearance) and the float columns id_b_start / id_b_stop added. Numbering runs
            consecutively within a chain: an 'insertion' row covers row['count'] residues, a
            'deletion' row keeps NaN and does not advance the numbering, every other row covers
            as many residues as its id_a_start..id_a_stop range.
            If no rows were numbered, an unmodified copy of alignment_df is returned.
    Raises: KeyError if (pdb_id.lower(), chain) is missing from PDB_CHAINS_STARTS.
    '''
    uses_own_numbering = pdb_id.endswith('_modified_min') or pdb_id.endswith('_model1')
    numbered_chains = []

    for chain in alignment_df.chain.unique():
        # .copy() gives an independent frame, so adding columns neither touches the
        # caller's data nor triggers pandas' SettingWithCopyWarning
        subset = alignment_df[alignment_df.chain == chain].copy()

        if uses_own_numbering:
            pdb_start = 1
        else:
            pdb_start = PDB_CHAINS_STARTS[(pdb_id.lower(), chain)]

        starts = []
        stops = []
        for _, row in subset.iterrows():
            if row['type'] == 'deletion':
                # no residues on the structure side: leave blank, keep the counter
                starts.append(np.nan)
                stops.append(np.nan)
                continue

            if row['type'] == 'insertion':
                span = row['count'] - 1
            else:
                span = row['id_a_stop'] - row['id_a_start']

            pdb_stop = pdb_start + span
            starts.append(pdb_start)
            stops.append(pdb_stop)
            pdb_start = pdb_stop + 1

        # positional assignment; float dtype so NaN and numbers share one column type
        subset['id_b_start'] = np.asarray(starts, dtype=float)
        subset['id_b_stop'] = np.asarray(stops, dtype=float)
        numbered_chains.append(subset)

    # a single concat instead of growing a dataframe with append() inside the loop
    if numbered_chains:
        numbered = pd.concat(numbered_chains)
        if len(numbered) > 0:
            return numbered

    return alignment_df.copy()



### Getting the "best" PDBs to align to

In [15]:
# getting the "best" files
# one row per distinct (gene, uniprot accession, best structure file) combination,
# ordered by structure file name; sort_values replaces the removed DataFrame.sort
iJO_best_files = iJO1366_GEMPRO_min[['m_gene','u_uniprot_acc','ssb_best_file']].drop_duplicates().reset_index(drop = True).sort_values(by='ssb_best_file')
# keep only genes that actually have a best structure file
iJO_best_files = iJO_best_files[pd.notnull(iJO_best_files.ssb_best_file)]
iJO_best_files.head()



Unnamed: 0,m_gene,u_uniprot_acc,ssb_best_file
265,b3744,P00963,12as_modified_min.pdb
1148,b0854,P31133,1a99.pdb
578,b4079,P07658,1aa6.pdb
235,b1901,P02924,1abf.pdb
408,b2143,P0ABF6,1af2.pdb


### Running all alignments for all strains

In [65]:
# Spot check: show the monk_gene_consv row(s) whose index label is 'b0002'
is_b0002 = monk_gene_consv.index == 'b0002'
monk_gene_consv.loc[is_b0002]

Unnamed: 0,Escherichia coli 042,Escherichia coli 536,Escherichia coli 55989,Escherichia coli ABU 83972,Escherichia coli APEC O1,Escherichia coli ATCC 8739,Escherichia coli B str. REL606,Escherichia coli BL21 DE3 AM946981,Escherichia coli BL21 DE3 BL21-Gold DE3 pLysS AG,Escherichia coli BL21 DE3 CP001509,Escherichia coli BW2952,Escherichia coli CFT073,Escherichia coli DH1 ME8569,Escherichia coli DH1,Escherichia coli E24377A,Escherichia coli ED1a,Escherichia coli ETEC H10407,Escherichia coli HS,Escherichia coli IAI1,Escherichia coli IAI39,Escherichia coli IHE3034,Escherichia coli KO11FL,Escherichia coli LF82,Escherichia coli NA114,Escherichia coli O103:H2 str. 12009,Escherichia coli O111:H- str. 11128,Escherichia coli O127:H6 str. E2348/69,Escherichia coli O157:H7 EDL933,Escherichia coli O157:H7 str. EC4115,Escherichia coli O157:H7 str. Sakai,Escherichia coli O157:H7 str. TW14359,Escherichia coli O26:H11 str. 11368,Escherichia coli O55:H7 str. CB9615,Escherichia coli O83:H1 str. NRG 857C,Escherichia coli S88,Escherichia coli SE11,Escherichia coli SE15,Escherichia coli SMS-3-5,Escherichia coli UM146,Escherichia coli UMN026,Escherichia coli UMNK88,Escherichia coli UTI89,Escherichia coli W CP002185,Escherichia coli W,Escherichia coli str. K-12 substr. DH10B,Escherichia coli str. K-12 substr. MG1655,Escherichia coli str. K-12 substr. W3110,Shigella boydii CDC 3083-94,Shigella boydii Sb227,Shigella dysenteriae Sd197,Shigella flexneri 2002017,Shigella flexneri 2a str. 2457T,Shigella flexneri 2a str. 301,Shigella flexneri 5 str. 8401,Shigella sonnei Ss046
b0002,EC042_0001,ECP_0002,EC55989_0002,ECABU_c00010,APECO1_1976,EcolC_3653,ECB_00002,ECBD_3616,ECBD_3616,ECBD_3616,BWG_0002,c0003,EcDH1_3594,EcDH1_3594,EcE24377A_0001,,ETEC_0002,,ECIAI1_0002,ECIAI39_0001,ECOK1_0002,EKO11DRAFT_3995,LF82_2259,ECNA114_4646,ECO103_0002,ECO111_0002,E2348C_0002,,ECH74115_0003,ECs0002,ECSP_0002,ECO26_0002,G2583_0002,NRG857_00015,ECS88_0002,ECSE_0002,ECSF_0002,EcSMS35_0001,,ECUMN_0002,UMNK88_1,UTI89_C0002,EschWDRAFT_3337,EschWDRAFT_3337,ECDH10B_0002,b0002,,SbBS512_E0002,SBO_0001,SDY_0002,SFxv_0001,S0002,,SFV_0001,SSON_0002


In [18]:
# Build the mapping dataframe: the rows of iJO_best_files plus one column per strain
import copy

strain_alignment_mapping_df = copy.copy(iJO_best_files)

# every strain column starts out empty (NaN); the alignment loop fills in file names
for strain in strains:
    strain_alignment_mapping_df[strain] = np.nan

# order by gene and renumber the rows from 0
strain_alignment_mapping_df = strain_alignment_mapping_df.sort_values(by='m_gene')
strain_alignment_mapping_df = strain_alignment_mapping_df.reset_index(drop=True)
strain_alignment_mapping_df.head()

Unnamed: 0,m_gene,u_uniprot_acc,ssb_best_file,iAPECO1_1312,iB21_1397,iBWG_1329,ic_1306,iE2348C_1286,iEC042_1314,iEC55989_1330,iECABU_c1320,iECB_1328,iECBD_1354,iECD_1391,iECDH10B_1368,iEcDH1_1363,iECDH1ME8569_1439,iEcE24377_1341,iECED1_1282,iECH74115_1262,iEcHS_1320,iECIAI1_1343,iECIAI39_1322,iECNA114_1301,iECO103_1326,iECO111_1330,iECO26_1355,iECOK1_1307,iEcolC_1368,iECP_1309,iECs_1301,iECS88_1305,iECSE_1348,iECSF_1327,iEcSMS35_1347,iECSP_1301,iECUMN_1333,iECW_1372,iEKO11_1354,iETEC_1333,iG2583_1286,iLF82_1304,iNRG857_1313,iS_1188,iSbBS512_1146,iSBO_1134,iSDY_1059,iSF_1195,iSFV_1184,iSFxv_1172,iSSON_1240,iUMN146_1321,iUMNK88_1353,iUTI89_1310,iWFL_1372,iY75_1357,iZ_1308
0,b0002,P00561,AK1H_ECOLI_model1.pdb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,b0003,P00547,KHSE_ECOLI_model1.pdb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,b0004,P00934,1vb3.pdb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,b0007,P30143,YAAJ_ECOLI_model1.pdb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,b0008,P0A870,1onr.pdb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# now doing all the alignments (takes a while!)
#
# For every strain and every gene of that strain that has a sequence:
#   1. find the K12 homolog of the strain gene
#   2. align the K12 uniprot sequence to the strain gene sequence
#   3. turn that alignment into a table (csv)
#   4. align the strain gene sequence to the sequence of the K12 gene's best structure
#   5. turn that alignment into a table (csv)
#   6. add PDB residue numbers and record the resulting file name in strain_alignment_mapping_df

# columns dropped before de-duplicating the strain dataframe
droplist_strains = ['m_reaction','m_gene_reaction_rule','m_metabolites']

# strain model name -> list of (uniprot or structure ID, strain gene) pairs whose alignment failed
uniprot_strain_aln_errors = defaultdict(list)

for strain in strains:
    [STRAIN_DIR, STRAIN_DATA_FRAMES, STRAIN_MODEL_FILES, STRAIN_SEQ_ALIGN_FILES] = strain_folder_locations(strain)

    # os.path.join is required throughout this cell: the folder paths have no trailing
    # separator, so plain string concatenation produced paths such as '.../data_framesDF_01_...'
    DF_01_STRAIN = pd.read_csv(os.path.join(STRAIN_DATA_FRAMES, 'DF_01_RXN_GENE_UNIPROT_PDB.csv'), index_col=0)
    DF_01_STRAIN_NOTNULL = DF_01_STRAIN[pd.notnull(DF_01_STRAIN.u_seq)].drop(droplist_strains,axis=1).drop_duplicates().sort_index()

    # now for every gene in our new strain..
    for s_gene in DF_01_STRAIN_NOTNULL.m_gene.unique():
        # get the homolog of this gene to k12
        print('1: GETTING GENE HOMOLOG')
        # "except Exception" (not a bare except) so that Ctrl-C still interrupts this long loop
        try:
            k12_gene = gene_to_k12_homolog(strain, s_gene)
        except Exception:
            try:
                k12_gene = get_k12_locus_new_genes(model_name_to_strain_name(strain), s_gene)
            except Exception:
                print("SKIPPING %s" % s_gene)
                continue

        # if this k12 gene is not in our k12 dataframe, ignore it
        if k12_gene not in iJO1366_GEMPRO_min.m_gene.values:
            print('STOP: NO GENE HOMOLOG')
            continue

        # otherwise, continue with alignment..
        # get the sequence file for that gene (this will be used in alignment to the PDB of K12)
        # NOTE(review): STRAIN_SEQ_UNIPROT_FILES is not defined in this section and is not
        # returned by strain_folder_locations - confirm where it is set for each strain.
        s_gene_fasta_path = os.path.join(STRAIN_SEQ_UNIPROT_FILES, s_gene + '.faa')

        # get the sequence file for the k12 gene
        k12_uniprot_id = iJO1366_GEMPRO_min[iJO1366_GEMPRO_min.m_gene == k12_gene].u_uniprot_acc.unique().tolist()[0]
        k12_uniprot_fasta_path = os.path.join(SEQ_UNIPROT_FILES, k12_uniprot_id + '.faa')

        # the file names used below without a folder are relative to this working directory
        os.chdir(STRAIN_SEQ_ALIGN_FILES)
        # run the alignment (if it doesn't exist)
        print('2: ALIGNING GENE SEQUENCES')
        alignment_filename = "%s_%s_align.txt" % (k12_uniprot_id, s_gene)
        if not os.path.exists(os.path.join(STRAIN_SEQ_ALIGN_FILES, alignment_filename)):
            try:
                alignment_filename = run_alignment(k12_uniprot_id, k12_uniprot_fasta_path, s_gene, s_gene_fasta_path)
            except Exception:
                uniprot_strain_aln_errors[strain].append((k12_uniprot_id,s_gene))
                warnings.warn('***ERROR: with alignment of %s and %s***' % (k12_uniprot_id, s_gene))
                continue

        print('3: MAKING ALIGNMENT DF')
        alignment_df_path = '%s_table.csv' % alignment_filename.split('.')[0]
        if not os.path.exists(os.path.join(STRAIN_SEQ_ALIGN_FILES, alignment_df_path)):
            alignment_df = get_alignment_df(alignment_filename)
            alignment_df.to_csv(alignment_df_path)
        else:
            alignment_df = pd.read_csv(alignment_df_path, index_col=0)

        # now we want to align to whatever best PDB there was for the given k12 gene
        subset_best = iJO1366_GEMPRO_min[iJO1366_GEMPRO_min.m_gene == k12_gene]
        best_structure = subset_best[pd.notnull(subset_best.ssb_best_file)].ssb_best_file.unique().tolist()
        # exactly one best structure is expected; anything else is reported and skipped
        if len(best_structure) != 1:
            warnings.warn('%d best structure for gene %s???' % (len(best_structure), k12_gene))
            continue
        best_structure = best_structure[0]
        best_structure_id = best_structure.split('.')[0]
        best_structure_path = os.path.join(SEQ_BEST_FILES, best_structure_id + '.faa')

        print('4: ALIGNING TO BEST STRUCTURE')
        pdb_alignment_filename = '%s_%s_align.txt' % (s_gene, best_structure_id)
        if not os.path.exists(os.path.join(STRAIN_SEQ_ALIGN_FILES, pdb_alignment_filename)):
            try:
                pdb_alignment_filename = run_alignment(s_gene, s_gene_fasta_path, best_structure_id, best_structure_path)
            except Exception:
                uniprot_strain_aln_errors[strain].append((best_structure_id,s_gene))
                warnings.warn('***ERROR: with alignment of %s and %s***' % (best_structure_id, s_gene))
                continue

        print('5: MAKING ALIGNMENT DF')
        pdb_alignment_plus_start_stop_df_path = '%s_table.csv' % pdb_alignment_filename.split('.')[0]
        pdb_alignment_df = get_alignment_df(pdb_alignment_filename)
        pdb_alignment_df.to_csv(pdb_alignment_plus_start_stop_df_path)

        print('6: ADDING PDB RESNUMS')
        pdb_alignment_with_pdb_resnums_path = pdb_alignment_plus_start_stop_df_path.split('.')[0] + '_corrected.csv'
        # the chain is the part of id_b after the first '.'; expose it as its own 'chain' column
        separate_chain_df = pdb_alignment_df.join(pdb_alignment_df['id_b'].apply(lambda x: pd.Series(x.split('.')[1])))
        separate_chain_df = separate_chain_df.rename(columns={0:'chain'})
        pdb_alignment_with_pdb_resnums_df = add_pdb_resnums_to_alignment_df(separate_chain_df, best_structure_id, k12_uniprot_id)
        pdb_alignment_with_pdb_resnums_df.to_csv(os.path.join(STRAIN_SEQ_ALIGN_FILES, pdb_alignment_with_pdb_resnums_path))
        # record the corrected table's file name for this K12 gene / strain pair
        strain_alignment_mapping_df.loc[strain_alignment_mapping_df.m_gene == k12_gene, strain] = pdb_alignment_with_pdb_resnums_path

In [None]:
# checking if there were any alignment errors
# (maps each strain model name to the (uniprot or structure ID, strain gene) pairs whose alignment raised)
uniprot_strain_aln_errors

In [None]:
# Save the gene x strain table of corrected alignment file names
# (the closing parenthesis of to_csv(...) was missing, which made this cell a SyntaxError)
strain_alignment_mapping_df.to_csv(os.path.join(FF_STRAINS, 'DF_ALIGNMENTS.csv'))

In [22]:
# Reload the saved mapping table from disk (this replaces the in-memory dataframe)
alignments_csv_path = os.path.join(FF_STRAINS, 'DF_ALIGNMENTS.csv')
strain_alignment_mapping_df = pd.read_csv(alignments_csv_path, index_col=0)
strain_alignment_mapping_df.head()

Unnamed: 0,m_gene,u_uniprot_acc,ssb_best_file,iAPECO1_1312,iB21_1397,iBWG_1329,ic_1306,iE2348C_1286,iEC042_1314,iEC55989_1330,iECABU_c1320,iECB_1328,iECBD_1354,iECD_1391,iECDH10B_1368,iEcDH1_1363,iECDH1ME8569_1439,iEcE24377_1341,iECED1_1282,iECH74115_1262,iEcHS_1320,iECIAI1_1343,iECIAI39_1322,iECNA114_1301,iECO103_1326,iECO111_1330,iECO26_1355,iECOK1_1307,iEcolC_1368,iECP_1309,iECs_1301,iECS88_1305,iECSE_1348,iECSF_1327,iEcSMS35_1347,iECSP_1301,iECUMN_1333,iECW_1372,iEKO11_1354,iETEC_1333,iG2583_1286,iJO1366,iLF82_1304,iNRG857_1313,iS_1188,iSbBS512_1146,iSBO_1134,iSDY_1059,iSF_1195,iSFV_1184,iSFxv_1172,iSSON_1240,iUMN146_1321,iUMNK88_1353,iUTI89_1310,iWFL_1372,iY75_1357,iZ_1308
0,b0002,P00561,AK1H_ECOLI_model1.pdb,APECO1_1976_AK1H_ECOLI_model1_align_table_corr...,,BWG_0002_AK1H_ECOLI_model1_align_table_correct...,c0003_AK1H_ECOLI_model1_align_table_corrected.csv,E2348C_0002_AK1H_ECOLI_model1_align_table_corr...,EC042_0001_AK1H_ECOLI_model1_align_table_corre...,EC55989_0002_AK1H_ECOLI_model1_align_table_cor...,ECABU_c00010_AK1H_ECOLI_model1_align_table_cor...,ECB_00002_AK1H_ECOLI_model1_align_table_correc...,,,ECDH10B_0002_AK1H_ECOLI_model1_align_table_cor...,EcDH1_3594_AK1H_ECOLI_model1_align_table_corre...,,EcE24377A_0001_AK1H_ECOLI_model1_align_table_c...,,ECH74115_0003_AK1H_ECOLI_model1_align_table_co...,,ECIAI1_0002_AK1H_ECOLI_model1_align_table_corr...,ECIAI39_0001_AK1H_ECOLI_model1_align_table_cor...,ECNA114_4646_AK1H_ECOLI_model1_align_table_cor...,ECO103_0002_AK1H_ECOLI_model1_align_table_corr...,ECO111_0002_AK1H_ECOLI_model1_align_table_corr...,ECO26_0002_AK1H_ECOLI_model1_align_table_corre...,ECOK1_0002_AK1H_ECOLI_model1_align_table_corre...,EcolC_3653_AK1H_ECOLI_model1_align_table_corre...,ECP_0002_AK1H_ECOLI_model1_align_table_correct...,ECs0002_AK1H_ECOLI_model1_align_table_correcte...,ECS88_0002_AK1H_ECOLI_model1_align_table_corre...,ECSE_0002_AK1H_ECOLI_model1_align_table_correc...,ECSF_0002_AK1H_ECOLI_model1_align_table_correc...,EcSMS35_0001_AK1H_ECOLI_model1_align_table_cor...,ECSP_0002_AK1H_ECOLI_model1_align_table_correc...,ECUMN_0002_AK1H_ECOLI_model1_align_table_corre...,,,ETEC_0002_AK1H_ECOLI_model1_align_table_correc...,G2583_0002_AK1H_ECOLI_model1_align_table_corre...,b0002_AK1H_ECOLI_model1_align_table_corrected.csv,LF82_2259_AK1H_ECOLI_model1_align_table_correc...,NRG857_00015_AK1H_ECOLI_model1_align_table_cor...,S0002_AK1H_ECOLI_model1_align_table_corrected.csv,SbBS512_E0002_AK1H_ECOLI_model1_align_table_co...,SBO_0001_AK1H_ECOLI_model1_align_table_correct...,SDY_0002_AK1H_ECOLI_model1_align_table_correct...,,SFV_0001_AK1H_ECOLI_model1_align_table_correct...,SFxv_0001_AK1H_ECOLI_model1_align_table_correc...,SSON
_0002_AK1H_ECOLI_model1_align_table_correc...,,UMNK88_1_AK1H_ECOLI_model1_align_table_correct...,UTI89_C0002_AK1H_ECOLI_model1_align_table_corr...,,,
1,b0003,P00547,KHSE_ECOLI_model1.pdb,APECO1_1975_KHSE_ECOLI_model1_align_table_corr...,,BWG_0003_KHSE_ECOLI_model1_align_table_correct...,c0004_KHSE_ECOLI_model1_align_table_corrected.csv,E2348C_0003_KHSE_ECOLI_model1_align_table_corr...,EC042_0002_KHSE_ECOLI_model1_align_table_corre...,EC55989_0003_KHSE_ECOLI_model1_align_table_cor...,ECABU_c00020_KHSE_ECOLI_model1_align_table_cor...,ECB_00003_KHSE_ECOLI_model1_align_table_correc...,,,ECDH10B_0003_KHSE_ECOLI_model1_align_table_cor...,EcDH1_3593_KHSE_ECOLI_model1_align_table_corre...,EcDH1_3593_KHSE_ECOLI_model1_align_table_corre...,EcE24377A_0002_KHSE_ECOLI_model1_align_table_c...,,ECH74115_0004_KHSE_ECOLI_model1_align_table_co...,,ECIAI1_0003_KHSE_ECOLI_model1_align_table_corr...,ECIAI39_0002_KHSE_ECOLI_model1_align_table_cor...,ECNA114_4647_KHSE_ECOLI_model1_align_table_cor...,ECO103_0003_KHSE_ECOLI_model1_align_table_corr...,ECO111_0003_KHSE_ECOLI_model1_align_table_corr...,ECO26_0003_KHSE_ECOLI_model1_align_table_corre...,ECOK1_0003_KHSE_ECOLI_model1_align_table_corre...,EcolC_3652_KHSE_ECOLI_model1_align_table_corre...,ECP_0003_KHSE_ECOLI_model1_align_table_correct...,ECs0003_KHSE_ECOLI_model1_align_table_correcte...,ECS88_0003_KHSE_ECOLI_model1_align_table_corre...,ECSE_0003_KHSE_ECOLI_model1_align_table_correc...,ECSF_0003_KHSE_ECOLI_model1_align_table_correc...,EcSMS35_0002_KHSE_ECOLI_model1_align_table_cor...,ECSP_0003_KHSE_ECOLI_model1_align_table_correc...,ECUMN_0003_KHSE_ECOLI_model1_align_table_corre...,,,ETEC_0003_KHSE_ECOLI_model1_align_table_correc...,G2583_0003_KHSE_ECOLI_model1_align_table_corre...,b0003_KHSE_ECOLI_model1_align_table_corrected.csv,LF82_2260_KHSE_ECOLI_model1_align_table_correc...,NRG857_00020_KHSE_ECOLI_model1_align_table_cor...,S0003_KHSE_ECOLI_model1_align_table_corrected.csv,SbBS512_E0003_KHSE_ECOLI_model1_align_table_co...,SBO_0002_KHSE_ECOLI_model1_align_table_correct...,SDY_0003_KHSE_ECOLI_model1_align_table_correct...,,SFV_0002_KHSE_ECOLI_model1_align_table_correct...,SFxv_
0002_KHSE_ECOLI_model1_align_table_correc...,SSON_0003_KHSE_ECOLI_model1_align_table_correc...,,UMNK88_2_KHSE_ECOLI_model1_align_table_correct...,UTI89_C0003_KHSE_ECOLI_model1_align_table_corr...,,,
2,b0004,P00934,1vb3.pdb,APECO1_1974_1vb3_align_table_corrected.csv,,BWG_0004_1vb3_align_table_corrected.csv,c0005_1vb3_align_table_corrected.csv,E2348C_0004_1vb3_align_table_corrected.csv,EC042_0003_1vb3_align_table_corrected.csv,EC55989_0004_1vb3_align_table_corrected.csv,ECABU_c00030_1vb3_align_table_corrected.csv,ECB_00004_1vb3_align_table_corrected.csv,,,ECDH10B_0004_1vb3_align_table_corrected.csv,EcDH1_3592_1vb3_align_table_corrected.csv,,EcE24377A_0003_1vb3_align_table_corrected.csv,,ECH74115_0005_1vb3_align_table_corrected.csv,,ECIAI1_0004_1vb3_align_table_corrected.csv,ECIAI39_0003_1vb3_align_table_corrected.csv,ECNA114_4648_1vb3_align_table_corrected.csv,ECO103_0004_1vb3_align_table_corrected.csv,ECO111_0004_1vb3_align_table_corrected.csv,ECO26_0004_1vb3_align_table_corrected.csv,ECOK1_0004_1vb3_align_table_corrected.csv,EcolC_3651_1vb3_align_table_corrected.csv,ECP_0004_1vb3_align_table_corrected.csv,ECs0004_1vb3_align_table_corrected.csv,ECS88_0004_1vb3_align_table_corrected.csv,ECSE_0004_1vb3_align_table_corrected.csv,ECSF_0004_1vb3_align_table_corrected.csv,EcSMS35_0003_1vb3_align_table_corrected.csv,ECSP_0004_1vb3_align_table_corrected.csv,ECUMN_0004_1vb3_align_table_corrected.csv,,,ETEC_0004_1vb3_align_table_corrected.csv,G2583_0004_1vb3_align_table_corrected.csv,b0004_1vb3_align_table_corrected.csv,LF82_2261_1vb3_align_table_corrected.csv,NRG857_00025_1vb3_align_table_corrected.csv,S0004_1vb3_align_table_corrected.csv,SbBS512_E0004_1vb3_align_table_corrected.csv,SBO_0003_1vb3_align_table_corrected.csv,SDY_0004_1vb3_align_table_corrected.csv,,SFV_0003_1vb3_align_table_corrected.csv,SFxv_0003_1vb3_align_table_corrected.csv,SSON_0004_1vb3_align_table_corrected.csv,,UMNK88_3_1vb3_align_table_corrected.csv,UTI89_C0004_1vb3_align_table_corrected.csv,,,
3,b0007,P30143,YAAJ_ECOLI_model1.pdb,APECO1_1971_YAAJ_ECOLI_model1_align_table_corr...,,BWG_0007_YAAJ_ECOLI_model1_align_table_correct...,c0011_YAAJ_ECOLI_model1_align_table_corrected.csv,E2348C_0007_YAAJ_ECOLI_model1_align_table_corr...,EC042_0008_YAAJ_ECOLI_model1_align_table_corre...,EC55989_0007_YAAJ_ECOLI_model1_align_table_cor...,ECABU_c00080_YAAJ_ECOLI_model1_align_table_cor...,ECB_00007_YAAJ_ECOLI_model1_align_table_correc...,,,ECDH10B_0007_YAAJ_ECOLI_model1_align_table_cor...,EcDH1_3589_YAAJ_ECOLI_model1_align_table_corre...,,EcE24377A_0007_YAAJ_ECOLI_model1_align_table_c...,,ECH74115_0008_YAAJ_ECOLI_model1_align_table_co...,,ECIAI1_0007_YAAJ_ECOLI_model1_align_table_corr...,ECIAI39_0006_YAAJ_ECOLI_model1_align_table_cor...,ECNA114_4651_YAAJ_ECOLI_model1_align_table_cor...,ECO103_0008_YAAJ_ECOLI_model1_align_table_corr...,ECO111_0007_YAAJ_ECOLI_model1_align_table_corr...,ECO26_0007_YAAJ_ECOLI_model1_align_table_corre...,ECOK1_0007_YAAJ_ECOLI_model1_align_table_corre...,EcolC_3648_YAAJ_ECOLI_model1_align_table_corre...,ECP_0008_YAAJ_ECOLI_model1_align_table_correct...,ECs0007_YAAJ_ECOLI_model1_align_table_correcte...,ECS88_0007_YAAJ_ECOLI_model1_align_table_corre...,ECSE_0007_YAAJ_ECOLI_model1_align_table_correc...,ECSF_0007_YAAJ_ECOLI_model1_align_table_correc...,EcSMS35_0006_YAAJ_ECOLI_model1_align_table_cor...,ECSP_0007_YAAJ_ECOLI_model1_align_table_correc...,ECUMN_0007_YAAJ_ECOLI_model1_align_table_corre...,,,ETEC_0007_YAAJ_ECOLI_model1_align_table_correc...,G2583_0007_YAAJ_ECOLI_model1_align_table_corre...,b0007_YAAJ_ECOLI_model1_align_table_corrected.csv,LF82_2462_YAAJ_ECOLI_model1_align_table_correc...,NRG857_00040_YAAJ_ECOLI_model1_align_table_cor...,,SbBS512_E0007_YAAJ_ECOLI_model1_align_table_co...,,SDY_0007_YAAJ_ECOLI_model1_align_table_correct...,,,,,,UMNK88_6_YAAJ_ECOLI_model1_align_table_correct...,UTI89_C0008_YAAJ_ECOLI_model1_align_table_corr...,,,
4,b0008,P0A870,1onr.pdb,APECO1_1970_1onr_align_table_corrected.csv,,BWG_0008_1onr_align_table_corrected.csv,c0012_1onr_align_table_corrected.csv,E2348C_0008_1onr_align_table_corrected.csv,EC042_0009_1onr_align_table_corrected.csv,EC55989_0009_1onr_align_table_corrected.csv,ECABU_c00090_1onr_align_table_corrected.csv,ECB_00008_1onr_align_table_corrected.csv,,,ECDH10B_0008_1onr_align_table_corrected.csv,EcDH1_3588_1onr_align_table_corrected.csv,,EcE24377A_0008_1onr_align_table_corrected.csv,,ECH74115_0009_1onr_align_table_corrected.csv,,ECIAI1_0009_1onr_align_table_corrected.csv,ECIAI39_0008_1onr_align_table_corrected.csv,ECNA114_4652_1onr_align_table_corrected.csv,ECO103_0009_1onr_align_table_corrected.csv,ECO111_0008_1onr_align_table_corrected.csv,ECO26_0008_1onr_align_table_corrected.csv,ECOK1_0008_1onr_align_table_corrected.csv,EcolC_3647_1onr_align_table_corrected.csv,ECP_0009_1onr_align_table_corrected.csv,ECs0008_1onr_align_table_corrected.csv,ECS88_0009_1onr_align_table_corrected.csv,ECSE_0008_1onr_align_table_corrected.csv,ECSF_0008_1onr_align_table_corrected.csv,EcSMS35_0007_1onr_align_table_corrected.csv,ECSP_0008_1onr_align_table_corrected.csv,ECUMN_0009_1onr_align_table_corrected.csv,,,ETEC_0008_1onr_align_table_corrected.csv,G2583_0008_1onr_align_table_corrected.csv,b0008_1onr_align_table_corrected.csv,LF82_2216_1onr_align_table_corrected.csv,NRG857_00045_1onr_align_table_corrected.csv,S0008_1onr_align_table_corrected.csv,SbBS512_E0008_1onr_align_table_corrected.csv,SBO_0009_1onr_align_table_corrected.csv,SDY_0008_1onr_align_table_corrected.csv,,SFV_0007_1onr_align_table_corrected.csv,SFxv_0007_1onr_align_table_corrected.csv,SSON_0009_1onr_align_table_corrected.csv,,UMNK88_7_1onr_align_table_corrected.csv,UTI89_C0009_1onr_align_table_corrected.csv,,,


# Create dataframe of only mutations

In [53]:
# Gather the per-residue alignment tables into one long dataframe (Tlist).
# Every row of each table is tagged with three extra leading columns
# (labelled 0, 1, 2): the m_gene value, the ssb_best_file value and the
# strain column name it was read for.
#
# NOTE(review): only strain columns 50-57 and gene rows 0-4 of
# strain_alignment_mapping_df are processed, which is narrower than
# "all strains" -- widen both windows below to cover the full table.
STRAIN_COLUMN_POSITIONS = range(50, 58)
GENE_ROW_LABELS = range(0, 5)

alignment_frames = []
for k in STRAIN_COLUMN_POSITIONS:
    strain = strain_alignment_mapping_df.columns[k]

    # The folder lookup depends only on the strain, so do it once per strain
    # instead of once per gene. Only STRAIN_SEQ_ALIGN_FILES is used here.
    [STRAIN_DIR, STRAIN_DATA_FRAMES, STRAIN_MODEL_FILES, STRAIN_SEQ_ALIGN_FILES] = strain_folder_locations(strain)

    for i in GENE_ROW_LABELS:
        filename = strain_alignment_mapping_df[strain][i]

        # A null entry means there is no alignment table for this gene/strain.
        if pd.isnull(filename):
            continue

        filepath = os.path.join(STRAIN_SEQ_ALIGN_FILES, filename)
        mutlist = pd.read_csv(filepath, header=0, index_col=0).reset_index(drop=True)

        # An empty table contributes no rows; skipping it keeps every
        # collected frame on the same set of columns.
        if mutlist.empty:
            continue

        # One identical (gene, structure file, strain) tag per alignment row.
        tag = [strain_alignment_mapping_df.m_gene[i],
               strain_alignment_mapping_df.ssb_best_file[i],
               strain]
        gdf = pd.DataFrame([tag] * len(mutlist))

        # Booleans (not the strings 'True') are required by pd.merge.
        mutlistf = pd.merge(gdf, mutlist, left_index=True, right_index=True, how='inner')
        alignment_frames.append(mutlistf)

# Concatenate once at the end instead of growing a dataframe inside the loop
# (DataFrame.append is quadratic and no longer exists in pandas >= 2.0).
# Row labels restart at 0 for every table, exactly as with repeated append.
if alignment_frames:
    Tlist = pd.concat(alignment_frames)
else:
    Tlist = pd.DataFrame([])

In [56]:
# Keep only the alignment rows whose type is 'mutation', give the columns
# descriptive names, then collapse rows that describe the same substitution
# (same gene, structure, strain, positions and residues).
is_point_mutation = Tlist['type'] == 'mutation'
Mutation = Tlist[is_point_mutation]
Mutation.columns = [
    'mutation_genes', 'pdb', 'strain',
    u'id_a', u'id_b', u'type',
    u'start', u'stop', u'count',
    u'uniprot_aa', u'pdb_aa', u'chain',
    u'pdb_start', u'pdb_stop',
]
dedup_keys = [
    'mutation_genes', 'pdb', 'strain', 'id_a',
    'pdb_start', 'pdb_stop', 'start', 'stop',
    'uniprot_aa', 'pdb_aa',
]
Mutation = Mutation.drop_duplicates(subset=dedup_keys).reset_index(drop=True)

In [57]:
# Display the de-duplicated point-mutation table.
Mutation

Unnamed: 0,mutation_genes,pdb,strain,id_a,id_b,type,start,stop,count,uniprot_aa,pdb_aa,chain,pdb_start,pdb_stop
0,b0002,AK1H_ECOLI_model1.pdb,iSFxv_1172,SFxv_0001,AK1H_ECOLI_model1.X,mutation,512,512,1,S,N,X,512,512
1,b0002,AK1H_ECOLI_model1.pdb,iSFxv_1172,SFxv_0001,AK1H_ECOLI_model1.X,mutation,630,630,1,V,M,X,630,630
2,b0003,KHSE_ECOLI_model1.pdb,iSFxv_1172,SFxv_0002,KHSE_ECOLI_model1.X,mutation,25,25,1,A,T,X,25,25
3,b0003,KHSE_ECOLI_model1.pdb,iSFxv_1172,SFxv_0002,KHSE_ECOLI_model1.X,mutation,48,48,1,K,N,X,48,48
4,b0003,KHSE_ECOLI_model1.pdb,iSFxv_1172,SFxv_0002,KHSE_ECOLI_model1.X,mutation,274,274,1,D,E,X,274,274
5,b0004,1vb3.pdb,iSFxv_1172,SFxv_0003,1vb3.A,mutation,123,123,1,D,G,A,123,123
6,b0004,1vb3.pdb,iSFxv_1172,SFxv_0003,1vb3.A,mutation,237,237,1,A,T,A,237,237
7,b0004,1vb3.pdb,iSFxv_1172,SFxv_0003,1vb3.A,mutation,262,262,1,F,L,A,262,262
8,b0004,1vb3.pdb,iSFxv_1172,SFxv_0003,1vb3.A,mutation,418,418,1,S,A,A,418,418
9,b0002,AK1H_ECOLI_model1.pdb,iSSON_1240,SSON_0002,AK1H_ECOLI_model1.X,mutation,302,302,1,G,D,X,302,302


# Count the occurrence of mutations per strain

In [62]:
# Maps each of the 20 standard amino acids (one-letter code) to the
# physico-chemical class used when tallying mutations by residue type.
AAdict2 = {'C':'polar',
 'I':'hydrophobic',
 'G':'nonpolar', 
 'S':'polar', 
 'Q':'polar', 
 'K':'positive',  # lysine carries a positive charge (was mislabelled 'negative')
 'N':'polar',
 'P':'nonpolar', 
 'D':'negative', 
 'T':'polar', 
 'F':'hydrophobic', 
 'A':'nonpolar', 
 'M':'hydrophobic', 
 'H':'positive', 
 'L':'hydrophobic', 
 'R':'positive', 
 'W':'hydrophobic', 
 'V':'hydrophobic', 
 'E':'negative', 
 'Y':'polar'}

In [64]:
# Column order of the per-residue tally: the one-letter residue codes
# followed by the residue classes used in AAdict2 (plus 'unknown').
Datalist=['R', 'H', 'K', 'D', 'E', 'S', 'T', 'N', 
          'Q', 'C', 'U', 'Y', 'P', 'G', 'A', 'I', 
          'L', 'M', 'F', 'W', 'V', 'X', 'positive', 'negative', 'polar', 'nonpolar', 'hydrophobic','unknown']

# Step 1: for every mutated residue of every gene, build one row holding the
# gene, the number of mutation rows recorded for that gene, the residue
# position, and a -1/+1 "stoichiometry" over Datalist: -1 for the residue
# (and residue class) that is lost, +1 for the one that is gained. Class
# entries stay 0 when both residues belong to the same class.
reslist=[]

df = Mutation

# Visit each gene once; iterating over the raw column would redo the same
# gene for every one of its rows.
for gene in df.mutation_genes.unique():
    gene_rows = df[df.mutation_genes == gene]
    num = len(gene_rows)
    tmp_df = gene_rows.drop_duplicates().reset_index()

    for i in range(0,len(tmp_df.index)):

        # NOTE(review): the original cell never assigned aasubset, mutsubset
        # or loc inside this loop, so it only ran with leftover kernel state
        # and gave every row the same values. The mapping below (pdb_aa is the
        # residue lost, uniprot_aa the residue gained, start is the position)
        # is an assumption -- TODO confirm against the alignment table
        # definition before relying on the signs.
        aasubset = tmp_df.pdb_aa[i]
        mutsubset = tmp_df.uniprot_aa[i]
        loc = tmp_df.start[i]

        # Residues missing from AAdict2 (e.g. 'U', 'X') fall into 'unknown'
        # instead of raising KeyError.
        aa_class = AAdict2.get(aasubset, 'unknown')
        mut_class = AAdict2.get(mutsubset, 'unknown')
        class_changed = aa_class != mut_class

        stoich_mut = []

        for t in Datalist:
            if t == aasubset:
                stoich_mut.append(-1)
            elif t == mutsubset:
                stoich_mut.append(1)
            elif t == aa_class:
                stoich_mut.append(-1 if class_changed else 0)
            elif t == mut_class:
                stoich_mut.append(1 if class_changed else 0)
            else:
                stoich_mut.append(0)

        reslist.append([gene, num, loc]+stoich_mut)

# Passing the column names to the constructor keeps this working when no
# rows were produced. Identical rows are collapsed to one.
mutres = pd.DataFrame(reslist, columns=['gene','strains','res'] + Datalist).drop_duplicates()
mutres.head(2)

Unnamed: 0,gene,res,strains,strainlist,R,H,K,D,E,S,T,N,Q,C,U,Y,P,G,A,I,L,M,F,W,V,X,positive,negative,polar,nonpolar,unknown,syslist,sasa,redepth,sstructure,CM,EP,IP,SG,per_CM,per_EP,per_IP,per_SG,i_entry_name,pdb_file
0,b0003,274,37,"['iAPECO1_1312', 'ic_1306', 'iE2348C_1286', 'i...",0,0,0,37,-37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,['Threonine and Lysine Metabolism'],158,1.734404,H,0.108108,0.405405,0.297297,0.189189,0.444444,0.9375,1.0,1.0,KHSE_ECOLI,KHSE_ECOLI_model1.pdb
1,b0004,237,38,"['iAPECO1_1312', 'ic_1306', 'iE2348C_1286', 'i...",0,0,0,0,0,0,-38,0,0,0,0,0,0,0,38,0,0,0,0,0,0,0,0,0,-38,38,0,"['Cofactor and Prosthetic Group Biosynthesis',...",25,2.577587,T,0.105263,0.421053,0.289474,0.184211,0.444444,1.0,1.0,1.0,THRC_ECOLI,THRC_ECOLI_model1.pdb
2,b0026,849,33,"['iAPECO1_1312', 'ic_1306', 'iE2348C_1286', 'i...",0,0,0,0,0,-33,0,0,0,0,0,0,0,0,33,0,0,0,0,0,0,0,0,0,-33,33,0,['tRNA Charging'],21,1.883486,-,0.121212,0.484848,0.212121,0.181818,0.444444,1.0,0.636364,0.857143,SYI_ECOLI,SYI_ECOLI_model1.pdb
