In [1]:
import pandas as pd

In [2]:
# COG reference tables from the interim data directory.
# cog_descriptions: one record per COG (cog_id, cog_name, pathway are used below).
cog_descriptions = pd.read_parquet(path='../data/interim/cog_descriptions.pq')
# cog_proteins: protein-to-COG assignments (protein_id, cog_id, membership_class are used below).
cog_proteins = pd.read_parquet(path='../data/interim/cog_proteins.pq')

# seq_ids flagged as defense-system homologs; used later as an exclusion list.
defense_homologs = pd.read_parquet(path='../data/interim/defense_finder_homologs_seq_ids.pq')

# Externally supplied list of candidate mobilome COG ids (column cog_id).
mobilome_cogs = pd.read_csv(filepath_or_buffer='../data/external/mobilome_cog_ids.csv')

In [3]:
# Mapping table between seq_id and protein_accession (both columns are used below).
seq_id_accessions = pd.read_parquet(path='../data/interim/seq_id_accessions.pq')

### Write out COGs from Makarova et al. for manual inspection

In [3]:
# Keep the COG descriptions whose cog_id appears in the mobilome COG list.
in_mobilome_list = cog_descriptions['cog_id'].isin(mobilome_cogs['cog_id'])
cog_mobilome_descriptions = cog_descriptions.loc[in_mobilome_list]

In [5]:
# Save the putative mobilome COG descriptions (without the index) for manual inspection.
cog_mobilome_descriptions.to_csv(
    '../data/interim/cog_putative_mobilome_descriptions.csv',
    index=False,
)

In [6]:
# Complement of the mobilome selection: every COG whose cog_id is NOT in the mobilome list.
cog_other_descriptions = cog_descriptions.loc[
    lambda descriptions: ~descriptions['cog_id'].isin(mobilome_cogs['cog_id'])
]

In [7]:
# Save the remaining (non-mobilome-list) COG descriptions, without the index.
cog_other_descriptions.to_csv(
    path_or_buf='../data/interim/cog_other_descriptions.csv',
    index=False,
)

### Read in manually selected COGs

In [4]:
# Curated COG id lists from the external data directory (each has a cog_id column).
secretion_system_cog_ids = pd.read_csv(
    filepath_or_buffer='../data/external/filtered_secretion_system_cog_ids.csv'
)
filtered_mobilome_cog_ids = pd.read_csv(
    filepath_or_buffer='../data/external/filtered_mobilome_cog_ids.csv'
)

In [5]:
# Single flat list of selected COG ids: mobilome ids first, then secretion-system ids.
selected_cog_ids = [
    *filtered_mobilome_cog_ids['cog_id'].to_list(),
    *secretion_system_cog_ids['cog_id'].to_list(),
]

In [6]:
# Unique (protein_id, cog_id) pairs for proteins assigned to one of the selected COGs.
in_selected_cogs = cog_proteins['cog_id'].isin(selected_cog_ids)
# Only rows with membership_class == 0 are kept.
# NOTE(review): the meaning of class 0 is not documented in this notebook —
# confirm against the COG data documentation.
is_membership_class_zero = cog_proteins['membership_class'].eq(0)

selected_cog_protein_ids = (
    cog_proteins
    .loc[in_selected_cogs & is_membership_class_zero, ['protein_id', 'cog_id']]
    .drop_duplicates()
)

In [7]:
# Number of unique (protein_id, cog_id) pairs selected.
selected_cog_protein_ids.shape[0]

24378

Select the subset of sequences in the RefSeq data that belong to the selected COGs, and filter out homologs of defense systems

In [8]:
# Rows whose protein accession belongs to a selected COG ...
has_selected_cog_protein = seq_id_accessions['protein_accession'].isin(
    selected_cog_protein_ids['protein_id']
)
# ... and whose seq_id is listed among the defense-system homologs (to be excluded).
is_defense_homolog = seq_id_accessions['seq_id'].isin(defense_homologs['seq_id'])

selected_cog_seq_ids = seq_id_accessions[has_selected_cog_protein & ~is_defense_homolog]

In [9]:
# Number of sequence rows left after the COG selection and defense-homolog exclusion.
selected_cog_seq_ids.shape[0]

11924

In [10]:
# Attach the COG id, and then the COG name/pathway, to every selected sequence row.
# Both joins are inner joins, so rows without a match on the join key are dropped.

# Align the key column name with selected_cog_protein_ids before joining.
seq_ids_by_protein_id = selected_cog_seq_ids.rename(
    columns={'protein_accession': 'protein_id'}
)
# COG annotation columns to carry over; cog_name is exposed as 'name'.
cog_annotations = (
    cog_descriptions[['cog_id', 'cog_name', 'pathway']]
    .rename(columns={'cog_name': 'name'})
)

seq_ids_with_cog_id = seq_ids_by_protein_id.merge(
    selected_cog_protein_ids, how='inner', on='protein_id'
)
merged_selected_cog_ids = seq_ids_with_cog_id.merge(
    cog_annotations, how='inner', on='cog_id'
)

In [11]:
# The 20 most frequent COG names among the selected sequences.
cog_name_counts = merged_selected_cog_ids['name'].value_counts()
cog_name_counts.iloc[:20]

name
Site-specific recombinase XerD                                                             1997
Transposase                                                                                1345
REP element-mobilizing transposase RayT                                                     642
Site-specific DNA recombinase SpoIVCA/DNA invertase PinE                                    622
Predicted component of the type VI protein secretion system                                 470
Transposase (or an inactivated derivative)                                                  396
Transposase and inactivated derivatives, IS30 family                                        305
Type VI protein secretion system component Hcp (secreted cytotoxin)                         288
Type IV secretory pathway, VirB4 component                                                  259
Transposase InsO and inactivated derivatives                                                254
Transposase InsE and inactivated de

In [12]:
# The 20 least frequent COG names among the selected sequences.
cog_name_counts = merged_selected_cog_ids['name'].value_counts()
cog_name_counts.iloc[-20:]

name
Bacteriophage P2-related tail formation protein                           32
Mu-like prophage tail sheath protein gpL                                  26
Mu-like prophage tail protein gpP                                         25
Phage capsid portal protein XkdE                                          21
Phage DNA packaging protein, Nu1 subunit of terminase                     18
IS4 transposase InsG                                                      18
Type IV secretory pathway, VirB6 component                                17
DNA primase, phage- or plasmid-associated                                 17
Type III secretory pathway, EscU/YscU component                           15
Integrase/recombinase, includes phage integrase                           15
Type IV secretory pathway, VirJ component                                 14
ESX protein secretion system component YukD                               13
Phage-related minor tail protein                                       

In [13]:
# Persist the annotated, filtered sequence table (index not written).
merged_selected_cog_ids.to_parquet(
    path='../data/interim/cog_filtered_seq_ids.pq',
    index=False,
)