In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the per-sequence GO process table and the DefenseFinder true-defense
# sequence IDs from the interim data directory.
go_df = pd.read_parquet(
    path='../data/interim/seq_id_go_processes.pq',
)
true_defense_genes = pd.read_parquet(
    path='../data/interim/defense_finder_true_defense_seq_ids.pq',
)

In [4]:
# Load the table of names per sequence ID (used below via its 'seq_id' and
# 'name' columns).
seq_id_names = pd.read_parquet(
    path='../data/interim/seq_id_names.pq',
)

If a gene **might** be defensive based on its GO process, don't include it in the negative set; i.e., if p(defense | GO process) > 0.4, remove the gene from the negative list.

### GO term filter

In [5]:
# Keep only the GO annotations whose seq_id is listed among the true defense genes.
defense_go_df = go_df.loc[
    go_df['seq_id'].isin(true_defense_genes['seq_id'])
]

In [6]:
# Count how often each GO process occurs among the true defense genes.
# Naming the index before reset_index yields the columns
# ['go_process', 'count'] regardless of how value_counts labels its index.
defense_go_stats = (
    defense_go_df['go_process']
    .value_counts()
    .rename_axis('go_process')
    .reset_index(name='count')
)

In [7]:
# Count how often each GO process occurs across all annotated sequences
# (the baseline the defense counts are compared against).
baseline_go_stats = (
    go_df['go_process']
    .value_counts()
    .rename_axis('go_process')
    .reset_index(name='count')
)

In [8]:
# Join defense and baseline counts per GO process, then compute the share of
# each process's annotations that come from true defense genes.
# NOTE: 'defense_percent' holds a fraction in [0, 1], not a percentage.
merged_go_stats = (
    defense_go_stats
    .merge(
        baseline_go_stats,
        on='go_process',
        how='inner',
        suffixes=('_defense', '_baseline'),
    )
    .assign(
        defense_percent=lambda stats: stats['count_defense'] / stats['count_baseline']
    )
)

In [9]:
# Show the 20 GO processes with the highest defense fraction.
(
    merged_go_stats
    .sort_values(by='defense_percent', ascending=False)
    .head(20)
)

Unnamed: 0,go_process,count_defense,count_baseline,defense_percent
4,defense response to virus,3865,4637,0.833513
0,maintenance of CRISPR repeat elements,44785,54361,0.823844
3,DNA restriction-modification system,8078,15299,0.528008
2,DNA methylation,9301,21212,0.438478
40,nucleic acid phosphodiester bond hydrolysis,23,53,0.433962
1,DNA modification,10240,25462,0.402168
8,DNA methylation on adenine,446,4076,0.109421
7,dGTP catabolic process,457,6373,0.071709
9,protein catabolic process,338,9641,0.035059
20,protein tetramerization,104,4806,0.02164


In [10]:
# GO processes for which more than 40% of annotated sequences are true
# defense genes; genes carrying any of these terms are excluded later.
defense_go_terms = (
    merged_go_stats
    .loc[lambda stats: stats['defense_percent'].gt(0.4), 'go_process']
    .tolist()
)

We'll filter out genes that are members of the following pathways:

In [11]:
# Display the GO processes that passed the defense_percent > 0.4 cutoff.
defense_go_terms

['maintenance of CRISPR repeat elements',
 'DNA modification',
 'DNA methylation',
 'DNA restriction-modification system',
 'defense response to virus',
 'nucleic acid phosphodiester bond hydrolysis']

In [12]:
# Unique sequence IDs annotated with at least one defense-associated GO process.
defense_go_term_seq_ids = (
    go_df[go_df['go_process'].isin(defense_go_terms)]['seq_id']
    .unique()
)

### Toxin-antitoxin (TA) systems

Because toxin-antitoxin (TA) systems are frequently misannotated as functioning in stress response or persister cell formation, we will remove obvious TA systems from the data.

In [13]:
# Regex alternation matched against protein names to flag toxin-antitoxin /
# abortive-infection annotations.
ta_pattern = r'toxin-antitoxin|addiction module|abortive infection'
# NOTE(review): `pattern` is a second name for the same string; it is not used
# in the cells shown here and is kept only so existing references keep working.
pattern = ta_pattern

In [14]:
# Exact protein names treated as toxin-antitoxin members in addition to the
# names matched by the regex pattern.
ta_names = [
    'nucleotidyl transferase AbiEii/AbiGii toxin family protein',
    'BrnT family toxin',
    'zeta toxin family protein',
]

In [15]:
# Boolean mask over seq_id_names: True where the name matches ta_pattern or is
# exactly one of ta_names.
# na=False makes rows with a missing name evaluate to False explicitly, so the
# mask is strictly boolean instead of relying on how `|` coerces NaN values.
ta_rows = (
    seq_id_names['name'].str.contains(ta_pattern, regex=True, na=False)
    | seq_id_names['name'].isin(ta_names)
)

In [23]:
# Inspect the 50 most frequent names flagged as toxin-antitoxin.
(
    seq_id_names[ta_rows]['name']
    .value_counts()
    .head(50)
)

name
type II toxin-antitoxin system RelE/ParE family toxin                              40136
type II toxin-antitoxin system VapC family toxin                                   25391
type II toxin-antitoxin system PemK/MazF family toxin                              14282
type II toxin-antitoxin system Phd/YefM family antitoxin                           13125
type II toxin-antitoxin system HicB family antitoxin                               10758
type II toxin-antitoxin system prevent-host-death family antitoxin                 10571
nucleotidyl transferase AbiEii/AbiGii toxin family protein                          9819
type II toxin-antitoxin system HicA family toxin                                    7774
type II toxin-antitoxin system VapB family antitoxin                                7188
HigA family addiction module antitoxin                                              7077
type II toxin-antitoxin system ParD family antitoxin                                6244
type IV toxin-an

In [17]:
# Unique sequence IDs of the rows flagged as toxin-antitoxin.
ta_ids = seq_id_names[ta_rows]['seq_id'].unique()

### Filtering

In [18]:
# Background (negative) GO annotations: drop every row whose seq_id is a true
# defense gene, carries a defense-associated GO process, or is a flagged
# toxin-antitoxin sequence.
background_go_df = go_df.loc[
    ~(
        go_df['seq_id'].isin(true_defense_genes['seq_id'])
        | go_df['seq_id'].isin(defense_go_term_seq_ids)
        | go_df['seq_id'].isin(ta_ids)
    )
]

In [19]:
# Sanity check: the 30 most common GO processes left in the background set.
(
    background_go_df['go_process']
    .value_counts()
    .iloc[:30]
)

go_process
translation                                           748338
transmembrane transport                               503296
regulation of DNA-templated transcription             380624
regulation of transcription%2C DNA-templated          373328
proteolysis                                           341891
DNA repair                                            325868
phosphorelay signal transduction system               288476
carbohydrate metabolic process                        276985
DNA recombination                                     268528
peptidoglycan biosynthetic process                    234656
DNA replication                                       211933
transport                                             203509
tRNA modification                                     193661
glycolytic process                                    166618
biosynthetic process                                  143295
histidine biosynthetic process                        142640
biological_pr

In [20]:
# One entry per background sequence ID, keeping the first occurrence of each.
background_seq_ids = background_go_df.loc[
    ~background_go_df['seq_id'].duplicated(),
    'seq_id',
]

In [21]:
# Number of distinct background sequence IDs.
background_seq_ids.shape[0]

14097829

In [24]:
# Persist the background set: the unique sequence IDs as CSV and the filtered
# GO annotations as parquet, both without the pandas index.
background_seq_ids.to_csv(
    '../data/interim/background_seq_ids.csv',
    index=False,
)
background_go_df.to_parquet(
    '../data/interim/background_go_terms.pq',
    index=False,
)