In [1]:
import pandas as pd
from core import med_log_odds_cutoff
from tqdm import tqdm
import os
import numpy as np

In [3]:
# Interim inputs: the seq_id -> accession table, the DefenseFinder system genes,
# and the DefenseFinder homolog hits.
seq_id_accessions = pd.read_parquet('../data/interim/seq_id_accessions.pq')
df_true_df = pd.read_parquet('../data/interim/defense_finder_genes_genomes.pq')
# Attach the accession columns to each homolog hit; the inner join drops hits
# whose seq_id is missing from seq_id_accessions.
df_homolog_df = (pd.read_parquet('../data/interim/defense_finder_homologs_profile_names.pq')
                 .merge(seq_id_accessions, how='inner', on='seq_id'))

In [None]:
%%time
# The CSV is read as header-less; column names are supplied here.
seq_assembly_columns = ['product_accession', 'seq_id', 'seq', 'assembly']
seq_assemblies = pd.read_csv('../data/interim/seq_assemblies.csv',
                             header=None,
                             names=seq_assembly_columns)

In [4]:
# Keep exactly one homolog row per seq_id, chosen at random with a fixed seed
# so the pick is repeatable.
unqiue_df_homolog_df = df_homolog_df.groupby(by='seq_id').sample(n=1, random_state=7)

In [5]:
# Directory whose '.csv' files are read and concatenated into `predictions`.
prediction_out_dir = '../data/processed/refseq_500_predictions/'

In [6]:
# Load every per-file prediction table in `prediction_out_dir` into one frame.
# - Files are selected by the '.csv' *suffix*; a substring test would also pick
#   up names such as 'x.csv.bak'.
# - os.listdir returns entries in arbitrary order, so the names are sorted to
#   make the row order of `predictions` (and anything saved from it) repeatable.
# - The progress bar now counts only the files that are actually read.
prediction_files = sorted(f for f in os.listdir(prediction_out_dir) if f.endswith('.csv'))
predictions = pd.concat([pd.read_csv(os.path.join(prediction_out_dir, f))
                         for f in tqdm(prediction_files)])

100%|██████████| 500/500 [01:07<00:00,  7.45it/s]


In [7]:
# Convert the model output to log-odds, log(p / (1 - p)), and flag rows whose
# log-odds exceed the cutoff imported from `core`.
# NOTE(review): treats 'beaker_prediction' as a probability in [0, 1] - confirm.
beaker_prob = predictions['beaker_prediction']
predictions['log_odds'] = np.log(beaker_prob / (1 - beaker_prob))
predictions['predicted_defensive'] = predictions['log_odds'] > med_log_odds_cutoff

In [8]:
# Number of rows flagged as predicted defensive (each True contributes 1 to the sum).
predictions['predicted_defensive'].sum()

19895

In [9]:
# Order the system genes best-score-first, then keep the first (highest
# sys_score) row for every (genome, protein_accession) pair.
ranked_true_df = df_true_df.sort_values(by='sys_score', ascending=False)
unique_true_df = ranked_true_df.groupby(['genome', 'protein_accession']).head(n=1)

In [10]:
# --- Step 1: annotate each prediction with its defense-homolog name, if any ---
# The homolog table is keyed on 'protein_accession'; rename it to match the
# prediction table and drop 'seq_id', which is not carried into the merge.
homolog_annotations = (unqiue_df_homolog_df
                       .drop(columns='seq_id')
                       .rename(columns={'protein_accession': 'product_accession',
                                        'gene_name': 'defense_homolog_name'}))
merged_predictions = predictions.merge(homolog_annotations, how='left', on='product_accession')
# A prediction is a defense homolog when the left join found a homolog name.
merged_predictions['defense_homolog'] = merged_predictions['defense_homolog_name'].notna()

# --- Step 2: annotate with the defense system the protein was called in ---
# The system table is matched on the part of 'assembly' before the first '.'.
merged_predictions['assembly_stub'] = merged_predictions['assembly'].str.split('.', expand=True)[0]
system_annotations = (unique_true_df
                      .loc[:, ['genome', 'protein_accession', 'gene_name', 'sys_id']]
                      .rename(columns={'genome': 'assembly_stub',
                                       'protein_accession': 'product_accession',
                                       'gene_name': 'defense_system_gene',
                                       'sys_id': 'defense_system'}))
merged_predictions = merged_predictions.merge(system_annotations,
                                              how='left',
                                              on=['product_accession', 'assembly_stub'])
# True when the (accession, assembly) pair matched a row in the system table.
merged_predictions['defense_system_protein'] = merged_predictions['defense_system_gene'].notna()

In [11]:
# The left merges must not add or drop rows. Fail loudly if they did (for
# example, duplicate keys on the right-hand side fanning rows out) instead of
# only displaying the result, then still show the boolean as the cell output.
row_count_preserved = len(merged_predictions) == len(predictions)
assert row_count_preserved, (
    f'merge changed row count: {len(predictions)} -> {len(merged_predictions)}')
row_count_preserved

True

In [12]:
def assign_predicted_category(row):
    """Return the category label for one prediction row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'defense_system_protein'; 'defense_homolog' is read only
        when 'defense_system_protein' is falsy.

    Returns
    -------
    str
        'Defense homolog in expected system' if 'defense_system_protein' is
        truthy; otherwise 'Defense homolog in new context' if
        'defense_homolog' is truthy; otherwise 'Putative novel defense gene'.
    """
    # The system flag takes precedence over the homolog flag.
    if row['defense_system_protein']:
        return 'Defense homolog in expected system'
    if row['defense_homolog']:
        return 'Defense homolog in new context'
    return 'Putative novel defense gene'

In [13]:
# Restrict to genes predicted defensive and label each with its category.
predicted_defense_df = (
    merged_predictions
    .loc[merged_predictions['predicted_defensive']]
    .assign(predicted_category=lambda frame: frame.apply(assign_predicted_category, axis=1))
)

In [14]:
# Tally of predicted-defensive genes in each assigned category.
predicted_defense_df['predicted_category'].value_counts()

predicted_category
Putative novel defense gene           8021
Defense homolog in expected system    6331
Defense homolog in new context        5543
Name: count, dtype: int64

In [None]:
# Per assembly: the count of non-null product accessions, the number of rows
# flagged defensive, and the ratio of the two.
genes_by_assembly = predictions.groupby('assembly')
assembly_n_genes = (
    genes_by_assembly
    .agg(total_genes=('product_accession', 'count'),
         defensive_genes=('predicted_defensive', 'sum'))
    .reset_index()
    .assign(frac_defensive=lambda counts: counts['defensive_genes'] / counts['total_genes'])
)


In [None]:
# Histogram of the per-assembly fraction of genes predicted defensive.
# Title and axis labels are set so the figure can be read on its own; the
# trailing ';' keeps the Axes repr out of the cell output.
ax = assembly_n_genes['frac_defensive'].plot.hist()
ax.set(title='Predicted defensive genes per assembly',
       xlabel='Fraction of genes predicted defensive',
       ylabel='Number of assemblies');

In [None]:
# Assemblies ordered from the lowest to the highest defensive fraction.
assembly_n_genes.sort_values(by='frac_defensive', ascending=True)

In [None]:
# Predicted-defensive genes labelled 'Putative novel defense gene'.
is_putative_novel = predicted_defense_df['predicted_category'].eq('Putative novel defense gene')
putative_novel_df = predicted_defense_df.loc[is_putative_novel].copy()

In [None]:
# The 50 most frequent values of 'name' among the putative novel genes.
novel_name_counts = putative_novel_df['name'].value_counts()
novel_name_counts.head(n=50)

In [27]:
# Unique (accession, sequence) pairs for the putative novel genes.
is_novel_accession = seq_assemblies['product_accession'].isin(putative_novel_df['product_accession'])
filtered_seqs = (seq_assemblies
                 .loc[is_novel_accession, ['product_accession', 'seq']]
                 .drop_duplicates())


In [30]:
# Write the selected sequences as two-line FASTA records:
# a '>' + accession header line followed by the sequence line.
with open('../data/interim/refseq_500_putative_novel_seqs.faa', 'w') as fasta_out:
    for accession, sequence in zip(filtered_seqs['product_accession'], filtered_seqs['seq']):
        print('>' + accession, file=fasta_out)
        print(sequence, file=fasta_out)

In [16]:
# Persist the annotated prediction table without writing the index column.
merged_predictions.to_csv(path_or_buf='../data/processed/refseq_500_predictions.csv',
                          index=False)