In [1]:
import pandas as pd
import os
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
# Load the background model sequences and flag every row with defense=False.
background_split_seqs = (pd.read_parquet('../data/interim/background_model_seqs.pq')
                         .assign(defense=False))

In [3]:
# Load the defense-finder model sequences and flag every row with defense=True.
defense_split_seqs = (pd.read_parquet('../data/interim/defense_finder_model_seqs.pq')
                      .assign(defense=True))

In [5]:
# Keep only the columns used downstream and rename the identifier columns to
# 'protein_id' / 'assembly', the names the later groupby and merge rely on.
narrow_defense_split_seqs = (
    defense_split_seqs
    .loc[:, ['protein_accession', 'genome', 'seq_id', 'seq',
             'cluster_id', 'split', 'defense', 'gene_name']]
    .rename(columns=dict(protein_accession='protein_id', genome='assembly'))
)

In [6]:
# Stack background and defense rows into one table; the original index labels
# of both frames are kept as-is.
split_seqs = pd.concat(objs=[background_split_seqs, narrow_defense_split_seqs], axis=0)


In [7]:
# Directory holding the per-assembly feature-table files.
ft_dir = '../data/genome_downloads/ft/'
# os.listdir returns entries in arbitrary order; sort them so that any
# "first matching file" lookup over this list is reproducible between runs.
ft_files = sorted(x for x in os.listdir(ft_dir) if '.txt' in x)

In [8]:
def pick_protein_context_id(assembly, assembly_df, ft_files, ft_dir):
    """Choose one genomic context identifier per protein of an assembly.

    Parameters
    ----------
    assembly : str
        Assembly identifier; the feature table used is the first entry of
        ``ft_files`` whose name contains this string.
    assembly_df : pandas.DataFrame
        Rows for this assembly; only its ``protein_id`` column is read.
    ft_files : list of str
        Candidate feature-table file names located in ``ft_dir``.
    ft_dir : str
        Directory containing the feature-table files (with or without a
        trailing path separator).

    Returns
    -------
    pandas.DataFrame
        Columns ``product_accession`` and ``protein_context_id``, one row per
        protein found in the feature table. The context id has the form
        ``product_accession|genomic_accession|start|strand``.

    Raises
    ------
    FileNotFoundError
        If no entry of ``ft_files`` contains ``assembly``.
    """
    # NOTE(review): substring matching means an identifier that is a prefix of
    # another one could select the wrong file - confirm identifiers are unambiguous.
    assembly_ft_file = next((x for x in ft_files if assembly in x), None)
    if assembly_ft_file is None:
        # A bare next() would leak StopIteration, which is easy to misread
        # (or to have swallowed) inside parallel workers.
        raise FileNotFoundError(
            f'No feature table file matching assembly {assembly!r} in {ft_dir!r}')
    assembly_ft = pd.read_table(os.path.join(ft_dir, assembly_ft_file))
    in_assembly = assembly_ft['product_accession'].isin(assembly_df['protein_id'])
    # A protein accession can occur at several loci; keep a single,
    # reproducibly chosen (fixed seed) row per accession.
    relevant_ft = (assembly_ft[in_assembly]
                   .groupby('product_accession')
                   .sample(n=1, random_state=7))
    relevant_ft['protein_context_id'] = (relevant_ft['product_accession'] + '|' +
                                         relevant_ft['genomic_accession'] + '|' +
                                         relevant_ft['start'].astype(str) + '|' +
                                         relevant_ft['strand'])
    return relevant_ft[['product_accession', 'protein_context_id']]

In [9]:
# Build one delayed task per assembly (tqdm tracks task dispatch), then run
# them across 48 worker jobs.
context_tasks = (
    delayed(pick_protein_context_id)(assembly, assembly_df, ft_files, ft_dir)
    for assembly, assembly_df in tqdm(split_seqs.groupby('assembly'),
                                      total=split_seqs['assembly'].nunique())
)
split_protein_context_id_list = Parallel(n_jobs=48)(context_tasks)

100%|██████████| 17405/17405 [01:40<00:00, 172.87it/s]


In [10]:
# Combine the per-assembly lookup frames into a single table.
split_protein_context_df = pd.concat(objs=split_protein_context_id_list, axis=0)

In [11]:
# Drop the name for the list of per-assembly frames; they are now combined in
# split_protein_context_df.
del split_protein_context_id_list

In [12]:
# Attach each protein's context id. The join key is protein_id only, while the
# lookup was built per assembly, so a protein_id occurring more than once in the
# lookup would silently duplicate sequence rows. validate='many_to_one' makes
# pandas raise a MergeError in that case instead.
merged_split_seqs = split_seqs.merge(
    split_protein_context_df.rename(columns={'product_accession': 'protein_id'}),
    how='inner',
    on='protein_id',
    validate='many_to_one',
)

In [13]:
# Rows assigned to the 'test' split.
is_test_split = merged_split_seqs['split'].eq('test')
test_seqs = merged_split_seqs.loc[is_test_split]

In [14]:
# Class balance of the test split: number of rows per value of the defense flag.
test_seqs['defense'].value_counts()

defense
False    166243
True      10128
Name: count, dtype: int64

In [15]:
# Test labels: sequence id together with its defense flag.
test_y = test_seqs.loc[:, ['seq_id', 'defense']]

In [16]:
# Write the test sequences as FASTA: a '>seq_id' header line followed by the
# sequence on the next line. Iterating the two needed columns directly avoids
# building a Series for every row, as iterrows does.
with open('../data/interim/test_seqs.faa', 'w') as f:
    for test_seq_id, test_seq in zip(test_seqs['seq_id'], test_seqs['seq']):
        print('>' + test_seq_id, file=f)
        print(test_seq, file=f)

### Make summary of test sequences for interpretability

In [18]:
%%time
seq_id_names = pd.read_parquet('../data/interim/seq_id_names.pq')
test_seq_names = seq_id_names[seq_id_names['seq_id'].isin(test_y['seq_id'])]
del seq_id_names

CPU times: user 42.3 s, sys: 9.63 s, total: 51.9 s
Wall time: 46.7 s


In [19]:
# Mapping between protein accession and sequence id for the test split.
test_seq_accessions = test_seqs.loc[:, ['protein_id', 'seq_id']]

In [20]:
%%time
seq_id_go = pd.read_parquet('../data/interim/seq_id_go_processes.pq')

CPU times: user 5.26 s, sys: 1.29 s, total: 6.55 s
Wall time: 5.86 s


In [21]:
# Restrict the GO annotations to test sequences, then collapse them to one row
# per seq_id with the go_process values joined by ' | '.
filtered_seq_id_go = seq_id_go.loc[seq_id_go['seq_id'].isin(test_y['seq_id'])]
agg_seq_id_go = (
    filtered_seq_id_go
    .groupby('seq_id')['go_process']
    .agg(' | '.join)
    .reset_index()
)

In [22]:
# Load the COG table and keep a two-column view, with 'name' exposed as
# 'cog_name' so it stays unambiguous after merging.
seq_id_cogs = pd.read_parquet('../data/interim/cog_filtered_seq_ids.pq')
breif_seq_id_cogs = (
    seq_id_cogs
    .loc[:, ['seq_id', 'name']]
    .rename(columns=dict(name='cog_name'))
)

In [24]:
# Per-test-sequence summary: identifiers plus names (inner join, so sequences
# without a name row are dropped), with GO processes and COG names attached
# where available (left joins).
test_seq_names_go = (
    test_seqs.loc[:, ['seq_id', 'protein_id', 'protein_context_id', 'gene_name']]
    .merge(test_seq_names, on='seq_id', how='inner')
    .merge(agg_seq_id_go, on='seq_id', how='left')
    .merge(breif_seq_id_cogs, on='seq_id', how='left')
)

In [25]:
# Sanity check: occurrences of each seq_id in the merged table. value_counts
# sorts descending, so a top count of 1 means no seq_id is duplicated.
merged_split_seqs['seq_id'].value_counts()

seq_id
00000031de6e3b5adb7f0ecb501bce05d5aebd9f5c2d64450793600a    1
f63e89707225e018917de6ed481b733441544ece8c365878532c3b3d    1
79e60e2e29cdcf2fb1fb5a1059206c7dafaa7f6ad85389e8c0f9b690    1
74eca015f055342d723fbf237cdaec88a72107fb52dfafa7e9ae5290    1
74ec909c4e61fcba356811bf8538a016ac03defddf3f1c9733b55d9e    1
                                                           ..
2972aa3fb1bc10442dd07f5050fa83606a225f7d8f5a17f19244ae54    1
a4b5599b00850fbbaec60596d696b20df66806c3f33aaa07224ac3af    1
82582bc87d9389ba8ccf31d190003c01d486c9f20af1f3b086acc6c9    1
29727fcbad31c2aa6a71cdab4b24e70217350c9fd64f1d8b3e11aa0c    1
418966b67b02cb9cd6c4c94194c3e44ecb3751e1a3aba2f2d8e5c834    1
Name: count, Length: 1997942, dtype: int64

In [26]:
# Persist the results without the DataFrame index: the test-sequence summary,
# the test labels, and the full merged sequence table.
test_seq_names_go.to_parquet(path='../data/interim/test_seq_names_processes.pq',
                             index=False)
test_y.to_csv(path_or_buf='../data/interim/test_y.csv',
              index=False)
merged_split_seqs.to_parquet(path='../data/interim/model_seqs.pq',
                             index=False)