In [1]:
import os
import shutil
import subprocess

import pandas as pd

from core import read_dom_table

In [2]:
# Per-sequence table that drives the whole notebook; the cells below read its
# `seq_id`, `test_fold` and `defense_gene` columns.
model_seq_info = pd.read_parquet(
    '../data3/interim/model_seq_info.pq'
)

In [3]:
# Directory containing the defense-finder HMM profile files.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine without editing it.
og_hmm_dir = '/home/gridsan/pdeweirdt/.macsyfinder/data/defense-finder-models/profiles/'
# Map profile name (file name up to its first '.') -> full path of the profile.
# Only files whose name *ends* in '.hmm' are kept: a substring test would also
# pick up sidecar files (e.g. hmmpress indexes such as `X.hmm.h3i`, or backups
# such as `X.hmm.bak`), which map to the same key and could silently replace
# the real profile. The listing is sorted so the mapping is deterministic.
og_hmm_paths = {x.split('.')[0]: os.path.join(og_hmm_dir, x)
                for x in sorted(os.listdir(og_hmm_dir))
                if x.endswith('.hmm')}

In [4]:
# Scratch directory for the per-fold concatenated HMM files and hmmsearch output.
working_dir = '../data3/interim/cv_hmm_work/'
# makedirs(exist_ok=True) creates any missing parent directories and is safe to
# re-run, unlike an exists-check followed by os.mkdir.
os.makedirs(working_dir, exist_ok=True)

## Search

In [5]:
%%time
for fold, fold_test_df in model_seq_info.groupby('test_fold'):
    print(fold)
    fold_train_df = model_seq_info[model_seq_info['test_fold'] != fold]
    train_hmms = fold_train_df['defense_gene'].dropna().drop_duplicates()
    train_hmm_paths = [og_hmm_paths[x] for x in train_hmms]
    train_hmm_f = os.path.join(working_dir, 'split_' + str(fold) + '_train.hmm')
    os.system(' '.join(['cat', ' '.join(train_hmm_paths),  '>', train_hmm_f]))
    temp_out_file = os.path.join(working_dir, 'temp_hmmer_out.txt')
    dom_out = os.path.join(working_dir, 'split_' + str(fold) + '_hmm_search_dom.txt')
    test_seq_f = '../data3/interim/model_split_' + str(fold) + '_test_seqs.faa'
    os.system(' '.join(['conda run -n beaker', 
                        'hmmsearch',
                        '--cpu', '40',
                        '-o', temp_out_file,
                        '--domtblout', dom_out,
                        train_hmm_f, test_seq_f]))
    os.system('rm ' + temp_out_file)

0
1
2
3
4
CPU times: user 444 ms, sys: 72.8 ms, total: 517 ms
Wall time: 8min 1s


## Compile

In [6]:
# Accumulator for the per-fold hmmsearch domain tables.
dom_table_list = []

In [7]:
# Read the domain table written for each fold. The list is rebuilt from scratch
# in a single expression so that re-running this cell cannot append the same
# tables a second time.
dom_table_list = [
    read_dom_table(os.path.join(working_dir, 'split_' + str(fold) + '_hmm_search_dom.txt'))
    for fold in model_seq_info['test_fold'].unique()
]

In [8]:
# Stack the per-fold domain tables into one frame (row indices are kept as-is).
cat_dom_tables = pd.concat(objs=dom_table_list)

In [9]:
# Total number of rows across all folds.
len(cat_dom_tables.index)

84881

In [10]:
# Keep, for every target, the single row with the highest `seq_score`:
# order all rows best-first, then take the first row of each target group.
hits_best_first = cat_dom_tables.sort_values(by='seq_score', ascending=False)
top_dom_table = hits_best_first.groupby('target').head(1)

In [11]:
# Best score per target, renamed so it can be joined on `seq_id`.
best_scores = top_dom_table[['target', 'seq_score']].rename(
    columns={'target': 'seq_id', 'seq_score': 'prediction'}
)
# Left-join so every sequence in model_seq_info gets a row; sequences without
# a matching row in the score table receive a prediction of 0.
out_df = (
    model_seq_info[['seq_id']]
    .merge(best_scores, how='left', on='seq_id')
    .assign(
        prediction=lambda merged: merged['prediction'].fillna(0),
        method='Defense HMM search',
    )
)

In [12]:
# Write the predictions table to disk without the row index.
predictions_path = '../data3/interim/cv_predictions_hmm.pq'
out_df.to_parquet(predictions_path, index=False)