In [1]:
import pandas as pd
import os
from core import read_mmseqs_results

In [2]:
# Sequence-level table; later cells read its 'seq_id', 'test_fold' and
# 'defensive' columns.
model_seq_info_path = '../data3/interim/model_seq_info.pq'
model_seq_info = pd.read_parquet(model_seq_info_path)

In [3]:
# Path of the MMseqs search output that the next cell passes to
# read_mmseqs_results.
mmseqs_search_out = '../data3/interim/candidate_mmseqs_profile_out.txt'

In [4]:
%%time
# Load the MMseqs search results via the project's `core.read_mmseqs_results`.
# Later cells rely on the result exposing 'query', 'target', 'bits', 'qcov',
# 'tcov' and 'fident' columns.
mmseqs_search_df = read_mmseqs_results(mmseqs_search_out)

CPU times: user 57.4 s, sys: 7.39 s, total: 1min 4s
Wall time: 1min 5s


**Query:** train gene — a sequence whose `seq_id` is *not* in the current fold's held-out test set

**Target:** test gene — a sequence whose `seq_id` is in the current fold's held-out test set

In [5]:
# For each cross-validation fold, keep the single highest-'bits' hit for every
# target in that fold's test set, considering only hits whose query lies
# outside the test set.
top_train_list = []
for fold, fold_test_df in model_seq_info.groupby('test_fold'):
    print(fold)
    held_out_ids = fold_test_df['seq_id']
    query_outside_fold = ~mmseqs_search_df['query'].isin(held_out_ids)
    target_inside_fold = mmseqs_search_df['target'].isin(held_out_ids)
    fold_hits = mmseqs_search_df[query_outside_fold & target_inside_fold]
    # Sort best-first so that head(1) per target picks the top-scoring hit.
    ranked_fold_hits = fold_hits.sort_values('bits', ascending=False)
    top_mmseqs_df = ranked_fold_hits.groupby('target').head(1)
    top_train_list.append(top_mmseqs_df)

0
1
2
3
4


In [6]:
# Stack the per-fold top-hit frames collected in `top_train_list` into one frame.
cat_top_hit = pd.concat(top_train_list)

In [9]:
# Flag top hits that look like train/test leakage: more than 0.8 coverage on
# either the query or the target side, combined with more than 0.3 identity.
high_coverage = cat_top_hit['qcov'].gt(0.8) | cat_top_hit['tcov'].gt(0.8)
above_identity_cutoff = cat_top_hit['fident'].gt(0.3)
leaked_data = cat_top_hit[high_coverage & above_identity_cutoff]

In [10]:
# Attach the sequence metadata twice: once keyed on the query id and once on
# the target id. Columns that collide in the second merge receive the
# '_query' / '_target' suffixes.
query_seq_info = model_seq_info.rename(columns={'seq_id': 'query'})
target_seq_info = model_seq_info.rename(columns={'seq_id': 'target'})
leaked_with_query_info = leaked_data.merge(query_seq_info, how='inner', on='query')
merged_leaked_data = leaked_with_query_info.merge(target_seq_info,
                                                  how='inner', on='target',
                                                  suffixes=('_query', '_target'))

In [11]:
# Widen the column display so the long sequence ids are not truncated, then
# show the alignment statistics next to the 'defensive' flag of each side.
pd.set_option('display.max_colwidth', 100)
leak_summary_columns = ['query', 'target',
                        'qcov', 'tcov', 'fident',
                        'defensive_query', 'defensive_target']
merged_leaked_data[leak_summary_columns]

Unnamed: 0,query,target,qcov,tcov,fident,defensive_query,defensive_target
0,27170489b598945be8f2540c4e3d172e6f206c0411643fcb1b7c0eda,58716cd0f8d8332725ba61f467c8b195330aeb9267176c9a589765db,0.999,0.999,0.449,False,False
1,27170489b598945be8f2540c4e3d172e6f206c0411643fcb1b7c0eda,8d05d5458982cbd3c21f7a0f4b357bfd954940adb13cd5f8a9c2f984,0.887,0.991,0.304,False,False
2,4f7028c7ce85fa28077affb5a49c0127890f15f050c5afa950c1688a,e40f33194ea22f335bc7721cec6b7f55ba7436ce4974d2f7b953770a,1.000,0.557,0.755,False,False
3,8be89a7980cadaae21525e944b3432e536a27e65c261b9b1a3f4a5ea,638b0209643f118a2f5c8e39492941dc9589ca158a3bde9456485224,0.409,0.973,0.512,False,True
4,8c8df95cc8f4fe2a0c50de840ef973d257961bb969aba94b4211119b,80e229a954e8ef84b2f4a6448fb3215c0882976d7c408c34261e5f91,0.558,0.992,0.792,False,False
...,...,...,...,...,...,...,...
13795,1f462cd498bbba2b85eec400c5d0eb3019247195442b99d875ac9947,f153e0bed5b527a0e3278bb3a36e5f2e21b008db09212eb55081f64a,0.434,0.820,0.465,False,False
13796,c3b78e74963b024dbbe4361676fd69be800c82b44145dbf688218780,ee5220395385dcd4c16ab996ae129ff8d032cc5bf5f70e3daa1a3268,0.588,0.862,0.369,False,False
13797,c0ac1bee470bdc822b8eb29d68da1280459e7aa4593ea1a4b3a5678d,1e0f44509c7ea964fc144cf9093ec35fc6d823237a0f227c24415c60,0.625,0.816,0.500,False,False
13798,f0fd02ae5b7ab81746e50289f062c3350ce059a6fddf32339c41406b,8490f668bc0c9f31e4b18b89064a3c66fcda5fb07d50a06aae6641c3,0.764,0.854,0.465,False,False


In [12]:
# Nearest-neighbour score: the top hit's 'bits' value, signed by the query's
# 'defensive' flag (flag * 2 - 1 maps True -> +1 and False -> -1).
query_labels = (model_seq_info[['seq_id', 'defensive']]
                .rename(columns={'seq_id': 'query',
                                 'defensive': 'query_defensive'}))
merged_cat_top_hit = (
    cat_top_hit[['query', 'target', 'bits']]
    .merge(query_labels, how='inner', on='query')
    .assign(direction=lambda hits: hits['query_defensive'] * 2 - 1,
            prediction=lambda hits: hits['bits'] * hits['direction'])
)

In [13]:
# One row per entry of model_seq_info: the left join keeps sequences that have
# no top hit, and those get a prediction of 0.
target_predictions = (merged_cat_top_hit[['target', 'prediction']]
                      .rename(columns={'target': 'seq_id'}))
out_df = (
    model_seq_info[['seq_id']]
    .merge(target_predictions, how='left', on='seq_id')
    .assign(prediction=lambda preds: preds['prediction'].fillna(0),
            method='MMseqs profile nearest neighbor')
)

In [14]:
# Persist the per-sequence predictions without writing the index.
cv_predictions_path = '../data3/interim/cv_predictions_mmseqs_profile.pq'
out_df.to_parquet(cv_predictions_path, index=False)