In [1]:
import pandas as pd

In [2]:
# Load the two interim tables this notebook works from.
# search_df: pairwise search hits; later cells read its `query`, `target`,
#   `bits`, `qcov`, `tcov` and `fident` columns.
# model_seq_info: one table of modelled sequences; later cells read its
#   `seq_id`, `test_fold` and `defensive` columns.
search_df = pd.read_parquet('../data3/interim/model_seq_blast_search.pq')
model_seq_info = pd.read_parquet('../data3/interim/model_seq_info.pq')

**Query:** train gene — a sequence outside the held-out test fold.

**Target:** test gene — a sequence inside the held-out test fold.

In [3]:
# For every cross-validation fold, keep the single highest-scoring hit
# (largest `bits`) from a training sequence onto each test sequence.
# Result: `top_train_list`, one DataFrame of top hits per fold.
all_seq_ids = model_seq_info['seq_id']
top_train_list = []
for fold, fold_test_df in model_seq_info.groupby('test_fold'):
    print(fold)
    fold_test_ids = fold_test_df['seq_id']
    # Training sequences are the modelled sequences outside this fold.
    # Requiring explicit membership (instead of only "not in the test fold")
    # stops a query that is absent from model_seq_info from being selected as
    # the top hit: such a query has no label, so the later inner merge on
    # `query` would drop it and the target would end up with no prediction
    # even when a labelled training hit exists.
    fold_train_ids = all_seq_ids[~all_seq_ids.isin(fold_test_ids)]
    train_to_test_hits = search_df[search_df['query'].isin(fold_train_ids) &
                                   search_df['target'].isin(fold_test_ids)]
    # Sort by score, then take the first row of each target group, i.e. the
    # best-scoring training hit for that test sequence.
    top_blast_df = (train_to_test_hits
                    .sort_values('bits', ascending=False)
                    .groupby('target')
                    .head(1))
    top_train_list.append(top_blast_df)

0
1
2
3
4


In [4]:
# Stack the per-fold top-hit frames row-wise into a single table.
cat_top_hit = pd.concat(objs=top_train_list, axis=0)

In [5]:
# Select the top hits where at least one of the two coverage values
# (`qcov` or `tcov`) exceeds 0.8 AND `fident` exceeds 0.3.
coverage_ok = (cat_top_hit['qcov'] > 0.8) | (cat_top_hit['tcov'] > 0.8)
identity_ok = cat_top_hit['fident'] > 0.3
leaked_data = cat_top_hit[coverage_ok & identity_ok]

In [6]:
# Attach the model_seq_info columns to each selected hit twice: once for the
# query sequence and once for the target sequence. Columns shared by the two
# joins are disambiguated with the `_query` / `_target` suffixes.
query_info = model_seq_info.rename(columns={'seq_id': 'query'})
target_info = model_seq_info.rename(columns={'seq_id': 'target'})
leaked_with_query_info = pd.merge(leaked_data, query_info,
                                  how='inner', on='query')
merged_leaked_data = pd.merge(leaked_with_query_info, target_info,
                              how='inner', on='target',
                              suffixes=('_query', '_target'))

In [7]:
# Widen the column display so the long sequence ids are not truncated, then
# show the hit pairs alongside their coverage, identity and labels.
leak_display_columns = ['query', 'target', 'qcov', 'tcov', 'fident',
                        'defensive_query', 'defensive_target']
pd.set_option('display.max_colwidth', 100)
merged_leaked_data[leak_display_columns]

Unnamed: 0,query,target,qcov,tcov,fident,defensive_query,defensive_target
0,90722b86e0e71131620cec19bdacc39f959e6ed61186d61420f8cedf,49cb910efe0fa88b16eb2150960766bf90c3ed91298288b8ececf37d,1.001556,0.703663,0.71329,False,False
1,4f7028c7ce85fa28077affb5a49c0127890f15f050c5afa950c1688a,e40f33194ea22f335bc7721cec6b7f55ba7436ce4974d2f7b953770a,1.039216,0.573841,0.48151,False,False
2,8c8df95cc8f4fe2a0c50de840ef973d257961bb969aba94b4211119b,80e229a954e8ef84b2f4a6448fb3215c0882976d7c408c34261e5f91,0.560168,1.006486,0.55747,False,False
3,8c8df95cc8f4fe2a0c50de840ef973d257961bb969aba94b4211119b,115932760f50bf58ddd7bcaf4d99c10b438c22a1b485d484550a2981,0.422984,0.990141,0.47084,False,False
4,8be89a7980cadaae21525e944b3432e536a27e65c261b9b1a3f4a5ea,638b0209643f118a2f5c8e39492941dc9589ca158a3bde9456485224,0.426650,1.015710,0.33849,False,True
...,...,...,...,...,...,...,...
12642,40da6856c44d2fa8c97dbf18813b8ecb21debf4841fd85dee61e5a34,4e0dc2470f41a3a2882b0f8766ecd0a16323958c6e9665fedc69f61a,1.000000,1.000000,0.93750,False,False
12643,40da6856c44d2fa8c97dbf18813b8ecb21debf4841fd85dee61e5a34,b867ee1259209996197fcb2216a6adba8717f24c8af79a51a5d1d69c,1.000000,1.000000,0.93750,False,False
12644,20c026cc2a019c6de31dcdb8c50f5d1d8c88dc076ca3607e365c69cc,7b419d0e4de2f4290e896d76cee82bf3befa17f28c814ca9c8495dd8,0.909091,0.645161,0.35000,False,False
12645,90e55631988ee03ce42ce17cd80752f4adcc1c810b69a77de7180c2d,2fa5593ff348a88dec9892d3b06a0aae5a40b7ad823fcdc7184fe622,1.000000,1.000000,0.93750,False,False


In [8]:
# Label every top hit with the `defensive` flag of its query sequence and turn
# the bit score into a signed prediction.
query_labels = (model_seq_info[['seq_id', 'defensive']]
                .rename(columns={'seq_id': 'query',
                                 'defensive': 'query_defensive'}))
merged_cat_top_hit = (
    pd.merge(cat_top_hit[['query', 'target', 'bits']], query_labels,
             how='inner', on='query')
    .assign(
        # For a boolean flag this maps True -> 1 and False -> -1.
        direction=lambda hits: hits['query_defensive'] * 2 - 1,
        # Signed score: positive when the query is flagged defensive,
        # negative otherwise; the magnitude is the bit score.
        prediction=lambda hits: hits['bits'] * hits['direction'],
    )
)

In [9]:
# Build the output table: one row per sequence in model_seq_info with the
# signed prediction of its top hit. Sequences without a hit get 0.
target_predictions = (merged_cat_top_hit[['target', 'prediction']]
                      .rename(columns={'target': 'seq_id'}))
out_df = pd.merge(model_seq_info[['seq_id']], target_predictions,
                  how='left', on='seq_id')
out_df = out_df.assign(prediction=out_df['prediction'].fillna(0),
                       method='BLAST nearest neighbor')

In [10]:
# Persist the predictions table; the DataFrame index is not written.
out_df.to_parquet(path='../data3/interim/cv_predictions_blast.pq', index=False)