In [2]:
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

In [3]:
def get_gene_dist(center_seq_id, context_df):
    """Compute the gaps between consecutive genes around one center gene.

    Rows are ordered by ``relative_position`` -- ascending when the center
    gene (the row with ``relative_position == 0``) is on the '+' strand,
    descending otherwise.  For every adjacent pair in that order the gap is
    the next row's ``start`` minus the current row's ``end``.

    If any row has ``end < start`` (flagged as a wraparound), a pair whose
    second row is such a row gets ``-end`` of the first row instead.
    NOTE(review): confirm this is the intended distance across the origin.

    Parameters
    ----------
    center_seq_id
        Identifier copied verbatim into the result.
    context_df : pandas.DataFrame
        Needs the columns ``relative_position``, ``strand``, ``start`` and
        ``end`` and exactly one row with ``relative_position == 0``
        (``.item()`` raises otherwise).  The frame is not modified.

    Returns
    -------
    dict
        ``{'center_seq_id': ..., 'dist_<lo>:<hi>': gap, ...}`` with one entry
        per adjacent pair, ``lo``/``hi`` being the smaller/larger relative
        position of the pair.
    """
    is_center = context_df['relative_position'] == 0
    center_strand = context_df.loc[is_center, 'strand'].item()

    # Ascending for a '+' center, descending for anything else.
    ordered = context_df.sort_values('relative_position',
                                     ascending=bool(center_strand == '+'))

    ends = ordered['end']
    following_starts = ordered['start'].shift(-1)
    wraps = ordered['end'] < ordered['start']
    if wraps.any():
        # Shifted flags line up "does the NEXT gene wrap?" with each row.
        following_wraps = wraps.shift(-1)
        gaps = np.where(following_wraps,
                        -ends,
                        following_starts - ends)
    else:
        gaps = following_starts - ends
    gaps = list(gaps)

    positions = ordered['relative_position'].to_list()
    out_dict = {'center_seq_id': center_seq_id}
    # zip stops at the shorter positions[1:], so the trailing gap (which has
    # no following gene) is never emitted.
    for gap, pos_i, pos_j in zip(gaps, positions, positions[1:]):
        out_dict[f'dist_{min(pos_i, pos_j)}:{max(pos_i, pos_j)}'] = gap
    return out_dict

In [4]:
# Number of representation columns; they are labelled ft1 .. ft<n_feats>.
n_feats = 640
feat_names = [f'ft{idx}' for idx in range(1, n_feats + 1)]

In [5]:
# Neighbor table: used below with the columns relative_position,
# center_seq_id, seq_id, protein_context_id, seq, strand, start and end.
model_protein_neighbors = pd.read_parquet('../data3/interim/model_neighbors_seq_ids.pq')
# Representation table; its index is joined against seq_id further down.
model_rep_df = pd.read_parquet('../data3/interim/model_neighbor_representations.pq')
# Relabel the columns ft1..ft<n_feats>; raises if the column count differs from n_feats.
model_rep_df.columns = feat_names

In [6]:
%%time
# Per-gene table providing protein_id, genomic_locus, start, strand and the
# scaled nucleotide-fraction columns used below (slow: ~1 min per the recorded timing).
# NOTE(review): this reads from ../data/ while the other inputs come from ../data3/ -- confirm intended.
gene_context_info = pd.read_parquet('../data/interim/refseq_gc_content_breif.pq')

CPU times: user 39.2 s, sys: 21.5 s, total: 1min
Wall time: 59.3 s


## Get representation features

In [7]:
# Attach each neighbor's representation (matched on seq_id against the
# representation index), then spread to one row per center gene with one
# column per (feature, relative_position).  NaNs left by the pivot, e.g.
# positions a center has no row for, become 0.
wide_rep_df = (
    pd.merge(model_protein_neighbors[['relative_position', 'center_seq_id', 'seq_id']],
             model_rep_df,
             how='inner', left_on='seq_id', right_index=True)
    .pivot(index='center_seq_id', columns='relative_position', values=feat_names)
    .fillna(0)
)
# Flatten the (feature, position) MultiIndex into names like 'ft1_-2'.
wide_rep_df.columns = [f'{feat}_{pos}' for feat, pos in wide_rep_df.columns]

In [8]:
# The long-format representations are no longer needed once wide_rep_df exists; drop the reference to free memory.
del model_rep_df

## Get nt features

### Feature class 1: nucleotide

In [9]:
# Composite key 'protein_id|genomic_locus|start|strand', added in place, so rows can be
# matched against model_protein_neighbors['protein_context_id'].
# NOTE(review): presumably the neighbor table builds its key the same way -- verify upstream.
gene_context_info['protein_context_id'] = (gene_context_info['protein_id'] + '|' + 
                                           gene_context_info['genomic_locus'] + '|' +
                                           gene_context_info['start'].astype(str) + '|' +
                                           gene_context_info['strand'])

In [10]:
# Distinct context ids referenced by the neighbor table (a Series, first occurrences kept).
model_unique_context_ids = model_protein_neighbors['protein_context_id'].drop_duplicates()

In [11]:
# Nucleotide-composition columns: overall GC fraction, the four single-base
# fractions, then all 16 dinucleotide fractions, with bases in A, C, T, G order.
nt_features = (
    ['scaled_gc_frac']
    + [f'scaled_{base}_frac' for base in 'ACTG']
    + [f'scaled_{first}{second}_frac' for first in 'ACTG' for second in 'ACTG']
)

In [12]:
# Keep only the gene rows whose context id is referenced by the neighbor table.
filtered_gene_context_info = gene_context_info.loc[
    gene_context_info['protein_context_id'].isin(model_unique_context_ids)
]

In [13]:
# Only the filtered subset is used from here on; drop the reference to the full table to free memory.
del gene_context_info

In [14]:
# Sanity check: number of model context ids with no row in the filtered gene table
# (the recorded output of 0 means every id was found).
(~model_unique_context_ids.isin(filtered_gene_context_info['protein_context_id'])).sum()

0

In [15]:
# Attach the nucleotide-composition columns to every neighbor row via the
# composite context id (inner join: rows without a match are dropped).
merged_model_protein_neighbors = pd.merge(
    model_protein_neighbors,
    filtered_gene_context_info[['protein_context_id'] + nt_features],
    how='inner',
    on='protein_context_id',
)

In [16]:
# One row per center gene, one column per (nt feature, relative_position).
# NaNs left by the pivot are filled with the sentinel 1.1
# (presumably outside the range of the scaled fractions -- TODO confirm).
wide_nt_df = (
    merged_model_protein_neighbors
    .pivot(index='center_seq_id', columns='relative_position', values=nt_features)
    .fillna(1.1)
)
# Flatten the (feature, position) MultiIndex into names like 'scaled_gc_frac_-2'.
wide_nt_df.columns = [f'{feat}_{pos}' for feat, pos in wide_nt_df.columns]

### Feature class 2: len

In [17]:
# Copy of the merged neighbor table with a 'len' column holding the
# character length of each row's 'seq' string.
len_df = merged_model_protein_neighbors.assign(
    len=merged_model_protein_neighbors['seq'].str.len()
)


In [18]:
# One row per center gene, one 'len_<position>' column per relative position;
# NaNs left by the pivot are filled with 0.
wide_len_df = (
    len_df
    .pivot(index='center_seq_id', columns='relative_position', values=['len'])
    .fillna(0)
)
wide_len_df.columns = [f'{feat}_{pos}' for feat, pos in wide_len_df.columns]

### Feature class 3: strand

In [19]:
# Strand of each center gene (the rows with relative_position == 0), keyed by center_seq_id.
center_strand = (
    merged_model_protein_neighbors
    .loc[merged_model_protein_neighbors['relative_position'] == 0,
         ['center_seq_id', 'strand']]
    .rename(columns={'strand': 'center_strand'})
)
# Give every neighbor row its center's strand, then flag rows on the same
# strand as their center: 1 = same strand, 0 = different strand.
strand_df = pd.merge(merged_model_protein_neighbors, center_strand,
                     how='inner', on='center_seq_id')
strand_df['co_directional'] = strand_df['center_strand'].eq(strand_df['strand']).astype(int)


In [20]:
# One row per center gene, one 'co_directional_<position>' column per relative
# position.  NaNs left by the pivot are filled with 2, a value distinct from
# the 0/1 flags.
wide_strand_df = (
    strand_df
    .pivot(index='center_seq_id', columns='relative_position', values=['co_directional'])
    .fillna(2)
)
wide_strand_df.columns = [f'{feat}_{pos}' for feat, pos in wide_strand_df.columns]

### Feature class 4: distance

In [21]:
# Run get_gene_dist once per center gene (one groupby group each) across
# 48 worker jobs; tqdm reports progress as groups are consumed.
protein_dist_list = Parallel(n_jobs=48)(
    delayed(get_gene_dist)(seq_id, neighborhood)
    for seq_id, neighborhood in tqdm(
        model_protein_neighbors.groupby('center_seq_id'),
        total=model_protein_neighbors['center_seq_id'].nunique(),
        position=0,
    )
)

100%|██████████| 200458/200458 [01:08<00:00, 2911.07it/s]


In [22]:
# One row per center gene; dict keys become columns, and keys missing from a dict become NaN.
protein_dist_df = pd.DataFrame(protein_dist_list)

In [23]:
# Smallest observed gap per distance column; used to choose a fill value below the real range.
protein_dist_df.drop(columns='center_seq_id').min(axis=0)

dist_-2:-1   -119.0
dist_-1:0    -119.0
dist_0:1     -119.0
dist_1:2     -118.0
dtype: float64

In [24]:
# Fill missing gaps with -200, below the smallest observed gap (-119 in the
# recorded output above), then index by center gene.
# NOTE: not idempotent -- re-running fails because 'center_seq_id' is no longer a column.
protein_dist_df = (protein_dist_df.fillna(-200)
                   .set_index('center_seq_id'))

## Merge features

In [25]:
# Combine all feature groups on their shared center_seq_id index; inner joins
# keep only centers present in every group.
feature_df = wide_rep_df
for df in (wide_nt_df, wide_len_df, wide_strand_df, protein_dist_df):
    feature_df = pd.merge(feature_df, df,
                          how='inner', left_index=True, right_index=True)

In [26]:
# (rows, columns) of the assembled feature matrix.
feature_df.shape

(200458, 3319)

In [27]:
# Show the assembled feature matrix (the rendering is truncated by pandas).
feature_df

Unnamed: 0_level_0,ft1_-2,ft1_-1,ft1_0,ft1_1,ft1_2,ft2_-2,ft2_-1,ft2_0,ft2_1,ft2_2,...,len_2,co_directional_-2,co_directional_-1,co_directional_0,co_directional_1,co_directional_2,dist_-2:-1,dist_-1:0,dist_0:1,dist_1:2
center_seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000003c6658a2cf3fe4e739e0a641f624338811ec0e55e5a7509fc86,0.019829,-0.105371,-0.064539,-0.092942,-0.092769,-0.056798,-0.032411,-0.103213,0.066013,-0.019033,...,203.0,1.0,1.0,1.0,0.0,0.0,314.0,132.0,29.0,8.0
0000443dc15d579934c57e4b2b1533f58a44a3040003e88cf890415d,-0.064863,-0.098375,-0.171987,-0.201704,-0.109224,-0.118126,-0.027895,-0.069349,0.008060,0.075848,...,59.0,1.0,1.0,1.0,1.0,1.0,76.0,61.0,119.0,268.0
00011a6f43ddef04b38ddb80e079e995128c758b8d0fef221575fc14,-0.136331,-0.069856,-0.048419,-0.121671,-0.154427,-0.002103,-0.036254,0.075202,-0.007625,0.014461,...,314.0,0.0,0.0,1.0,0.0,1.0,562.0,1570.0,389.0,1208.0
000162b2e47bdb67d2b05159f2278de59944c6c480a62208357ff9d3,-0.084717,-0.016160,-0.100061,-0.074617,-0.036009,-0.121645,-0.061003,-0.060410,0.075180,-0.053403,...,241.0,1.0,1.0,1.0,0.0,0.0,2.0,32.0,-5.0,140.0
0001b651260c631da96f5be3bb6369d341fa33136e72727651535c00,-0.204649,-0.165588,-0.143960,-0.127274,-0.134224,0.129246,0.018843,0.032963,0.107930,-0.034815,...,288.0,1.0,1.0,1.0,1.0,1.0,302.0,16.0,153.0,138.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffe22c006b65defec0d43d82f17b7a57e3b3e75e64b5b92dcc63643,-0.065802,-0.071812,-0.040645,-0.074758,0.077448,-0.043128,-0.069755,-0.123530,0.013845,-0.196845,...,920.0,1.0,0.0,1.0,1.0,1.0,97.0,277.0,59.0,98.0
fffeb518840157c0677fc2a6f41bc7ee8ba36ce36bae0e18c0862165,0.031747,0.014191,-0.028699,-0.049381,0.006482,-0.122986,-0.156735,-0.084740,-0.116834,-0.115845,...,244.0,0.0,0.0,1.0,0.0,1.0,-3.0,-92.0,7.0,410.0
fffeda7da5ec5628b6fa79bed90c49590dd958fcd3e3cfa68bd71b91,-0.169804,-0.009544,-0.084809,-0.088796,-0.161447,0.043151,-0.086960,-0.009640,-0.023616,-0.039633,...,161.0,1.0,0.0,1.0,0.0,0.0,792.0,738.0,520.0,-3.0
ffff06672262ded10c08cf9c69483eeb988a4f325806abf4b19e8691,-0.088125,-0.083768,-0.058371,-0.107058,-0.045431,-0.013914,0.048164,0.019016,-0.149642,-0.104616,...,358.0,1.0,1.0,1.0,0.0,0.0,57.0,-3.0,39.0,9.0


In [28]:
# Persist the feature matrix; index=True keeps center_seq_id in the file.
feature_df.to_parquet('../data3/interim/defense_predictor_full_ft_mat.pq', index=True)