In [1]:
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import os

In [2]:
model_protein_neighbors = pd.read_parquet('../data/interim/model_protein_neighbors_seq_ids.pq')

In [3]:
model_protein_neighbors['center_seq_id'].nunique()

1997942

In [4]:
nt_features = ['scaled_A_frac', 'scaled_C_frac', 'scaled_T_frac',
                'scaled_G_frac', 'scaled_AA_frac', 'scaled_AC_frac', 'scaled_AT_frac',
                'scaled_AG_frac', 'scaled_CA_frac', 'scaled_CC_frac', 'scaled_CT_frac',
                'scaled_CG_frac', 'scaled_TA_frac', 'scaled_TC_frac', 'scaled_TT_frac', 
                'scaled_TG_frac', 'scaled_GA_frac', 'scaled_GC_frac', 'scaled_GT_frac',
                'scaled_GG_frac']

In [41]:
%%time
if 'refseq_gc_content_breif.pq' in os.listdir('../data/interim/'):
    gene_context_info = pd.read_parquet('../data/interim/refseq_gc_content_breif.pq')
else:
    gene_context_info = pd.read_csv('../data/interim/refseq_gc_content.csv', 
                                    names=['assembly', 'genomic_locus', 'protein_id', 'start', 'end', 'strand', 'gc_frac', 'scaled_gc_frac', 'A_frac', 'C_frac', 'T_frac', 'G_frac', 'AA_frac', 'AC_frac', 'AT_frac', 'AG_frac', 'CA_frac', 'CC_frac', 'CT_frac', 'CG_frac', 'TA_frac', 'TC_frac', 'TT_frac', 'TG_frac', 'GA_frac', 'GC_frac', 'GT_frac', 'GG_frac', 'scaled_A_frac', 'scaled_C_frac', 'scaled_T_frac', 'scaled_G_frac', 'scaled_AA_frac', 'scaled_AC_frac', 'scaled_AT_frac', 'scaled_AG_frac', 'scaled_CA_frac', 'scaled_CC_frac', 'scaled_CT_frac', 'scaled_CG_frac', 'scaled_TA_frac', 'scaled_TC_frac', 'scaled_TT_frac', 'scaled_TG_frac', 'scaled_GA_frac', 'scaled_GC_frac', 'scaled_GT_frac', 'scaled_GG_frac']
                                   )
    breif_gene_context_info = gene_context_info[['assembly', 'genomic_locus', 'protein_id', 'start', 'end', 'strand', 
                                             'scaled_gc_frac'] + nt_features]
    breif_gene_context_info.to_parquet('../data/interim/refseq_gc_content_breif.pq', index=False)
    del gene_context_info
    gene_context_info = breif_gene_context_info

CPU times: user 38 s, sys: 16.2 s, total: 54.2 s
Wall time: 29.2 s


In [6]:
gene_context_info['protein_context_id'] = (gene_context_info['protein_id'] + '|' + 
                                           gene_context_info['genomic_locus'] + '|' +
                                           gene_context_info['start'].astype(str) + '|' +
                                           gene_context_info['strand'])

In [7]:
model_unique_context_ids = model_protein_neighbors['protein_context_id'].drop_duplicates()

In [8]:
filtered_gene_context_info = gene_context_info[gene_context_info['protein_context_id'].isin(model_unique_context_ids)].copy()
filtered_gene_context_info['len'] = filtered_gene_context_info['end'] - filtered_gene_context_info['start']
filtered_gene_context_info = filtered_gene_context_info[['protein_context_id', 'scaled_gc_frac', 'len', 'strand', 'start', 'end'] +
                                                        nt_features]

In [9]:
del gene_context_info

In [None]:
(~model_unique_context_ids.isin(filtered_gene_context_info['protein_context_id'])).sum()

1

In [38]:
model_unique_context_ids[~model_unique_context_ids.isin(filtered_gene_context_info['protein_context_id'])]

protein_accession
WP_198061853.1    WP_198061853.1|NZ_CP065856.1|4158049|+
Name: protein_context_id, dtype: object

In [46]:
gene_context_info[(gene_context_info['genomic_locus'] == 'NZ_CP065856.1') & 
                  (gene_context_info['start'] == '4158049')]

Unnamed: 0,assembly,genomic_locus,protein_id,start,end,strand,scaled_gc_frac,scaled_A_frac,scaled_C_frac,scaled_T_frac,...,scaled_CT_frac,scaled_CG_frac,scaled_TA_frac,scaled_TC_frac,scaled_TT_frac,scaled_TG_frac,scaled_GA_frac,scaled_GC_frac,scaled_GT_frac,scaled_GG_frac


In [None]:
filtered_gene_context_info

In [40]:
filtered_gene_context_info.head()

Unnamed: 0,protein_context_id,scaled_gc_frac,len,strand,start,end,scaled_A_frac,scaled_C_frac,scaled_T_frac,scaled_G_frac,...,scaled_CT_frac,scaled_CG_frac,scaled_TA_frac,scaled_TC_frac,scaled_TT_frac,scaled_TG_frac,scaled_GA_frac,scaled_GC_frac,scaled_GT_frac,scaled_GG_frac
246,NP_414787.4|NC_000913.3|268097|-,1.147127,863,-,268097,268960,-1.179943,0.818835,-0.29451,1.058365,...,0.118471,0.349002,-0.232652,0.414369,-1.014091,0.47056,-0.496978,0.538086,1.30785,0.745131
247,NP_414788.1|NC_000913.3|269289|-,-0.539899,893,-,269289,270182,0.244777,-0.042509,0.486051,-0.82383,...,0.782311,-1.170612,0.125329,-0.868289,0.898446,0.662835,0.512024,-0.390771,-0.53754,-1.116827
248,NP_414790.1|NC_000913.3|270603|+,-0.950725,1151,+,270603,271754,1.820526,-0.530059,-0.698558,-1.018301,...,0.469721,-1.150382,0.599562,0.309877,-0.64658,-1.699179,1.32817,-1.285362,-1.316926,-0.879074
249,YP_009518741.1|NC_000913.3|272847|+,0.709983,1145,+,272847,273992,0.038834,0.60161,-1.042723,0.564981,...,0.14201,-0.203262,-0.7243,0.572475,-1.353125,-0.34294,0.527487,0.059789,-0.72898,0.859126
250,NP_414793.2|NC_000913.3|274101|-,0.806973,980,-,274101,275081,0.382228,1.375633,-1.563398,-0.015032,...,-0.747733,-0.766212,-1.646724,0.171832,-1.285847,-0.590055,0.651771,0.539759,-1.735407,0.313406


In [11]:
merged_model_protein_neighbors = (model_protein_neighbors
                                  .merge(filtered_gene_context_info, how='left', on='protein_context_id'))

In [12]:
def get_protein_dist(center_seq_id, context_df):
    out_dict = {'center_seq_id': center_seq_id}
    context_df = (context_df.sort_values('start', ascending=True)
                  .dropna()
                  .reset_index(drop=True))
    if len(context_df):
        prev_end = context_df.loc[0, 'end']
        prev_pos = context_df.loc[0, 'relative_position']
        for _, row in context_df.iloc[1:, :].iterrows():
            curr_start = row['start']
            curr_pos = row['relative_position']
            if abs(curr_pos - prev_pos) != 1: # missing value
                break
            sep = curr_start - prev_end
            pos_key = ':'.join([str(x) for x in sorted([curr_pos, prev_pos])])
            out_dict['dist_' + pos_key] = sep
            prev_end =  row['end']
            prev_pos = row['relative_position']
    return out_dict
    

In [13]:
protein_dist_list = Parallel(n_jobs=40)(delayed(get_protein_dist)(center_seq_id, context_df) for center_seq_id, context_df in 
                                        tqdm(merged_model_protein_neighbors.groupby('center_seq_id'), 
                                             total=merged_model_protein_neighbors['center_seq_id'].nunique(), 
                                             position=0))

100%|██████████| 1997942/1997942 [07:58<00:00, 4173.28it/s]


In [14]:
protein_dist_df = pd.DataFrame(protein_dist_list)

We'll fill na values with 70

In [15]:
protein_dist_df.drop(columns='center_seq_id').median(axis=0)

dist_-2:-1    74.0
dist_-1:0     90.0
dist_0:1      47.0
dist_1:2      64.0
dtype: float64

In [16]:
dist_fill_value = 70

In [17]:
center_strand = merged_model_protein_neighbors.loc[merged_model_protein_neighbors['relative_position'] == 0, 
                                                   ['center_seq_id', 'strand']].rename(columns={'strand': 'center_strand'})
merged_model_protein_neighbors = merged_model_protein_neighbors.merge(center_strand, how='inner', 
                                                                      on='center_seq_id')
merged_model_protein_neighbors['co_directional'] = (merged_model_protein_neighbors['center_strand'] == 
                                                    merged_model_protein_neighbors['strand']).astype(int)

In [18]:
merged_model_protein_neighbors

Unnamed: 0,seq_id,relative_position,protein_context_id,center_seq_id,scaled_gc_frac,len,strand,start,end,scaled_A_frac,...,scaled_TA_frac,scaled_TC_frac,scaled_TT_frac,scaled_TG_frac,scaled_GA_frac,scaled_GC_frac,scaled_GT_frac,scaled_GG_frac,center_strand,co_directional
0,589f03f456b3b2319598ba16c39b02ea0e93b4938b9323...,2,NP_052604.1|NC_002127.1|413|+,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...,0.135089,323.0,+,413.0,736.0,0.421245,...,0.171431,-0.448141,-0.643725,-0.772463,1.288548,-0.233139,0.148627,0.796850,-,0
1,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...,1,NP_052605.1|NC_002127.1|971|-,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...,-3.947183,380.0,-,971.0,1351.0,3.625920,...,2.678686,0.372474,1.632903,-2.663052,0.184258,-2.735342,-2.097031,-2.842932,-,1
2,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...,0,NP_052606.1|NC_002127.1|1348|-,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...,-2.902183,1040.0,-,1348.0,2388.0,3.458894,...,2.203730,-0.197443,-0.082798,-1.523080,0.673516,-2.240322,-2.398005,-2.064829,-,1
3,589f03f456b3b2319598ba16c39b02ea0e93b4938b9323...,1,NP_052604.1|NC_002127.1|413|+,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...,0.135089,323.0,+,413.0,736.0,0.421245,...,0.171431,-0.448141,-0.643725,-0.772463,1.288548,-0.233139,0.148627,0.796850,-,0
4,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...,0,NP_052605.1|NC_002127.1|971|-,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...,-3.947183,380.0,-,971.0,1351.0,3.625920,...,2.678686,0.372474,1.632903,-2.663052,0.184258,-2.735342,-2.097031,-2.842932,-,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9749078,30f067f4d1e036902d730fa8ea0e98cab8d96a0fe435dc...,-2,YP_501450.1|NC_007795.1|2774450|-,4757f91c12528bc49e87117a35076b8f6684629eda0133...,-1.433362,560.0,-,2774450.0,2775010.0,0.797855,...,0.465510,0.145591,0.582507,-0.776712,1.055644,-1.321942,-0.940149,-0.924618,+,0
9749079,c1a6790679e24a1e96ea218374c0d870b87556ea529d0b...,-1,YP_501451.1|NC_007795.1|2775174|+,4757f91c12528bc49e87117a35076b8f6684629eda0133...,0.420774,1238.0,+,2775174.0,2776412.0,-0.936401,...,-0.042308,0.827644,0.507769,-0.275105,-0.025146,0.221240,-0.236492,0.158719,+,1
9749080,4757f91c12528bc49e87117a35076b8f6684629eda0133...,0,YP_501452.1|NC_007795.1|2776376|+,4757f91c12528bc49e87117a35076b8f6684629eda0133...,-0.384378,305.0,+,2776376.0,2776681.0,-0.664195,...,0.448341,0.582063,0.850915,-0.155798,0.111216,-0.114869,-0.570239,-0.604577,+,1
9749081,4fe300ea46bf737048a5e2bd913d8a03f2b43e2930bcfb...,1,YP_501453.1|NC_007795.1|2776678|+,4757f91c12528bc49e87117a35076b8f6684629eda0133...,-0.993582,872.0,+,2776678.0,2777550.0,1.017254,...,0.286746,0.167527,-0.390277,-0.417332,-0.010915,-0.911288,-1.188981,-0.489098,+,1


In [19]:
median_len = filtered_gene_context_info['len'].median()
median_len

869.0

In [20]:
fill_values = dict()
for feature in nt_features:
    fill_values[feature] = 0

In [21]:
fill_values['scaled_gc_frac'] = 0
fill_values['len'] = median_len
fill_values['dist'] = dist_fill_value
fill_values['co_directional'] = 2

In [22]:
wide_model_protein_neighbors = (merged_model_protein_neighbors.pivot(index='center_seq_id', 
                                                                     values=['scaled_gc_frac', 'len', 'co_directional'] + nt_features, 
                                                                     columns='relative_position'))
wide_model_protein_neighbors.columns = ['_'.join([str(y) for y in x]) for x in wide_model_protein_neighbors.columns.to_flat_index()]
wide_model_protein_neighbors = wide_model_protein_neighbors.merge(protein_dist_df.set_index('center_seq_id'), 
                                                                  how='left', left_index=True, right_index=True)
for column in wide_model_protein_neighbors.columns:
    column_stub = '_'.join(column.split('_')[:-1])
    fill_value = fill_values[column_stub]
    wide_model_protein_neighbors[column] = wide_model_protein_neighbors[column].fillna(fill_value)

In [23]:
wide_model_protein_neighbors

Unnamed: 0_level_0,scaled_gc_frac_-2,scaled_gc_frac_-1,scaled_gc_frac_0,scaled_gc_frac_1,scaled_gc_frac_2,len_-2,len_-1,len_0,len_1,len_2,...,scaled_GT_frac_2,scaled_GG_frac_-2,scaled_GG_frac_-1,scaled_GG_frac_0,scaled_GG_frac_1,scaled_GG_frac_2,dist_-2:-1,dist_-1:0,dist_0:1,dist_1:2
center_seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000031de6e3b5adb7f0ecb501bce05d5aebd9f5c2d64450793600a,-0.291969,-0.808439,-1.166177,-0.262478,0.845258,1364.0,851.0,839.0,524.0,851.0,...,-1.200405,1.794010,-0.515178,-0.341655,1.601801,-0.344099,200.0,356.0,239.0,284.0
000003c6658a2cf3fe4e739e0a641f624338811ec0e55e5a7509fc86,-0.956352,-2.254873,0.073184,0.293719,0.863756,167.0,347.0,605.0,629.0,611.0,...,-1.486449,3.629988,0.435936,-0.420555,0.909974,0.451593,314.0,132.0,29.0,8.0
0000085725b772ff8734798556e9317c38b3105bee0df167bf6cace0,-0.800718,-0.709763,-1.685457,-1.873131,-1.574010,290.0,182.0,683.0,1670.0,2291.0,...,-0.603746,-0.690089,-0.152412,-0.512570,-1.293962,-0.956345,1292.0,595.0,102.0,2.0
00000b041f06ee25cf204d5168238a080dc196974ff162ec2e0416ed,0.889862,0.818415,-0.027893,-1.085253,0.344178,2663.0,998.0,1181.0,446.0,2060.0,...,-0.376070,0.416345,0.236975,0.350606,-1.068934,0.038963,-12.0,821.0,55.0,0.0
00000c3015ae18f4115e77c81047159441e4c6322058d16ff24071f9,0.427975,0.534383,0.718382,0.139919,-0.476428,431.0,1091.0,1322.0,683.0,833.0,...,-1.106032,0.373060,0.827793,0.214715,1.121724,0.224208,-3.0,189.0,224.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffffdafc717268abe13a393cf8b3bb709f2119d1a138396b4076bcc0,0.972729,0.562742,0.670913,1.001866,0.533308,890.0,626.0,356.0,1499.0,1400.0,...,0.617685,0.755903,0.273112,-0.626218,1.268492,0.494462,114.0,15.0,111.0,176.0
ffffeb4edb2788545cc1fb279e6f336ef546103ec33b2cad693054de,0.485771,0.346782,0.405034,1.373829,0.747234,608.0,1007.0,857.0,758.0,770.0,...,-0.309120,-0.620039,-0.422556,-0.523575,0.668382,0.588874,252.0,13.0,13.0,34.0
fffff2d6d419282acd12150546bb3e574f0992816fe1d71d37e9ea19,-1.381990,1.315606,0.666974,1.078907,1.501027,290.0,533.0,383.0,1370.0,638.0,...,0.912425,-1.666129,-0.086662,-0.487186,1.516951,1.703906,435.0,51.0,151.0,101.0
fffffb3e364069d1dd95e7fcebe0ee5635d7747745c303bf91b1224d,0.813223,-0.128049,-0.158994,-0.281792,-0.246723,707.0,2252.0,632.0,482.0,383.0,...,-0.500796,0.413001,-0.427990,-0.744513,-0.578809,-0.403795,150.0,93.0,-3.0,52.0


## append sequence features to protein representations

In [24]:
%%time
train_X_reps = pd.read_parquet('../data/interim/train_X_reps.pq')
test_X_reps = pd.read_parquet('../data/interim/test_X_reps.pq')
val_X_reps = pd.read_parquet('../data/interim/val_X_reps.pq')

CPU times: user 3min 10s, sys: 1min 34s, total: 4min 44s
Wall time: 28.2 s


In [25]:
train_X = (train_X_reps.merge(wide_model_protein_neighbors, how='inner', 
                              left_index=True, right_index=True))

In [26]:
test_X = (test_X_reps.merge(wide_model_protein_neighbors, how='inner', 
                            left_index=True, right_index=True))

In [27]:
val_X = (val_X_reps.merge(wide_model_protein_neighbors, how='inner', 
                            left_index=True, right_index=True))

In [28]:
(val_X.columns == train_X.columns).all()

True

In [29]:
(test_X.columns == train_X.columns).all()

True

In [30]:
(train_X.index == train_X_reps.index).all()

True

In [31]:
(test_X.index == test_X_reps.index).all()

True

In [32]:
(val_X.index == val_X_reps.index).all()

True

In [33]:
train_X.to_parquet('../data/interim/train_X.pq')
test_X.to_parquet('../data/interim/test_X.pq')
val_X.to_parquet('../data/interim/val_X.pq')

In [2]:
train_X = pd.read_parquet('../data/interim/train_X.pq')
train_X.shape

(1744039, 3319)