In [1]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

In [2]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; every slice holds ``n`` items except
    possibly the last one, which holds whatever is left over. ``lst`` only
    needs to support ``len()`` and slicing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [3]:
# Neighbor table; later cells use its 'center_seq_id', 'seq_id' and
# 'relative_position' columns.
neighbors_path = '../data/interim/model_protein_neighbors_seq_ids.pq'
model_protein_neighbors = pd.read_parquet(neighbors_path)

In [4]:
# Per-protein table; later cells use its 'seq_id', 'defense' and 'split'
# columns.
center_info_path = '../data/interim/filtered_model_seqs.pq'
center_protein_info = pd.read_parquet(center_info_path)

In [4]:
# Merge the per-batch representation CSVs into a single file.
#
# The merge is skipped when the source directory is gone (it is removed by a
# later cell). Without this guard, re-running the notebook would let the shell
# redirect truncate the already-merged CSV before `cat` fails on the missing
# inputs.
representations_dir = '../data/interim/representations'
merged_csv = '../data/interim/model_representations.csv'
concat_status = None
if os.path.isdir(representations_dir):
    concat_status = os.system(f'cat {representations_dir}/*.csv > {merged_csv}')
    if concat_status != 0:
        # A failed `cat` leaves a truncated/partial merged file behind; stop
        # here so nothing downstream reads it by mistake.
        raise RuntimeError(
            f'Concatenating {representations_dir}/*.csv failed '
            f'(exit status {concat_status}); {merged_csv} may be incomplete')
else:
    print(f'Skipping merge: {representations_dir} does not exist')
# Show the shell exit status (0 on success) as the cell output.
concat_status

0

In [6]:
# Delete the per-batch CSV directory, but only once the merged CSV is on disk
# and non-empty, so the raw inputs are never removed before their merged copy
# exists. Re-running after the directory is gone is a no-op.
representations_dir = '../data/interim/representations'
merged_csv = '../data/interim/model_representations.csv'
cleanup_status = None
if not os.path.isdir(representations_dir):
    print(f'Nothing to remove: {representations_dir} does not exist')
elif not (os.path.isfile(merged_csv) and os.path.getsize(merged_csv) > 0):
    raise RuntimeError(
        f'Refusing to delete {representations_dir}: '
        f'{merged_csv} is missing or empty')
else:
    cleanup_status = os.system('rm -r ../data/interim/representations/')
# Show the shell exit status (0 on success) as the cell output.
cleanup_status

0

In [5]:
# Number of feature columns; their labels run from 'ft1' to 'ft<n_feats>' and
# are used as the column names when the representations CSV is read.
n_feats = 640
feat_names = [f'ft{idx}' for idx in range(1, n_feats + 1)]

In [6]:
%%time
# Load the merged representations. Column labels are supplied through
# `names`, and the first column of the file becomes the index.
representations_csv = '../data/interim/model_representations.csv'
model_representations = pd.read_csv(
    representations_csv,
    names=feat_names,
    index_col=0,
)

CPU times: user 10min 22s, sys: 54.6 s, total: 11min 17s
Wall time: 11min 18s


In [10]:
def get_split_representations(center_protein_info, split,
                              model_protein_neighbors, feat_names,
                              representations=None):
    """Build the wide feature matrix and the label frame for one data split.

    Parameters
    ----------
    center_protein_info : pandas.DataFrame
        Must contain the columns 'split', 'seq_id' and 'defense'.
    split : str
        Value of the 'split' column selecting the rows to use.
    model_protein_neighbors : pandas.DataFrame
        Must contain the columns 'center_seq_id', 'seq_id' and
        'relative_position'.
    feat_names : list of str
        Feature columns of the representations frame to pivot.
    representations : pandas.DataFrame, optional
        Per-protein features indexed by seq_id. When omitted, the
        notebook-level ``model_representations`` frame is used, which keeps
        existing four-argument calls working.

    Returns
    -------
    (split_X, split_y)
        ``split_X`` has one row per center protein of the split and one
        column per (feature, relative position) pair, named
        '<feature>_<relative_position>'; combinations with no neighbor are
        filled with 0. ``split_y`` holds the 'defense' column indexed by
        seq_id. Both frames share the same row order.

    Raises
    ------
    AssertionError
        If the rows of ``split_X`` do not line up one-to-one with ``split_y``.
    """
    if representations is None:
        # Backward-compatible default: fall back to the notebook global.
        representations = model_representations

    # Labels of the requested split, indexed by the center protein's seq_id.
    split_y = (center_protein_info.loc[center_protein_info['split'] == split,
                                       ['seq_id', 'defense']]
               .set_index('seq_id'))
    # Neighbor rows whose center protein belongs to the split, indexed by the
    # neighbor's own seq_id so features can be looked up by that id.
    split_protein_neighbors = (model_protein_neighbors[model_protein_neighbors['center_seq_id']
                                                       .isin(split_y.index)]
                               .set_index('seq_id'))
    split_representations = representations.loc[split_protein_neighbors.index, :]
    # Both frames carry the same index, so this is a row-for-row column concat.
    cat_split_representations = pd.concat([split_protein_neighbors, split_representations],
                                          axis=1)
    print('pivoting features')
    # Long -> wide: one row per center protein, one column per
    # (feature, relative_position) pair.
    split_X = cat_split_representations.pivot(values=feat_names, index='center_seq_id',
                                              columns='relative_position')
    split_X = split_X.fillna(0)
    # Put the rows in the same order as the labels.
    split_X = split_X.loc[split_y.index, :]
    split_X.columns = [x[0] + '_' + str(x[1]) for x in split_X.columns]
    # Require that *every* row matches its label row (the previous check only
    # required the matching fraction to be non-zero).
    assert (len(split_X) == len(split_y)
            and (split_X.index == split_y.index).all()), \
        'split_X rows are not aligned with split_y'
    return split_X, split_y
    

In [11]:
%%time 
# Feature matrix and labels for the validation split.
val_X, val_y = get_split_representations(
    center_protein_info=center_protein_info,
    split='val',
    model_protein_neighbors=model_protein_neighbors,
    feat_names=feat_names,
)

pivoting features
CPU times: user 17 s, sys: 4.68 s, total: 21.7 s
Wall time: 24.7 s


In [12]:
%%time 
# Feature matrix and labels for the test split.
test_X, test_y = get_split_representations(
    center_protein_info=center_protein_info,
    split='test',
    model_protein_neighbors=model_protein_neighbors,
    feat_names=feat_names,
)

pivoting features
CPU times: user 25.6 s, sys: 15.3 s, total: 40.9 s
Wall time: 41.2 s


## Outputs: save the validation and test splits

In [13]:
# Validation split: features, then labels.
val_X.to_parquet(path='../data/interim/val_X_reps.pq')
val_y.to_parquet(path='../data/interim/val_y.pq')

# Test split: features, then labels.
test_X.to_parquet(path='../data/interim/test_X_reps.pq')
test_y.to_parquet(path='../data/interim/test_y_indexed.pq')

In [14]:
# Inspect the flattened '<feature>_<relative_position>' column labels of the
# validation matrix.
val_X.columns

Index(['ft1_-2', 'ft1_-1', 'ft1_0', 'ft1_1', 'ft1_2', 'ft2_-2', 'ft2_-1',
       'ft2_0', 'ft2_1', 'ft2_2',
       ...
       'ft639_-2', 'ft639_-1', 'ft639_0', 'ft639_1', 'ft639_2', 'ft640_-2',
       'ft640_-1', 'ft640_0', 'ft640_1', 'ft640_2'],
      dtype='object', length=3200)

In [15]:
# Inspect the flattened '<feature>_<relative_position>' column labels of the
# test matrix; they should match the validation matrix's labels shown above.
test_X.columns

Index(['ft1_-2', 'ft1_-1', 'ft1_0', 'ft1_1', 'ft1_2', 'ft2_-2', 'ft2_-1',
       'ft2_0', 'ft2_1', 'ft2_2',
       ...
       'ft639_-2', 'ft639_-1', 'ft639_0', 'ft639_1', 'ft639_2', 'ft640_-2',
       'ft640_-1', 'ft640_0', 'ft640_1', 'ft640_2'],
      dtype='object', length=3200)

## Build the training representations in chunks to use less memory

In [16]:
# The validation and test frames are already saved to disk; drop the
# references before the training split is processed.
del val_X, val_y, test_X, test_y

In [None]:
# Value of center_protein_info['split'] that selects the rows processed in the
# cells below.
split = 'train'

In [None]:
# Same preparation steps as the start of get_split_representations, run at
# cell level so the pivot can later be done chunk by chunk.

# Labels of the chosen split, indexed by the center protein's seq_id.
is_split_row = center_protein_info['split'] == split
split_y = center_protein_info.loc[is_split_row, ['seq_id', 'defense']].set_index('seq_id')

# Neighbor rows whose center protein belongs to the split, indexed by the
# neighbor's own seq_id.
has_split_center = model_protein_neighbors['center_seq_id'].isin(split_y.index)
split_protein_neighbors = model_protein_neighbors[has_split_center].set_index('seq_id')

# Look up each neighbor's features and attach them column-wise; both frames
# carry the same index.
split_representations = model_representations.loc[split_protein_neighbors.index, :]
cat_split_representations = pd.concat(
    [split_protein_neighbors, split_representations],
    axis=1,
)

# The boolean masks were only needed for the selections above.
del is_split_row, has_split_center
print('pivoting features')


pivoting features


In [None]:
# Drop the references to the full input tables; the remaining cells only use
# cat_split_representations, split_y and feat_names.
del model_protein_neighbors, center_protein_info, model_representations

In [None]:
# Partition the unique center proteins into batches of at most `chunk_size`
# ids so the pivot can be run one batch at a time.
unique_center_ids = cat_split_representations['center_seq_id'].unique()
chunk_size = 100_000
# Materialise the batches as a list. A bare generator would be exhausted after
# one pass, so re-running the loop cell would silently produce no chunks.
center_chunks = list(chunks(unique_center_ids, chunk_size))
# Integer batch count (previously a float from np.ceil, which made tqdm render
# the total as '18.0').
n_chunks = len(center_chunks)
print(n_chunks)

18.0


In [None]:
# Pivot one batch of center proteins at a time and collect the wide frames.
split_X_list = []
for batch_ids in tqdm(center_chunks, total=n_chunks):
    # Rows whose center protein belongs to the current batch.
    in_batch = cat_split_representations['center_seq_id'].isin(batch_ids)
    # Long -> wide: one row per center protein, one column per
    # (feature, relative_position) pair.
    batch_wide = cat_split_representations[in_batch].pivot(
        index='center_seq_id',
        columns='relative_position',
        values=feat_names,
    )
    split_X_list.append(batch_wide)

100%|██████████| 18/18.0 [02:42<00:00,  9.01s/it]


In [None]:
# The long-format frames are no longer needed once every chunk has been
# pivoted; drop the references before concatenating.
del cat_split_representations, split_representations

In [None]:
# Stack the per-chunk pivoted frames row-wise into a single training matrix.
split_X = pd.concat(split_X_list, axis=0)

In [None]:
# The per-chunk frames are now combined in split_X; drop the list reference.
del split_X_list

In [None]:
# Replace missing values with 0, mirroring the fillna(0) step in
# get_split_representations.
# NOTE(review): presumably done in place to avoid holding a second copy of the
# matrix; confirm before changing.
split_X.fillna(0, inplace=True)

In [None]:
# Reorder the rows to follow split_y's index so that features and labels line
# up row for row (checked by the assert in the next cell).
split_X = split_X.loc[split_y.index,:]

In [None]:
# Flatten the (feature, relative_position) column MultiIndex produced by the
# pivot into '<feature>_<relative_position>' strings. The guard keeps the cell
# idempotent: on a re-run the labels are already plain strings, and indexing
# them per character would turn e.g. 'ft1_-2' into 'f_t'.
if isinstance(split_X.columns, pd.MultiIndex):
    split_X.columns = [x[0] + '_' + str(x[1]) for x in split_X.columns]
# Every feature row must line up, in order, with its label row.
assert (len(split_X) == len(split_y)
        and (split_X.index == split_y.index).all()), \
    'split_X rows are not aligned with split_y'

In [None]:
# Preview the first rows of the training feature matrix.
split_X.head()

Unnamed: 0_level_0,ft1_-2,ft1_-1,ft1_0,ft1_1,ft1_2,ft2_-2,ft2_-1,ft2_0,ft2_1,ft2_2,...,ft639_-2,ft639_-1,ft639_0,ft639_1,ft639_2,ft640_-2,ft640_-1,ft640_0,ft640_1,ft640_2
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000031de6e3b5adb7f0ecb501bce05d5aebd9f5c2d64450793600a,-0.083329,-0.042273,-0.004617,-0.075961,-0.11923,-0.08548,-0.065199,-0.062837,-0.064302,-0.165219,...,0.111215,0.065203,0.05789,0.024322,-0.001099,-0.052025,0.023455,-0.023632,-0.009242,-0.06477
630eedef674c705d4c3f1a80262a4078b919053d96207505e271fe09,-0.158411,-0.10575,-0.020799,-0.162112,-0.161958,-0.125808,-0.031724,-0.068024,-0.111398,-0.10038,...,-0.01197,-0.045389,0.075452,0.049475,-0.017922,-0.020179,-0.048058,-0.005416,0.067265,0.100298
84db8c4f92fb99dbe4c68a77965aebf3d0a6415fb45870a695518bb7,-0.098758,-0.145095,0.025922,-0.052099,-0.105004,-0.02645,-0.06575,-0.043146,-0.087908,-0.137171,...,0.163176,-0.094553,0.034607,-0.064664,0.048833,0.076394,-0.052672,-0.033729,0.022013,-0.015605
96e1190b92e1bc534871c42f763c30150b07842e443fa4febc7a61d7,-0.068715,-0.145819,0.01075,-0.034874,-0.114603,-0.103854,-0.098787,-0.050438,0.030913,0.01622,...,-0.080807,-0.077194,0.076167,-0.056793,0.053638,0.081381,0.05012,-0.008312,-0.049163,-0.028677
bd23691f0bc6ce6b93b6f6468980236a3a232a681d29a9b0b17537c6,-0.157066,-0.101072,0.006881,-0.036522,-0.032527,-0.093689,-0.119771,-0.059533,0.015149,-0.032177,...,0.121944,0.074653,0.054354,0.07237,-0.053129,-0.019027,0.065122,-0.028658,0.022518,-0.012641


In [None]:
# Training split: features, then labels.
split_X.to_parquet(path='../data/interim/train_X_reps.pq')
split_y.to_parquet(path='../data/interim/train_y.pq')

In [None]:
# Inspect the flattened '<feature>_<relative_position>' column labels of the
# training matrix.
split_X.columns

Index(['ft1_-2', 'ft1_-1', 'ft1_0', 'ft1_1', 'ft1_2', 'ft2_-2', 'ft2_-1',
       'ft2_0', 'ft2_1', 'ft2_2',
       ...
       'ft639_-2', 'ft639_-1', 'ft639_0', 'ft639_1', 'ft639_2', 'ft640_-2',
       'ft640_-1', 'ft640_0', 'ft640_1', 'ft640_2'],
      dtype='object', length=3200)