In [1]:
import pandas as pd
from tqdm import tqdm
import os
from joblib import Parallel, delayed
import numpy as np

In [2]:
model_seq_df = pd.read_parquet('../data/interim/model_seqs.pq')

In [4]:
model_seq_df.to_csv('../data/processed/model_seqs.csv', index=False)

In [4]:
len(model_seq_df)

1997942

In [5]:
model_seq_df[['split', 'defense']].value_counts()

split  defense
train  False      1644397
test   False       166243
train  True         99642
val    False        73098
test   True         10128
val    True          4434
Name: count, dtype: int64

In [6]:
n_neighbors = 2

In [7]:
ft_dir = '../data/genome_downloads/ft/'
ft_files = [x for x in os.listdir(ft_dir) if '.txt' in x]

In [8]:
n_assemblies = model_seq_df['assembly'].nunique()

In [9]:
neighbor_out = '../data/interim/temp_protein_neighbors/'

In [10]:
if 'temp_protein_neighbors' in os.listdir('../data/interim'):
    os.system('rm -r ' + neighbor_out)
os.mkdir(neighbor_out)

In [11]:
def get_assembly_neighbors(assembly, assembly_df, ft_files, ft_dir, neighbor_out):
    assembly_ft_file = next(x for x in ft_files if assembly in x)
    assembly_ft = pd.read_table(ft_dir + assembly_ft_file)
    assembly_ft['attributes'] = assembly_ft['attributes'].astype(str)
    filtered_ft = assembly_ft[~assembly_ft['attributes'].str.contains('pseudo', na=False) & 
                              (assembly_ft['# feature'] == 'CDS')].reset_index(drop=True)
    filtered_ft['protein_context_id'] = (filtered_ft['product_accession'] + '|' +
                                         filtered_ft['genomic_accession'] + '|' +
                                         filtered_ft['start'].astype(str) + '|' +
                                         filtered_ft['strand'])
    assembly_neighbor_list = []
    for _, row in assembly_df.iterrows():
        protein_context_id = row['protein_context_id']
        seq_id = row['seq_id']
        center_row = (filtered_ft[filtered_ft['protein_context_id'] == protein_context_id]
                      .squeeze())
        center_index = center_row.name
        center_location = center_row['genomic_accession']
        center_strand = center_row['strand']
        protein_neighbor_df = (filtered_ft.iloc[max(center_index - n_neighbors, 0):(center_index+n_neighbors+1), :])
        protein_neigbhor_df = protein_neighbor_df[protein_neighbor_df['genomic_accession'] == center_location]
        protein_neighbor_out = (protein_neigbhor_df[['product_accession', 'protein_context_id']].reset_index()
                                .rename(columns={'index': 'relative_position'}))
        protein_neighbor_out['relative_position'] = protein_neighbor_out['relative_position'] - center_index
        if center_strand == '-':
            protein_neighbor_out['relative_position'] = -protein_neighbor_out['relative_position']
        protein_neighbor_out['center_seq_id'] = seq_id
        assembly_neighbor_list.append(protein_neighbor_out)
    assembly_neighbor_df = (pd.concat(assembly_neighbor_list)
                            .reset_index(drop=True))
    assembly_neighbor_df.to_csv(neighbor_out + assembly  + '.csv', index=False, header=False)

In [12]:
_ = Parallel(n_jobs=48)(delayed(get_assembly_neighbors)
                    (assembly, assembly_df, ft_files, ft_dir, neighbor_out)
                    for assembly, assembly_df in tqdm(model_seq_df.groupby('assembly'), 
                                                       total=n_assemblies, leave=True, 
                                                       position=0))

100%|██████████| 17405/17405 [02:25<00:00, 119.81it/s]


In [13]:
model_protein_neighbor_file = '../data/interim/model_protein_neighbors.csv'

In [14]:
os.system('cat ' + neighbor_out + '*.csv > ' + model_protein_neighbor_file)

0

In [15]:
os.system('rm -r ' + neighbor_out)

0

In [16]:
model_protein_neighbor_df = pd.read_csv(model_protein_neighbor_file, 
                                        names=['relative_position', 'product_accession', 'protein_context_id', 'center_seq_id'])

In [17]:
model_protein_neighbor_df['relative_position'].value_counts()

relative_position
 0    1997942
 1    1949892
-1    1948300
 2    1926775
-2    1926174
Name: count, dtype: int64

In [18]:
model_protein_neighbor_ids = model_protein_neighbor_df['product_accession'].drop_duplicates()

In [19]:
len(model_protein_neighbor_ids)

8444708

In [20]:
%%time
all_protein_ids = pd.read_parquet('../data/interim/seq_id_accessions.pq')

CPU times: user 35.4 s, sys: 11.4 s, total: 46.8 s
Wall time: 51 s


In [21]:
neighbor_protein_id_df = all_protein_ids[all_protein_ids['protein_accession'].isin(model_protein_neighbor_ids)]

In [22]:
neighbor_seq_ids = set(neighbor_protein_id_df['seq_id'].drop_duplicates())

In [23]:
chunk_size = 10_000

In [24]:
model_seq_neighbor_out = '../data/interim/model_seq_neighbors/'

In [27]:
if 'model_seq_neighbors' in os.listdir('../data/interim'):
    os.system('rm -r ' + model_seq_neighbor_out)
os.mkdir(model_seq_neighbor_out)

In [30]:
n_chunks = np.ceil(len(neighbor_protein_id_df)/chunk_size)

In [33]:
n_super_chunks = 4

In [34]:
super_chunk_size = np.ceil(n_chunks/n_super_chunks)
super_chunk_size

212.0

In [35]:
for i in range(n_super_chunks):
    os.mkdir(model_seq_neighbor_out + str(i))

In [37]:
i = 0
chunk = 0
super_chunk = 0
for line in tqdm(open('../data/interim/unique_seqs.faa', 'r')):
    if '>' == line[0]: 
        seq_id = line[1:-1]
    else:
        seq = line[:-1]
        if seq_id in neighbor_seq_ids:
            if i % chunk_size == 0:
                super_chunk = str(int(np.floor(chunk/super_chunk_size)))
                if i != 0:
                    f.close()
                f = open(model_seq_neighbor_out + super_chunk + '/neighbors_' + str(chunk) + '.faa', 'w')
                chunk += 1
            if 'J' in seq:
                seq = seq.replace('J', 'L')
            print('>'+seq_id, file=f)
            print(seq, file=f)
            i += 1
f.close()

133062578it [01:44, 1279267.68it/s]


In [38]:
neighbor_protein_id_df = neighbor_protein_id_df.set_index('protein_accession')

In [39]:
model_protein_neighbor_df = model_protein_neighbor_df.set_index('product_accession')

In [40]:
merged_neigbhor_seq_ids = (neighbor_protein_id_df.merge(model_protein_neighbor_df, how='inner', 
                                                        left_index=True, right_index=True))

In [41]:
merged_neigbhor_seq_ids.head()

Unnamed: 0_level_0,seq_id,relative_position,protein_context_id,center_seq_id
protein_accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NP_052604.1,589f03f456b3b2319598ba16c39b02ea0e93b4938b9323...,2,NP_052604.1|NC_002127.1|413|+,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...
NP_052604.1,589f03f456b3b2319598ba16c39b02ea0e93b4938b9323...,1,NP_052604.1|NC_002127.1|413|+,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...
NP_052605.1,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...,1,NP_052605.1|NC_002127.1|971|-,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...
NP_052605.1,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...,0,NP_052605.1|NC_002127.1|971|-,45c9ffceb4c19a3772cd39c075a6ac7754bb253e171b0b...
NP_052606.1,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...,0,NP_052606.1|NC_002127.1|1348|-,02ae268da7033aa3ca6b088be59f545bf1fb3f78fcba30...


In [42]:
merged_neigbhor_seq_ids['relative_position'].value_counts()

relative_position
 0    1997942
 1    1949892
-1    1948300
 2    1926775
-2    1926174
Name: count, dtype: int64

In [43]:
merged_neigbhor_seq_ids.to_parquet('../data/interim/model_protein_neighbors_seq_ids.pq')

In [44]:
model_seq_df.to_parquet('../data/interim/filtered_model_seqs.pq', index=False)

### Optional -- remove representations folder if building new representations

In [5]:
if 'representations' in os.listdir('../data/interim'):
    os.system('rm -r ../data/interim/representations/')
os.mkdir('../data/interim/representations/')