In [2]:
import pandas as pd
from tqdm import tqdm
import os
from joblib import Parallel, delayed
import numpy as np

In [3]:
# Load the per-protein table that drives this notebook; later cells read its
# 'assembly_stub', 'protein_context_id' and 'seq_id' columns.
# NOTE(review): this input (and the outputs written below) live under ../data3,
# while the feature tables and sequence tables are read from ../data -- confirm
# the split is intended.
model_seq_df = pd.read_parquet('../data3/interim/model_seq_info.pq')

In [4]:
# Number of rows loaded.
len(model_seq_df)

200458

In [5]:
# How many CDS rows to take on each side of a center protein when collecting
# its neighbors in get_assembly_neighbors.
n_neighbors = 2

In [6]:
# Directory holding the feature-table files, plus a lookup from the text before
# the first '.' in each file name to the full file name. Only names containing
# '.txt' are indexed.
ft_dir = '../data/genome_downloads/ft/'
ft_files = dict(
    (file_name.split('.')[0], file_name)
    for file_name in os.listdir(ft_dir)
    if '.txt' in file_name
)

In [7]:
def get_assembly_neighbors(assembly, assembly_df, ft_files=ft_files, ft_dir=ft_dir,
                           n_neighbors=n_neighbors):
    """Collect the CDS rows flanking each listed protein within one assembly.

    The assembly's feature table is read from ``ft_dir``, reduced to rows whose
    ``# feature`` is 'CDS' and whose ``attributes`` text does not contain
    'pseudo', and re-indexed 0..n-1. Every row of ``assembly_df`` is then located
    in that table through its ``protein_context_id``
    (``product_accession|genomic_accession|start|strand``) and the rows within
    ``n_neighbors`` positions on either side are returned, restricted to rows
    that share the center row's ``genomic_accession``.

    The defaults for ``ft_files``, ``ft_dir`` and ``n_neighbors`` are captured
    from the module-level variables when this definition is executed; re-run
    the definition (or pass the argument) after changing them.

    Parameters
    ----------
    assembly : hashable
        Key into ``ft_files`` selecting the feature-table file.
    assembly_df : pandas.DataFrame
        Proteins to look up; needs ``protein_context_id`` and ``seq_id`` columns.
    ft_files : mapping
        Assembly key -> feature-table file name.
    ft_dir : str
        Prefix prepended to the file name (string concatenation, so it must
        end with a path separator).
    n_neighbors : int
        Number of CDS rows to take on each side of the center row.

    Returns
    -------
    pandas.DataFrame
        One row per (center, neighbor) pair with columns ``relative_position``,
        ``product_accession``, ``protein_context_id``, ``start``, ``end``,
        ``strand`` and ``center_seq_id``. ``relative_position`` is the row
        offset from the center (0 for the center itself), negated when the
        center's strand is '-'. Empty (columns only) when ``assembly_df`` has
        no rows.

    Raises
    ------
    KeyError
        If ``assembly`` is not present in ``ft_files``.
    ValueError
        If a ``protein_context_id`` from ``assembly_df`` is missing from, or
        occurs more than once in, the filtered feature table.
    """
    neighbor_columns = ['product_accession', 'protein_context_id', 'start', 'end', 'strand']

    assembly_ft_file = ft_files[assembly]
    assembly_ft = pd.read_table(ft_dir + assembly_ft_file)
    # Cast to str first so missing attributes become 'nan' and never match 'pseudo'.
    is_pseudo = assembly_ft['attributes'].astype(str).str.contains('pseudo', na=False)
    is_cds = assembly_ft['# feature'] == 'CDS'
    # reset_index makes the index label of each row equal to its position,
    # which the positional slicing below relies on.
    filtered_ft = assembly_ft[~is_pseudo & is_cds].reset_index(drop=True)
    filtered_ft['protein_context_id'] = (filtered_ft['product_accession'] + '|' +
                                         filtered_ft['genomic_accession'] + '|' +
                                         filtered_ft['start'].astype(str) + '|' +
                                         filtered_ft['strand'])

    # Build the id -> row-position lookup once instead of scanning the whole
    # table for every protein in assembly_df.
    context_ids = filtered_ft['protein_context_id'].dropna()
    ambiguous_ids = set(context_ids[context_ids.duplicated(keep=False)])
    position_by_context_id = dict(zip(context_ids, context_ids.index))

    assembly_neighbor_list = []
    for protein_context_id, seq_id in zip(assembly_df['protein_context_id'],
                                          assembly_df['seq_id']):
        if protein_context_id in ambiguous_ids:
            raise ValueError(
                f'protein_context_id {protein_context_id!r} occurs more than once '
                f'in the feature table for assembly {assembly!r}')
        center_index = position_by_context_id.get(protein_context_id)
        if center_index is None:
            raise ValueError(
                f'protein_context_id {protein_context_id!r} not found among the '
                f'non-pseudo CDS rows of assembly {assembly!r}')

        center_location = filtered_ft.at[center_index, 'genomic_accession']
        center_strand = filtered_ft.at[center_index, 'strand']

        # Clamp the lower bound at 0; iloc already tolerates an upper bound past the end.
        window = filtered_ft.iloc[max(center_index - n_neighbors, 0):
                                  center_index + n_neighbors + 1]
        # Drop rows in the window that belong to a different genomic_accession.
        same_accession = window[window['genomic_accession'] == center_location]
        # reset_index exposes the row positions as a column, which becomes the offset.
        protein_neighbor_out = (same_accession[neighbor_columns]
                                .reset_index()
                                .rename(columns={'index': 'relative_position'}))
        protein_neighbor_out['relative_position'] = (
            protein_neighbor_out['relative_position'] - center_index)
        if center_strand == '-':
            # Flip the sign so offsets follow the center's own orientation.
            protein_neighbor_out['relative_position'] = -protein_neighbor_out['relative_position']
        protein_neighbor_out['center_seq_id'] = seq_id
        assembly_neighbor_list.append(protein_neighbor_out)

    if not assembly_neighbor_list:
        # pd.concat raises on an empty list; return an empty frame with the same columns.
        return pd.DataFrame(columns=['relative_position'] + neighbor_columns + ['center_seq_id'])
    assembly_neighbor_df = (pd.concat(assembly_neighbor_list)
                            .reset_index(drop=True))
    return assembly_neighbor_df

In [8]:
# Count of distinct 'assembly_stub' values; used as the tqdm progress-bar total
# for the per-assembly groupby.
n_assemblies = model_seq_df['assembly_stub'].nunique()

In [9]:
# Call get_assembly_neighbors once per 'assembly_stub' group through joblib
# (n_jobs=48). tqdm wraps the groupby iterator that feeds the tasks.
assembly_neighbor_list = Parallel(n_jobs=48)(
    delayed(get_assembly_neighbors)(assembly, assembly_df, ft_files)
    for assembly, assembly_df in tqdm(
        model_seq_df.groupby('assembly_stub'),
        total=n_assemblies,
        position=0,
    )
)

100%|██████████| 17109/17109 [02:14<00:00, 127.09it/s]


In [10]:
# Stack the per-assembly results into one table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating the index labels of
# every per-assembly frame.
model_protein_neighbor_df = pd.concat(assembly_neighbor_list, ignore_index=True)

In [11]:
# Tally rows by offset from the center protein (0 is the center row itself).
model_protein_neighbor_df['relative_position'].value_counts()

relative_position
 0    200458
-1    185840
 1    185007
-2    182120
 2    181392
Name: count, dtype: int64

In [12]:
# Distinct 'product_accession' values across all center and neighbor rows.
model_protein_neighbor_ids = model_protein_neighbor_df['product_accession'].drop_duplicates()

In [13]:
# Number of distinct accessions.
len(model_protein_neighbor_ids)

897133

In [14]:
%%time
# Accession table; later cells use its 'protein_accession' and 'seq_id' columns.
all_protein_ids = pd.read_parquet('../data/interim/seq_id_accessions.pq')

CPU times: user 36.6 s, sys: 11.5 s, total: 48.1 s
Wall time: 50.1 s


In [15]:
# Distinct seq_id values for every accession that appears among the neighbors.
neighbor_seq_ids = (
    all_protein_ids
    .loc[all_protein_ids['protein_accession'].isin(model_protein_neighbor_ids), 'seq_id']
    .drop_duplicates()
)

In [16]:
%%time
# Sequence table; later cells use its 'seq_id' and 'seq' columns.
all_protein_seqs = pd.read_parquet('../data/interim/refseq_seq_ids.pq')

CPU times: user 1min 20s, sys: 1min 5s, total: 2min 25s
Wall time: 2min 45s


In [17]:
# Keep only the sequence records whose seq_id is in neighbor_seq_ids.
neighbor_seqs = all_protein_seqs.loc[all_protein_seqs['seq_id'].isin(neighbor_seq_ids)]

In [18]:
# Write the neighbor sequences as FASTA: a '>' + seq_id header line followed by
# the sequence on a single line.
#
# Residue codes rewritten before export: 'X' is deleted, 'U' -> 'C', 'B' -> 'N',
# 'J' -> 'L'. One translate() pass gives the same text as applying the four
# replacements in turn, because none of the replacement letters is itself one
# of the rewritten codes.
# NOTE(review): deleting 'X' shortens the sequence -- confirm nothing downstream
# relies on residue positions matching the source records.
residue_table = str.maketrans({'X': None, 'U': 'C', 'B': 'N', 'J': 'L'})
with open('../data3/interim/model_neighbor_seqs.faa', 'w') as f:
    # Iterate the two needed columns directly rather than building a Series
    # per row with iterrows().
    for seq_id, seq in tqdm(zip(neighbor_seqs['seq_id'], neighbor_seqs['seq']),
                            total=len(neighbor_seqs), position=0):
        f.write('>' + seq_id + '\n')
        f.write(seq.translate(residue_table) + '\n')

100%|██████████| 897131/897131 [00:47<00:00, 18968.34it/s]


In [19]:
# Attach a seq_id to every neighbor row through its accession, then attach the
# matching sequence record. Both joins are inner joins, so neighbor rows with no
# match in either table are dropped.
merged_model_protein_neighbors = (
    model_protein_neighbor_df
    .merge(
        all_protein_ids.rename(columns={'protein_accession': 'product_accession'}),
        on='product_accession',
        how='inner',
    )
    .merge(all_protein_seqs, on='seq_id', how='inner')
)

In [20]:
# Persist the merged neighbor table; index=False leaves the DataFrame index out
# of the file.
merged_model_protein_neighbors.to_parquet('../data3/interim/model_neighbors_seq_ids.pq', index=False)