In [1]:
import pandas as pd
from joblib import Parallel, delayed
import os
from tqdm import tqdm

In [2]:
def get_assembly_protein_names(ft_file):
    """Write the unique (product_accession, name) pairs of one feature table to CSV.

    Reads ``../data/genome_downloads/ft/<ft_file>`` as a tab-separated table,
    keeps the rows in which both ``product_accession`` and ``name`` are present,
    de-duplicates the two-column result and writes it, without header or index,
    to ``../data/interim/assembly_proteins_temp/<stem>.csv`` where ``<stem>`` is
    ``ft_file`` minus its final extension.

    Parameters
    ----------
    ft_file : str
        File name (not a full path) of a feature table inside the ft directory.

    Returns
    -------
    None
        The result is only written to disk; the output directory must already exist.
    """
    assembly_ft = pd.read_table('../data/genome_downloads/ft/' + ft_file)
    has_accession_and_name = (assembly_ft['product_accession'].notna() &
                              assembly_ft['name'].notna())
    filtered_ft = assembly_ft.loc[has_accession_and_name,
                                  ['product_accession', 'name']].drop_duplicates()
    # Strip only the final extension. Cutting at the first '.' would also discard
    # everything after a dot inside the stem (e.g. a version suffix), so two inputs
    # that differ only there would silently overwrite each other's output file.
    assembly = os.path.splitext(ft_file)[0]
    out_file = '../data/interim/assembly_proteins_temp/' + assembly + '.csv'
    filtered_ft.to_csv(out_file, index=False, header=False)

In [3]:
# Scratch directory that receives one CSV per feature table.
temp_dir = '../data/interim/assembly_proteins_temp/'
# exist_ok lets this cell be re-run after an interrupted attempt left the
# directory behind; leftover CSVs are purged so they cannot leak into the
# concatenated file below.
os.makedirs(temp_dir, exist_ok=True)
for stale_csv in os.listdir(temp_dir):
    if stale_csv.endswith('.csv'):
        os.remove(os.path.join(temp_dir, stale_csv))

ft_files = [x for x in os.listdir('../data/genome_downloads/ft') if '.txt' in x]
print('Filtering ft files')
_ = Parallel(n_jobs=32)(delayed(get_assembly_protein_names)(ft_file) for ft_file in tqdm(ft_files))

# Concatenate the per-assembly CSVs into one file. The exit status is checked so
# that a failed `cat` stops the cell *before* the scratch directory is deleted
# and before a partial combined file is consumed downstream.
cat_status = os.system('cat ../data/interim/assembly_proteins_temp/*.csv > ../data/interim/all_assembly_proteins_temp.csv')
if cat_status != 0:
    raise RuntimeError('Concatenating assembly protein CSVs failed with status ' + str(cat_status))
rm_status = os.system('rm -r ../data/interim/assembly_proteins_temp/')
if rm_status != 0:
    raise RuntimeError('Removing temporary assembly protein directory failed with status ' + str(rm_status))

Filtering ft files


100%|██████████| 17454/17454 [01:05<00:00, 266.50it/s]


0

In [None]:
print('Getting unique protein names')
all_proteins_csv = '../data/interim/all_assembly_proteins_temp.csv'
# Column names are supplied explicitly, so the first row of the file is read as data.
# Chaining load -> de-duplicate -> index avoids keeping the full, duplicated table
# bound to a name (and therefore alive) after this cell finishes.
unique_protein_names = (pd.read_csv(all_proteins_csv,
                                    names=['product_accession', 'name'])
                        .drop_duplicates()
                        .set_index('product_accession'))
# The combined CSV is only an intermediate. os.remove raises if the deletion
# fails, whereas shelling out to `rm` through os.system would only return a
# status code that nothing inspects.
os.remove(all_proteins_csv)

In [5]:
print('Merging with sequence IDs')
# Index the accession table by protein accession so it can be aligned
# index-to-index with unique_protein_names.
unique_seq_id_accessions = (pd.read_parquet('../data/interim/seq_id_accessions.pq')
                            .set_index('protein_accession'))
seq_id_names = (
    pd.merge(unique_protein_names, unique_seq_id_accessions,
             how='inner', left_index=True, right_index=True)
    .loc[:, ['name', 'seq_id']]
    .drop_duplicates(ignore_index=True)
    # some sequences are given multiple names; keep only the first row per seq_id
    .groupby('seq_id')
    .head(1)
)
seq_id_names.to_parquet('../data/interim/seq_id_names.pq', index=False)

Merging with sequence IDs
