In [1]:
import pandas as pd
import os
import re
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
def get_go_processes(in_file):
    gff_df = pd.read_table('../data/genome_downloads/gff/' + in_file,
                           names=['seqid', 'source', 'feature', 'start', 'end',
                                  'score', 'strand', 'frame', 'attributes'],
                           comment='#')
    filtered_gff_df = gff_df[gff_df['attributes'].str.contains('go_process', na=False) &
                             ~gff_df['attributes'].str.contains('pseudo', na=False)]
    assembly = in_file.split('.')[0]
    with open('../data/interim/go_process_temp/' + assembly + '.csv', 'w') as f:
        for attr in filtered_gff_df['attributes']:
            prot_id = re.search('protein_id=([^;]+)', attr).group(1)
            go_processes = re.search('go_process=([^;]+)', attr).group(1)
            for process in go_processes.split(','):
                process = process.split('|')[0]
                print(prot_id + ',' + process, file=f)

In [3]:
os.mkdir('../data/interim/go_process_temp/')
gff_files = [x for x in os.listdir('../data/genome_downloads/gff') if '.gff' in x]
print('Extracting go processes')
_ = Parallel(n_jobs=48)(delayed(get_go_processes)(file) for file in tqdm(gff_files))
os.system('cat ../data/interim/go_process_temp/*.csv > ../data/interim/go_processs_temp.csv')
os.system('rm -r ../data/interim/go_process_temp/')

Extracting go processes


100%|██████████| 17453/17453 [00:56<00:00, 307.03it/s]


0

In [4]:
print('Getting unique go processes')
unique_go_processes = (pd.read_csv('../data/interim/go_processs_temp.csv',
                                   names=['product_accession', 'go_process'])
                       .drop_duplicates()
                       .set_index('product_accession'))

Getting unique go processes


In [5]:
os.system('rm ../data/interim/go_processs_temp.csv')

0

In [6]:
print('Merging seq ids with go processes')
unique_seq_id_accessions = (pd.read_parquet('../data/interim/seq_id_accessions.pq')
                            .set_index('protein_accession'))
unique_seq_ids_go_process = (unique_go_processes.merge(unique_seq_id_accessions, how='inner',
                                                           left_index=True, right_index=True)
                             [['seq_id', 'go_process']]
                             .drop_duplicates(ignore_index=True)) # so new indeces are labeled 0, 1, ...

Merging seq ids with go processes


In [7]:
print('Proteins with a go process', unique_seq_ids_go_process['seq_id'].nunique())

Proteins with a go process 14240378


In [8]:
print('Top go processes')
unique_seq_ids_go_process['go_process'].value_counts().head(10)

Top go processes


translation                                     748384
transmembrane transport                         503298
regulation of DNA-templated transcription       382174
regulation of transcription%2C DNA-templated    374496
proteolysis                                     341988
DNA repair                                      326009
phosphorelay signal transduction system         288515
carbohydrate metabolic process                  276985
DNA recombination                               268820
peptidoglycan biosynthetic process              234657
Name: go_process, dtype: int64

In [9]:
print('Number of go processes', unique_seq_ids_go_process['go_process'].nunique())

Number of go processes 1156


In [10]:
unique_seq_ids_go_process.to_parquet('../data/interim/seq_id_go_processes.pq', index=False)
