In [1]:
import glob
import gzip
import os
import re
import typing as t
from itertools import chain
from concurrent.futures import ProcessPoolExecutor

import tqdm
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature
from Bio.Seq import Seq

In [2]:
# Project working directory: the relative paths used in this notebook
# ('samples.tsv', 'assemblies/', 'proteins/') are resolved against it.
# The default is the original author's location; set the CAS_THESIS_WORKDIR
# environment variable to run the notebook from another checkout or machine.
WORK_DIR = os.environ.get("CAS_THESIS_WORKDIR", "/home/is6/work/cas_thesis/new_domains")
os.chdir(WORK_DIR)

In [4]:
def feat_to_seqrec(feature: SeqFeature) -> t.Optional[SeqRecord]:
    """Build a protein record from a feature's qualifiers.

    The record id is the first ``locus_tag`` value and its sequence is the
    first ``translation`` value; ``name`` and ``description`` are left empty.

    Returns:
        The new ``SeqRecord``, or ``None`` when a ``KeyError`` is raised while
        building it (i.e. the feature has no ``locus_tag`` or no
        ``translation`` qualifier).
    """
    annotations = feature.qualifiers
    try:
        record = SeqRecord(
            id=annotations['locus_tag'][0],
            name='',
            description='',
            seq=Seq(annotations['translation'][0]),
        )
    except KeyError:
        # A required qualifier is absent: signal "no protein" to the caller.
        return None
    return record


def extract_proteins(contig: SeqRecord) -> t.List[SeqRecord]:
    """Collect protein records for the CDS features of one contig.

    Every feature whose ``type`` is ``'CDS'`` is passed to ``feat_to_seqrec``;
    falsy results (``None`` for features that could not be converted) are
    dropped. Records are returned in feature order.
    """
    proteins = []
    for feature in contig.features:
        if feature.type != 'CDS':
            continue
        record = feat_to_seqrec(feature)
        # Truthiness check drops the None returned for unconvertible features.
        if record:
            proteins.append(record)
    return proteins


In [5]:
# Output directory for the per-sample FASTA files; exist_ok makes re-running this cell harmless.
os.makedirs(name='proteins', exist_ok=True)

In [6]:
# Sample sheet: one row per sample; the 'id' column names both the input
# annotation file (as a filename prefix) and the output FASTA file.
samples = pd.read_csv('samples.tsv', sep='\t')
ids = samples['id']

annotation_root = 'assemblies'

# Resolve one compressed GenBank annotation per sample id.
# glob returns matches in arbitrary order, so sort them to make the choice
# reproducible when several files share the same id prefix. Missing
# annotations are collected and reported together instead of failing on the
# first one with an uninformative IndexError.
annotation_paths = []
missing_ids = []
for id_ in ids:
    matches = sorted(glob.glob(f'{annotation_root}/{id_}*.gbff.gz'))
    if matches:
        annotation_paths.append(matches[0])
    else:
        missing_ids.append(id_)

if missing_ids:
    raise FileNotFoundError(
        f'No *.gbff.gz annotation found under {annotation_root!r} for '
        f'{len(missing_ids)} sample id(s): {missing_ids[:10]}'
        + (' ...' if len(missing_ids) > 10 else '')
    )

# Output FASTA path for each sample, aligned index-by-index with annotation_paths.
protein_root = 'proteins'
protein_paths = [f'{protein_root}/{id_}.faa' for id_ in ids]

In [7]:
def genbank_to_proteins(input_: str, output: str) -> int:
    """Extract the proteins of a GenBank file and write them as FASTA.

    Args:
        input_: Path to a GenBank file; a path ending in ``.gz`` is read
            through ``gzip`` in text mode, anything else is opened as plain text.
        output: Path of the FASTA file to write (overwritten if it exists).

    Returns:
        The number of protein records written, as reported by ``SeqIO.write``.
        Callers that only need the side effect can ignore it.
    """
    # Pick the opener from the actual '.gz' suffix rather than any name that
    # merely ends in the letters 'gz'.
    opener = gzip.open if input_.endswith('.gz') else open
    with opener(input_, 'rt') as buffer:
        contigs = SeqIO.parse(buffer, 'gb')
        # Both the parser and the chain are lazy, so records are streamed to
        # the output file; the write must therefore happen while the input
        # handle is still open.
        proteins = chain.from_iterable(map(extract_proteins, contigs))
        return SeqIO.write(proteins, output, 'fasta')


# Convert every annotation to a protein FASTA in parallel (20 worker processes).
with ProcessPoolExecutor(max_workers=20) as workers:
    conversions = workers.map(genbank_to_proteins, annotation_paths, protein_paths)
    progress = tqdm.tqdm(conversions, total=len(samples))
    # map() yields results lazily; draining the iterator advances the progress
    # bar and re-raises any exception raised in a worker.
    list(progress)
    

100%|██████████| 18207/18207 [34:10<00:00,  8.88it/s]  
