In [1]:
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, SeqFeature
import pandas as pd
import sys
import os
import gzip
import typing as t
import numpy as np


In [None]:
# necessary functions
# explode a dataframe column
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res

# function for parsing gzipped genbank files
def parse_gzipped_gb(path: str) -> t.List[SeqRecord]:
    with gzip.open(path, 'rt') as buffer:
        return list(SeqIO.parse(buffer, 'genbank'))
    

In [None]:
# making a 16s database

In [3]:
dir_path = "/home/is6/ASVs_comparison/samples"
dir_name = []
file_name = []
for path, dirs, files in os.walk(dir_path):
    dir_name.append(dirs)
    file_name.append(files)

In [4]:
new_file_list = list(filter(None,file_name))
new_dir_list = list(dir_name[0])
all_samples = list(zip(new_dir_list,new_file_list))

In [5]:
sample_df = pd.DataFrame(all_samples)
sample_df.columns = ["genera", "samples"]

In [7]:
sample_df['samples'] = sample_df['samples'].astype(str)
new_df = pd.DataFrame(explode(sample_df.assign(samples=sample_df.samples.str.split(',')), 'samples'))


Unnamed: 0,genera,samples
0,Herbidospora,['GCA_005233835.1_ASM523383v1_genomic.gbff.gz'
1,Herbidospora,'GCA_009901565.1_ASM990156v1_genomic.gbff.gz'
2,Herbidospora,'GCA_001570525.1_ASM157052v1_genomic.gbff.gz'
3,Herbidospora,'GCA_001570605.1_ASM157060v1_genomic.gbff.gz'
4,Herbidospora,'GCA_001570585.1_ASM157058v1_genomic.gbff.gz'


In [8]:
new_df.samples = new_df.samples.str.replace("'", '')
new_df.samples = new_df.samples.str.replace("[", '')
new_df.samples = new_df.samples.str.replace("]", '')
new_df.samples = new_df.samples.str.replace(" ", '')
new_df.head()


Unnamed: 0,genera,samples
0,Herbidospora,GCA_005233835.1_ASM523383v1_genomic.gbff.gz
1,Herbidospora,GCA_009901565.1_ASM990156v1_genomic.gbff.gz
2,Herbidospora,GCA_001570525.1_ASM157052v1_genomic.gbff.gz
3,Herbidospora,GCA_001570605.1_ASM157060v1_genomic.gbff.gz
4,Herbidospora,GCA_001570585.1_ASM157058v1_genomic.gbff.gz


In [13]:
!mkdir 16s_database

In [14]:
# main loop to iterate through samples and write final fasta


for i, row in new_df.iterrows():
    file = "/home/is6/ASVs_comparison/16s_database/{}.fasta".format(row.genera)
    x = open(file, "a+")  #open existing fasta file and append to it
    #for path in row.samples:
    path = "/home/is6/ASVs_comparison/samples/{}/{}".format(row.genera, row.samples)
    gbank= parse_gzipped_gb(path)
    
    accession_number = row.samples.replace('.gbff.gz','')
    sample_id, accn, seq_type = accession_number.rsplit('_', 2)

    for genome in gbank:
        for gene in genome.features:
            if gene.type=="rRNA":
                if 'product' in gene.qualifiers:
                    if '16S' in gene.qualifiers['product'][0]:
                        start = gene.location.nofuzzy_start
                        end = gene.location.nofuzzy_end
                        if 'db_xref' in gene.qualifiers:
                            gi=[]
                            gi=str(gene.qualifiers['db_xref'])
                            gi=gi.split(":")[1]
                            gi=gi.split("'")[0]
                            print (">%s %s %s %s\n%s" % (sample_id, accn, seq_type,genome.description,genome.seq[start:end]), file=x)
                        else:
                            print (">%s %s %s %s\n%s" % (sample_id, accn, seq_type, genome.description,genome.seq[start:end]), file=x)
                            
    x.close()


In [19]:
#combining all extracted 16s sequences from each genera in one fsta file
!cat 16s_database/*.fasta > 16s_database/16s_database.fasta

In [20]:
#checking directory files
!ls -ltrh 16s_database

total 300K
-rw-rw-r-- 1 is6 is6 4.3K Jul  1 10:25 Herbidospora.fasta
-rw-rw-r-- 1 is6 is6  18K Jul  1 10:25 Polaromonas.fasta
-rw-rw-r-- 1 is6 is6  26K Jul  1 10:26 Sorangium.fasta
-rw-rw-r-- 1 is6 is6  48K Jul  1 10:26 Chitinophaga.fasta
-rw-rw-r-- 1 is6 is6 1.6K Jul  1 10:26 Conexibacter.fasta
-rw-rw-r-- 1 is6 is6  46K Jul  1 10:26 Variovorax.fasta
-rw-rw-r-- 1 is6 is6 143K Jul  1 10:31 16s_database.fasta
