In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import pandas as pd
import numpy as np

In [2]:
def get_qualifier(feature, qualifier):
    """Return the first value of a feature qualifier, or ``np.nan`` if absent.

    Parameters
    ----------
    feature : object
        Any object exposing a ``qualifiers`` mapping of qualifier name to a
        sequence of values.
    qualifier : str
        Name of the qualifier to look up.

    Returns
    -------
    object
        ``feature.qualifiers[qualifier][0]`` when it exists; otherwise the
        ``np.nan`` object itself.
    """
    try:
        return feature.qualifiers[qualifier][0]
    except (KeyError, IndexError):
        # Only "qualifier missing" / "qualifier has no values" mean "absent".
        # Anything else (e.g. KeyboardInterrupt) must propagate instead of
        # being silently converted to NaN.
        return np.nan
def get_features(gb_file):
    """Collect every feature of a GenBank file into a DataFrame.

    Parameters
    ----------
    gb_file : str or file handle
        Passed straight to ``SeqIO.parse(gb_file, "genbank")``.

    Returns
    -------
    pandas.DataFrame
        One row per feature of every record, with the columns ``type``,
        ``gene``, ``product``, ``len``, ``location`` and ``seq``. ``gene``
        falls back to the feature's ``note`` qualifier when there is no
        ``gene`` qualifier; qualifiers that are absent are ``np.nan``. The
        columns are present even when no feature was found.
    """
    columns = ['type', 'gene', 'product', 'len', 'location', 'seq']
    rows = []
    for rec in SeqIO.parse(gb_file, "genbank"):
        for feature in rec.features:
            gene = get_qualifier(feature, 'gene')
            if pd.isna(gene):
                # No gene name: use the free-text note as the label instead.
                gene = get_qualifier(feature, 'note')
            # Extract the feature sequence once and reuse it for both the
            # length and the sequence columns.
            feature_seq = str(feature.location.extract(rec).seq)
            rows.append({
                'type': feature.type,
                'gene': gene,
                'product': get_qualifier(feature, 'product'),
                'len': len(feature_seq),
                'location': str(feature.location),
                'seq': feature_seq,
            })
    # Fixing the columns keeps the schema stable for a file with no features.
    return pd.DataFrame(rows, columns=columns)

In [3]:
# Build the feature table (one row per annotated feature) from the GenBank file.
df_genes = get_features('Nicotiana_Tabacum_plastome.gb')

In [12]:
# Number of features of each type in the table.
df_genes.groupby(by='type').size()

type
CDS               98
exon              49
gene             144
intron            16
misc_feature      10
rRNA               8
rep_origin         4
repeat_region      2
source             1
tRNA              37
dtype: int64

In [15]:
# Show only the rows whose feature type is rRNA.
is_rrna = df_genes['type'] == 'rRNA'
df_genes.loc[is_rrna]

Unnamed: 0,type,gene,product,len,location,seq
256,rRNA,,16S ribosomal RNA,1491,[102761:104252](+),TCTCATGGAGAGTTCGATCCTGGCTCAGGATGAACGCTGGCGGCAT...
267,rRNA,,23S ribosomal RNA,2810,[106330:109140](+),TTCAAACGAGGAAAGGCTTACGGTGGATACCTAGGCACCCAGAGAC...
269,rRNA,,4.5S ribosomal RNA,103,[109241:109344](+),GAAGGTCACGGCGAGACGAGCCGTTTATCATTACGATAGGTGTCAA...
271,rRNA,,5S ribosomal RNA,121,[109600:109721](+),TATTCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGA...
323,rRNA,,5S ribosomal RNA,121,[132908:133029](-),TATTCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGA...
325,rRNA,,4.5S ribosomal RNA,103,[133285:133388](-),GAAGGTCACGGCGAGACGAGCCGTTTATCATTACGATAGGTGTCAA...
327,rRNA,,23S ribosomal RNA,2810,[133489:136299](-),TTCAAACGAGGAAAGGCTTACGGTGGATACCTAGGCACCCAGAGAC...
338,rRNA,,16S ribosomal RNA,1491,[138377:139868](-),TCTCATGGAGAGTTCGATCCTGGCTCAGGATGAACGCTGGCGGCAT...


In [14]:
# Unique gene names among the 'gene' and 'rRNA' features, in order of first
# appearance in df_genes, written one per line to a text file.
gene_like = df_genes['type'].isin(['gene', 'rRNA'])
genes = (
    df_genes.loc[gene_like, ['gene']]
    # Drop missing names explicitly instead of relying on how
    # groupby('gene').head(1) treats NaN keys; a NaN name would otherwise
    # risk ending up as a blank line in the output file.
    .dropna(subset=['gene'])
    .drop_duplicates(subset='gene', keep='first')
    .reset_index(drop=True)
)
# NOTE(review): the rRNA rows displayed in this notebook have an empty 'gene'
# column, so they contribute no names here -- confirm whether 'product'
# should be used as a fallback for them.
genes['gene'].to_csv('Nicotiana_genes_sorted.txt', header=False, index=False)