In [72]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import pandas as pd
import numpy as np

In [73]:
def get_qualifier(feature, qualifier):
    try:
        return feature.qualifiers[qualifier][0]
    except:
        return np.nan
def get_features(gb_file):
    seq_dc=[]
    for rec in SeqIO.parse(gb_file, "genbank"):
        for feature in rec.features:
            feat_dc = {'type':feature.type}
            feat_dc['gene'] = get_qualifier(feature,'gene')
            feat_dc['product'] = get_qualifier(feature,'product')
#             feat_dc['info'] = get_qualifier(feature,'info')
            if feat_dc['gene'] is np.nan:
                feat_dc['gene'] = get_qualifier(feature,'note')
            feat_dc['len'] = len(feature.location.extract(rec).seq)
            feat_dc['location'] = str(feature.location)
            feat_dc['seq'] = str(feature.location.extract(rec).seq)
            seq_dc.append(feat_dc)
    return pd.DataFrame(seq_dc)

In [74]:
def write_fasta(seq_df, sample_stats, fasta_file):
    Records = []
    for idx, row in seq_df.iterrows():
        record = SeqRecord( Seq(row.seq), id=sample_stats['Sample'].replace('-','_') + '-' + row.gene.replace('-','_'), 
                description= ';product=' + str(row['product']) + ', location=' + str(row['location']) + ';' )
        Records.append(record)
    SeqIO.write(Records,fasta_file,format='fasta')

In [75]:
ref_dir = 'References/'
all_gb_files = [ref_dir + file for file in os.listdir(ref_dir) if file.endswith(".gb")]
# all_gb_files = ['References/NC_003386.1.gb']
print(len(all_gb_files))

Samples_ls = []
for gb_file in all_gb_files:
    # Get features
    seq_df = get_features(gb_file)

    # Get raw feature stats
    sample_stats={'Sample':gb_file.split('/')[-1].replace('.gb','')}
    sample_stats['Nfeatures_raw']=seq_df.shape[0];sample_stats['Ngenes_raw']=seq_df.gene.nunique()
    feat_raw_stats = seq_df.groupby('type').size().sort_values(ascending=False).to_dict()

    # Removing fragments
    seq_df = seq_df[seq_df.gene.str.contains('-fragment')==False]
    # Keeping CDS, rRNA, repeat_region
    accepted_types = ['CDS','rRNA','tRNA'] # Could add repeat_region
    seq_gene = seq_df[seq_df.type=='gene']
    seq_df = seq_df[seq_df.type.isin(accepted_types)]
    seq_gene = seq_gene[seq_gene.gene.isin(seq_df.gene)==False]
    sample_stats['Nfeatures_sel']=seq_df.shape[0];sample_stats['Ngenes_sel']=seq_df.gene.nunique()

    # Removing duplicates
    sample_stats['Nidentical'] = seq_df[seq_df.duplicated(keep=False)].gene.nunique()
    sample_stats['Nduplicates'] = seq_df[seq_df.duplicated(subset='gene',keep=False)].gene.nunique()
    seq_df = seq_df.sort_values('len',ascending=False).groupby('gene').head(1) # Keep longest sequence per gene
    sample_stats['Nfeatures']=seq_df.shape[0];sample_stats['Ngenes']=seq_df.gene.nunique()
    Samples_ls.append(sample_stats)
    print(sample_stats)
    print(feat_raw_stats)
    print(seq_df.groupby('type').size().sort_values(ascending=False).to_dict())

    seq_df = seq_df.sort_values(['type','gene']).reset_index(drop=True)
    write_fasta(seq_df, sample_stats, fasta_file = gb_file.replace('.gb','_CDS.fasta'))

124
{'Sample': 'NC_003386.1', 'Nfeatures_raw': 546, 'Ngenes_raw': 129, 'Nfeatures_sel': 208, 'Ngenes_sel': 119, 'Nidentical': 0, 'Nduplicates': 49, 'Nfeatures': 119, 'Ngenes': 119}
{'gene': 207, 'CDS': 125, 'exon': 72, 'tRNA': 67, 'intron': 49, 'rRNA': 21, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 81, 'tRNA': 34, 'rRNA': 4}
{'Sample': 'NC_005086.1', 'Nfeatures_raw': 467, 'Ngenes_raw': 120, 'Nfeatures_sel': 177, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 38, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 169, 'CDS': 100, 'exon': 76, 'tRNA': 67, 'intron': 40, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 80, 'tRNA': 31, 'rRNA': 4}
{'Sample': 'NC_009962.1', 'Nfeatures_raw': 448, 'Ngenes_raw': 119, 'Nfeatures_sel': 169, 'Ngenes_sel': 114, 'Nidentical': 0, 'Nduplicates': 32, 'Nfeatures': 114, 'Ngenes': 114}
{'gene': 160, 'CDS': 95, 'exon': 73, 'tRNA': 64, 'intron': 41, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 



{'Sample': 'NC_016986.1', 'Nfeatures_raw': 514, 'Ngenes_raw': 131, 'Nfeatures_sel': 197, 'Ngenes_sel': 126, 'Nidentical': 0, 'Nduplicates': 47, 'Nfeatures': 126, 'Ngenes': 126}
{'gene': 188, 'CDS': 109, 'exon': 81, 'tRNA': 76, 'intron': 43, 'rRNA': 12, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 85, 'tRNA': 37, 'rRNA': 4}
{'Sample': 'NC_017006.1', 'Nfeatures_raw': 507, 'Ngenes_raw': 127, 'Nfeatures_sel': 196, 'Ngenes_sel': 120, 'Nidentical': 0, 'Nduplicates': 44, 'Nfeatures': 120, 'Ngenes': 120}
{'gene': 197, 'CDS': 118, 'exon': 64, 'tRNA': 63, 'intron': 38, 'rRNA': 22, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 84, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_020146.1', 'Nfeatures_raw': 548, 'Ngenes_raw': 131, 'Nfeatures_sel': 213, 'Ngenes_sel': 121, 'Nidentical': 0, 'Nduplicates': 54, 'Nfeatures': 121, 'Ngenes': 121}
{'gene': 214, 'CDS': 134, 'exon': 67, 'tRNA': 63, 'intron': 43, 'rRNA': 22, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 84, 'tR



{'Sample': 'NC_020372.1', 'Nfeatures_raw': 452, 'Ngenes_raw': 119, 'Nfeatures_sel': 175, 'Ngenes_sel': 114, 'Nidentical': 0, 'Nduplicates': 41, 'Nfeatures': 114, 'Ngenes': 114}
{'gene': 166, 'CDS': 101, 'exon': 69, 'tRNA': 63, 'intron': 36, 'rRNA': 12, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 31, 'rRNA': 4}
{'Sample': 'NC_021426.1', 'Nfeatures_raw': 480, 'Ngenes_raw': 122, 'Nfeatures_sel': 178, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 38, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 171, 'CDS': 102, 'exon': 79, 'tRNA': 64, 'intron': 43, 'rRNA': 16, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_022137.1', 'Nfeatures_raw': 540, 'Ngenes_raw': 126, 'Nfeatures_sel': 207, 'Ngenes_sel': 117, 'Nidentical': 0, 'Nduplicates': 55, 'Nfeatures': 117, 'Ngenes': 117}
{'gene': 210, 'CDS': 138, 'exon': 69, 'tRNA': 57, 'intron': 41, 'rRNA': 20, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 85, 'tR



{'Sample': 'NC_024929.1', 'Nfeatures_raw': 483, 'Ngenes_raw': 121, 'Nfeatures_sel': 180, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 37, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 167, 'CDS': 102, 'exon': 85, 'tRNA': 66, 'intron': 44, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_025745.1', 'Nfeatures_raw': 389, 'Ngenes_raw': 119, 'Nfeatures_sel': 155, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 32, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 149, 'CDS': 100, 'exon': 53, 'tRNA': 49, 'intron': 29, 'rRNA': 8, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_026295.1', 'Nfeatures_raw': 465, 'Ngenes_raw': 125, 'Nfeatures_sel': 181, 'Ngenes_sel': 121, 'Nidentical': 0, 'Nduplicates': 42, 'Nfeatures': 121, 'Ngenes': 121}
{'gene': 179, 'CDS': 117, 'exon': 63, 'tRNA': 53, 'intron': 39, 'rRNA': 13, 'source': 1}
{'CDS': 83, 'tRNA': 34, 'rRNA': 4}
{'Sample': 'NC_026301.1', 'Nfeatures_raw': 405, 'Ngenes_raw



{'Sample': 'NC_027512.1', 'Nfeatures_raw': 480, 'Ngenes_raw': 126, 'Nfeatures_sel': 187, 'Ngenes_sel': 122, 'Nidentical': 0, 'Nduplicates': 38, 'Nfeatures': 122, 'Ngenes': 122}
{'gene': 179, 'CDS': 108, 'exon': 72, 'tRNA': 67, 'intron': 37, 'rRNA': 12, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 86, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_027829.1', 'Nfeatures_raw': 472, 'Ngenes_raw': 105, 'Nfeatures_sel': 177, 'Ngenes_sel': 102, 'Nidentical': 0, 'Nduplicates': 42, 'Nfeatures': 102, 'Ngenes': 102}
{'gene': 174, 'CDS': 95, 'exon': 74, 'tRNA': 66, 'intron': 40, 'rRNA': 22, 'source': 1}
{'CDS': 68, 'tRNA': 30, 'rRNA': 4}
{'Sample': 'NC_029427.1', 'Nfeatures_raw': 454, 'Ngenes_raw': 120, 'Nfeatures_sel': 168, 'Ngenes_sel': 114, 'Nidentical': 0, 'Nduplicates': 32, 'Nfeatures': 114, 'Ngenes': 114}
{'gene': 157, 'CDS': 97, 'exon': 80, 'tRNA': 62, 'intron': 43, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 31, 'rRNA': 4}
{'Sample': 'NC_029433



{'Sample': 'NC_034686.1', 'Nfeatures_raw': 470, 'Ngenes_raw': 122, 'Nfeatures_sel': 180, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 171, 'CDS': 102, 'exon': 73, 'tRNA': 69, 'intron': 40, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_034942.1', 'Nfeatures_raw': 443, 'Ngenes_raw': 123, 'Nfeatures_sel': 173, 'Ngenes_sel': 120, 'Nidentical': 0, 'Nduplicates': 40, 'Nfeatures': 120, 'Ngenes': 120}
{'gene': 169, 'CDS': 110, 'exon': 64, 'tRNA': 56, 'intron': 34, 'rRNA': 9, 'source': 1}
{'CDS': 82, 'tRNA': 34, 'rRNA': 4}
{'Sample': 'NC_035050.1', 'Nfeatures_raw': 492, 'Ngenes_raw': 120, 'Nfeatures_sel': 195, 'Ngenes_sel': 114, 'Nidentical': 0, 'Nduplicates': 38, 'Nfeatures': 114, 'Ngenes': 114}
{'gene': 188, 'CDS': 114, 'tRNA': 70, 'exon': 68, 'intron': 35, 'rRNA': 12, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 78, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_03549



{'Sample': 'NC_036154.1', 'Nfeatures_raw': 459, 'Ngenes_raw': 122, 'Nfeatures_sel': 176, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 36, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 169, 'CDS': 97, 'exon': 70, 'tRNA': 70, 'intron': 38, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_036304.1', 'Nfeatures_raw': 443, 'Ngenes_raw': 123, 'Nfeatures_sel': 168, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 33, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 158, 'CDS': 98, 'exon': 72, 'tRNA': 62, 'intron': 38, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}




{'Sample': 'NC_036416.1', 'Nfeatures_raw': 476, 'Ngenes_raw': 122, 'Nfeatures_sel': 178, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 36, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 168, 'CDS': 96, 'exon': 80, 'tRNA': 68, 'intron': 43, 'rRNA': 16, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_036660.1', 'Nfeatures_raw': 475, 'Ngenes_raw': 122, 'Nfeatures_sel': 180, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 172, 'CDS': 100, 'exon': 75, 'tRNA': 68, 'intron': 39, 'rRNA': 16, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 78, 'tRNA': 33, 'rRNA': 4}
{'Sample': 'NC_036960.1', 'Nfeatures_raw': 481, 'Ngenes_raw': 123, 'Nfeatures_sel': 180, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 171, 'CDS': 101, 'exon': 79, 'tRNA': 68, 'intron': 43, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRN



{'Sample': 'NC_037948.1', 'Nfeatures_raw': 478, 'Ngenes_raw': 122, 'Nfeatures_sel': 178, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 38, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 171, 'CDS': 102, 'exon': 78, 'tRNA': 64, 'intron': 42, 'rRNA': 16, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_038057.1', 'Nfeatures_raw': 463, 'Ngenes_raw': 121, 'Nfeatures_sel': 173, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 34, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 161, 'CDS': 96, 'exon': 81, 'tRNA': 67, 'intron': 43, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_038074.1', 'Nfeatures_raw': 527, 'Ngenes_raw': 123, 'Nfeatures_sel': 204, 'Ngenes_sel': 122, 'Nidentical': 0, 'Nduplicates': 51, 'Nfeatures': 122, 'Ngenes': 122}
{'gene': 206, 'CDS': 126, 'exon': 71, 'tRNA': 64, 'intron': 41, 'rRNA': 18, 'source': 1}
{'CDS': 85, 'tRNA': 33, 'rRNA': 4}
{'Sample': 'NC_03809



{'Sample': 'NC_039424.1', 'Nfeatures_raw': 461, 'Ngenes_raw': 121, 'Nfeatures_sel': 177, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 36, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 168, 'CDS': 96, 'exon': 72, 'tRNA': 71, 'intron': 39, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_039600.1', 'Nfeatures_raw': 475, 'Ngenes_raw': 119, 'Nfeatures_sel': 179, 'Ngenes_sel': 114, 'Nidentical': 0, 'Nduplicates': 40, 'Nfeatures': 114, 'Ngenes': 114}
{'gene': 168, 'CDS': 101, 'exon': 79, 'tRNA': 67, 'intron': 44, 'rRNA': 11, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 31, 'rRNA': 4}




{'Sample': 'NC_039627.1', 'Nfeatures_raw': 492, 'Ngenes_raw': 124, 'Nfeatures_sel': 182, 'Ngenes_sel': 116, 'Nidentical': 0, 'Nduplicates': 37, 'Nfeatures': 116, 'Ngenes': 116}
{'gene': 176, 'CDS': 101, 'exon': 81, 'tRNA': 67, 'intron': 42, 'rRNA': 20, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 80, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_039676.1', 'Nfeatures_raw': 487, 'Ngenes_raw': 123, 'Nfeatures_sel': 185, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 40, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 176, 'CDS': 104, 'exon': 77, 'tRNA': 70, 'intron': 41, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_039731.1', 'Nfeatures_raw': 461, 'Ngenes_raw': 121, 'Nfeatures_sel': 173, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 161, 'CDS': 98, 'exon': 79, 'tRNA': 65, 'intron': 43, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRN



{'Sample': 'NC_039803.1', 'Nfeatures_raw': 455, 'Ngenes_raw': 122, 'Nfeatures_sel': 169, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 33, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 158, 'CDS': 95, 'exon': 79, 'tRNA': 65, 'intron': 43, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_039815.1', 'Nfeatures_raw': 489, 'Ngenes_raw': 123, 'Nfeatures_sel': 187, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 41, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 180, 'CDS': 105, 'exon': 74, 'tRNA': 71, 'intron': 40, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_040160.1', 'Nfeatures_raw': 369, 'Ngenes_raw': 117, 'Nfeatures_sel': 146, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 25, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 139, 'CDS': 89, 'exon': 54, 'tRNA': 52, 'intron': 29, 'rRNA': 5, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_040219.



{'Sample': 'NC_041000.1', 'Nfeatures_raw': 539, 'Ngenes_raw': 127, 'Nfeatures_sel': 205, 'Ngenes_sel': 119, 'Nidentical': 0, 'Nduplicates': 48, 'Nfeatures': 119, 'Ngenes': 119}
{'gene': 207, 'CDS': 135, 'exon': 70, 'tRNA': 57, 'intron': 45, 'rRNA': 20, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 85, 'tRNA': 30, 'rRNA': 4}
{'Sample': 'NC_041127.1', 'Nfeatures_raw': 444, 'Ngenes_raw': 123, 'Nfeatures_sel': 168, 'Ngenes_sel': 116, 'Nidentical': 0, 'Nduplicates': 32, 'Nfeatures': 116, 'Ngenes': 116}
{'gene': 158, 'CDS': 95, 'exon': 73, 'tRNA': 64, 'intron': 39, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 80, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_041261.1', 'Nfeatures_raw': 520, 'Ngenes_raw': 124, 'Nfeatures_sel': 191, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 188, 'CDS': 109, 'exon': 84, 'tRNA': 67, 'intron': 45, 'rRNA': 22, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRN



{'Sample': 'NC_042940.1', 'Nfeatures_raw': 510, 'Ngenes_raw': 122, 'Nfeatures_sel': 192, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 42, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 181, 'CDS': 111, 'exon': 83, 'tRNA': 68, 'intron': 48, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_043796.1', 'Nfeatures_raw': 487, 'Ngenes_raw': 124, 'Nfeatures_sel': 183, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 175, 'CDS': 107, 'exon': 77, 'tRNA': 64, 'intron': 43, 'rRNA': 16, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 78, 'tRNA': 33, 'rRNA': 4}
{'Sample': 'NC_043800.1', 'Nfeatures_raw': 464, 'Ngenes_raw': 122, 'Nfeatures_sel': 174, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 32, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 165, 'CDS': 96, 'exon': 77, 'tRNA': 66, 'intron': 41, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRN



{'Sample': 'NC_044826.1', 'Nfeatures_raw': 461, 'Ngenes_raw': 121, 'Nfeatures_sel': 173, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 35, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 161, 'CDS': 98, 'exon': 79, 'tRNA': 65, 'intron': 43, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_044828.1', 'Nfeatures_raw': 459, 'Ngenes_raw': 123, 'Nfeatures_sel': 172, 'Ngenes_sel': 116, 'Nidentical': 0, 'Nduplicates': 33, 'Nfeatures': 116, 'Ngenes': 116}
{'gene': 161, 'CDS': 97, 'exon': 78, 'tRNA': 66, 'intron': 42, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 80, 'tRNA': 32, 'rRNA': 4}




{'Sample': 'NC_045081.1', 'Nfeatures_raw': 469, 'Ngenes_raw': 121, 'Nfeatures_sel': 182, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 41, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 174, 'CDS': 101, 'tRNA': 71, 'exon': 70, 'intron': 38, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_045235.1', 'Nfeatures_raw': 466, 'Ngenes_raw': 119, 'Nfeatures_sel': 176, 'Ngenes_sel': 112, 'Nidentical': 0, 'Nduplicates': 42, 'Nfeatures': 112, 'Ngenes': 112}
{'gene': 164, 'CDS': 103, 'exon': 77, 'tRNA': 65, 'intron': 42, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 77, 'tRNA': 31, 'rRNA': 4}




{'Sample': 'NC_045274.1', 'Nfeatures_raw': 462, 'Ngenes_raw': 121, 'Nfeatures_sel': 173, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 36, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 163, 'CDS': 95, 'exon': 78, 'tRNA': 68, 'intron': 43, 'rRNA': 10, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_045280.1', 'Nfeatures_raw': 484, 'Ngenes_raw': 120, 'Nfeatures_sel': 184, 'Ngenes_sel': 113, 'Nidentical': 0, 'Nduplicates': 39, 'Nfeatures': 113, 'Ngenes': 113}
{'gene': 177, 'CDS': 103, 'exon': 74, 'tRNA': 69, 'intron': 42, 'rRNA': 14, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 77, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_045294.1', 'Nfeatures_raw': 497, 'Ngenes_raw': 124, 'Nfeatures_sel': 186, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 38, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 179, 'CDS': 104, 'exon': 80, 'tRNA': 69, 'intron': 42, 'rRNA': 18, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRN



{'Sample': 'NC_051913.1', 'Nfeatures_raw': 471, 'Ngenes_raw': 123, 'Nfeatures_sel': 171, 'Ngenes_sel': 115, 'Nidentical': 0, 'Nduplicates': 33, 'Nfeatures': 115, 'Ngenes': 115}
{'gene': 165, 'CDS': 97, 'exon': 80, 'tRNA': 63, 'intron': 45, 'rRNA': 16, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 79, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_051971.1', 'Nfeatures_raw': 470, 'Ngenes_raw': 120, 'Nfeatures_sel': 177, 'Ngenes_sel': 114, 'Nidentical': 0, 'Nduplicates': 37, 'Nfeatures': 114, 'Ngenes': 114}
{'gene': 166, 'CDS': 97, 'exon': 79, 'tRNA': 68, 'intron': 43, 'rRNA': 12, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 78, 'tRNA': 32, 'rRNA': 4}
{'Sample': 'NC_051978.1', 'Nfeatures_raw': 518, 'Ngenes_raw': 129, 'Nfeatures_sel': 194, 'Ngenes_sel': 122, 'Nidentical': 0, 'Nduplicates': 46, 'Nfeatures': 122, 'Ngenes': 122}
{'gene': 193, 'CDS': 114, 'exon': 74, 'tRNA': 66, 'intron': 46, 'rRNA': 20, 'misc_feature': 2, 'repeat_region': 2, 'source': 1}
{'CDS': 85, 'tRNA

In [76]:
Samples_stats = pd.DataFrame(Samples_ls)
Samples_stats.to_csv(ref_dir + 'Ref_stats.csv',index=False)
print('Ngenes:',Samples_stats.Ngenes.quantile([.1,.5,.9]).astype(int).to_dict())
print('Nduplicates:',Samples_stats.Nduplicates.quantile([.1,.5,.9]).astype(int).to_dict())
Samples_stats

Ngenes: {0.1: 113, 0.5: 115, 0.9: 119}
Nduplicates: {0.1: 32, 0.5: 36, 0.9: 47}


Unnamed: 0,Sample,Nfeatures_raw,Ngenes_raw,Nfeatures_sel,Ngenes_sel,Nidentical,Nduplicates,Nfeatures,Ngenes
0,NC_003386.1,546,129,208,119,0,49,119,119
1,NC_005086.1,467,120,177,115,0,38,115,115
2,NC_009962.1,448,119,169,114,0,32,114,114
3,NC_010323.1,455,121,170,115,0,32,115,115
4,NC_010654.1,390,103,148,96,0,25,96,96
...,...,...,...,...,...,...,...,...,...
119,NC_053912.1,453,120,170,114,0,33,114,114
120,NC_054296.1,457,120,171,114,0,33,114,114
121,NC_054306.1,490,121,181,114,0,38,114,114
122,NC_054357.1,436,108,168,101,0,39,101,101


In [77]:
# Concat .fasta files in a single fasta
all_fa_files = [file for file in os.listdir(ref_dir) if file.endswith("_CDS.fasta")]
print(len(all_fa_files))
all_seqs=[]; ls_id=[]
for ifa in all_fa_files:
    for record in SeqIO.parse(ref_dir + ifa,format='fasta'):
        all_seqs.append(record)
SeqIO.write(all_seqs,ref_dir + 'cpDNA_RefCDS_raw.fasta',format='fasta')
len(all_seqs)

124


14213

In [78]:
# https://blog.finxter.com/how-to-extract-numbers-from-a-string-in-python/
import re
def get_loc(sentence):
    return [float(s) for s in re.findall(r'-?\d+\.?\d*', sentence)]
print(get_loc('join{[126649:127202](-), [125052:125582](-)}	'))

ls_loc=[76120.0, 76234.0, 105286.0, 105518.0, 104729.0, 104829.0]
def len_exons(ls_loc):
    len_exon=0;
    for i in range(0,len(ls_loc),2):
        exon = ls_loc[i:i+2]
        len_exon += max(exon) - min(exon)
    return len_exon
def len_introns(ls_loc):
    len_intron=0;
    if len(ls_loc)/4>0:
        for i in range(1,len(ls_loc),4):
            intron = ls_loc[i:i+2]
            len_intron += max(intron) - min(intron)
    return len_intron
print(len_exons(ls_loc),len_introns(ls_loc))

[126649.0, 127202.0, 125052.0, 125582.0]
446.0 29052.0


In [79]:
ref_stats={}
for record in all_seqs:
    location = record.description.split('location=')[1].split(';')[0]
    ref_stats[record.id]={'len':len(record),'join':'join' in location,'forward':'+' in location,'reverse':'-' in location,
                         'location':location}
ref_stats = pd.DataFrame.from_dict(ref_stats,orient='index').reset_index().rename(columns={'index':'seqid'})
ref_stats[['Accession','gene']]=ref_stats.seqid.str.split('-',expand=True)
ref_stats['ls_loc'] = ref_stats.apply(lambda row: get_loc(row['location']),axis=1)
ref_stats['N_exons'] = ref_stats.apply(lambda row: len(row['ls_loc'])/2,axis=1)
ref_stats['len_exons'] = ref_stats.apply(lambda row: len_exons(row['ls_loc']),axis=1)
ref_stats['len_introns'] = ref_stats.apply(lambda row: len_introns(row['ls_loc']),axis=1)
ref_stats[:2]

Unnamed: 0,seqid,len,join,forward,reverse,location,Accession,gene,ls_loc,N_exons,len_exons,len_introns
0,NC_003386.1-accD,929,False,True,False,[57912:58841](+),NC_003386.1,accD,"[57912.0, 58841.0]",1.0,929.0,0.0
1,NC_003386.1-atpA,1527,False,False,True,[15691:17218](-),NC_003386.1,atpA,"[15691.0, 17218.0]",1.0,1527.0,0.0


In [82]:
gene_stats = ref_stats.groupby('gene').agg({'len':['min','median','max'],'Accession':['count'],'forward':['sum'],
                            'reverse':['sum'],'join':['sum'],'N_exons':['mean'],'len_exons':['median'],'len_introns':['median','max']})
gene_stats.columns = ['_'.join(col).strip() for col in gene_stats.columns.values]
gene_stats.reset_index().to_csv(ref_dir + 'gene_stats.csv',index=False)
gene_stats[gene_stats.join_sum>5]

Unnamed: 0_level_0,len_min,len_median,len_max,Accession_count,forward_sum,reverse_sum,join_sum,N_exons_mean,len_exons_median,len_introns_median,len_introns_max
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
atpF,424,555.0,576,124,13,111,123,1.991935,555.0,1279.5,2338.0
clpP1,273,591.0,948,121,10,111,106,2.727273,591.0,1160.0,1954.0
ndhA,301,1092.0,1119,116,23,93,113,1.974138,1092.0,2154.0,2553.0
ndhB,311,1533.0,1569,119,46,73,114,1.957983,1533.0,2203.0,2295.0
pafI,228,507.0,543,124,11,113,121,2.943548,507.0,1081.0,1907.0
petB,489,648.0,708,124,116,8,113,1.91129,648.0,781.5,1469.0
petD,479,483.0,539,124,116,8,90,1.725806,483.0,716.0,1226.0
rpl16,9,408.0,492,124,8,116,102,1.822581,408.0,1406.0,2195.0
rpl2,392,825.0,849,123,57,66,117,1.95935,825.0,1285.0,158597.0
rpoC1,1359,2049.0,2118,124,14,110,118,1.951613,2049.0,2795.0,3127.0


In [83]:
print(gene_stats.shape[0],gene_stats.Accession_count.sort_values(ascending=False).to_dict())

144 {'rpoC2': 124, 'rps12': 124, 'atpA': 124, 'rpoC1': 124, 'rpoB': 124, 'rpoA': 124, 'rpl36': 124, 'rpl33': 124, 'rpl20': 124, 'rpl16': 124, 'rpl14': 124, 'rbcL': 124, 'psbZ': 124, 'psbT': 124, 'psbM': 124, 'psbL': 124, 'psbK': 124, 'psbI': 124, 'psbH': 124, 'psbF': 124, 'psbE': 124, 'rps11': 124, 'rps14': 124, 'psbC': 124, 'rps18': 124, 'trnY_GUA': 124, 'trnW_CCA': 124, 'trnR_ACG': 124, 'trnQ_UUG': 124, 'trnM_CAU': 124, 'trnH_GUG': 124, 'trnE_UUC': 124, 'trnD_GUC': 124, 'trnC_GCA': 124, 'rrn5': 124, 'rrn4.5': 124, 'rrn23': 124, 'rrn16': 124, 'rps8': 124, 'rps7': 124, 'rps4': 124, 'rps3': 124, 'rps2': 124, 'rps19': 124, 'psbD': 124, 'ycf2': 124, 'psbB': 124, 'petG': 124, 'ccsA': 124, 'atpI': 124, 'pafI': 124, 'pafII': 124, 'pbf1': 124, 'atpH': 124, 'atpF': 124, 'petA': 124, 'petB': 124, 'petD': 124, 'psbA': 124, 'psaA': 124, 'psaB': 124, 'psaC': 124, 'atpB': 124, 'petN': 124, 'psaI': 124, 'psaJ': 124, 'atpE': 124, 'trnS_GCU': 123, 'trnL_UAG': 123, 'trnF_GAA': 123, 'trnL_UAA': 123, 'tr

In [84]:
# Remove outliers
for idx, record in enumerate(all_seqs):
    if (len(record) > gene_stats.loc[record.id.split('-')[1],:]['len_median']*2) | \
        (len(record) < gene_stats.loc[record.id.split('-')[1],:]['len_median']/2):
        print(record.id,'removed',len(record),gene_stats.loc[record.id.split('-')[1],:]['len_median'])
        del all_seqs[idx]
print(len(all_seqs))

NC_003386.1-cemA removed 1515 690.0
NC_003386.1-trnL_UAA removed 472 85.0
NC_005086.1-trnV_UAC removed 168 74.0
NC_010323.1-infA removed 105 234.0
NC_010654.1-trnS_UGA removed 402 93.0
NC_011156.4-cysT removed 170 474.0
NC_011156.4-ndhB removed 311 1533.0
NC_011156.4-ndhD removed 114 1503.0
NC_011156.4-rpl32 removed 456 171.0
NC_011156.4-rps16 removed 66 261.0
NC_011156.4-trnL_UAA removed 35 85.0
NC_012818.1-cemA removed 1521 690.0
NC_012818.1-clpP1 removed 273 591.0
NC_012818.1-rpoB removed 696 3213.0
NC_016986.1-rpl21 removed 39 360.0
NC_016986.1-rpl23 removed 81 282.0
NC_016986.1-trnS_AGA removed 336 108.0
NC_017006.1-cemA removed 1518 690.0
NC_017006.1-trnG_UCC removed 23 71.0
NC_020146.1-cemA removed 1428 690.0
NC_020146.1-rpl16 removed 81 408.0
NC_020146.1-ycf2 removed 3409 6862.5
NC_020146.1-trnL_UAA removed 345 85.0
NC_022137.1-cemA removed 1533 690.0
NC_022137.1-rps11 removed 168 417.0
NC_024157.1-cemA removed 1521 690.0
NC_024157.1-trnI_GAU removed 295 88.0
NC_024158.1-cemA r

In [86]:
ref_stats={}
for record in all_seqs:
    location = record.description.split('location=')[1].split(';')[0]
    ref_stats[record.id]={'len':len(record),'join':'join' in location,'forward':'+' in location,'reverse':'-' in location,
                         'location':location}
ref_stats = pd.DataFrame.from_dict(ref_stats,orient='index').reset_index().rename(columns={'index':'seqid'})
ref_stats[['Accession','gene']]=ref_stats.seqid.str.split('-',expand=True)
ref_stats['ls_loc'] = ref_stats.apply(lambda row: get_loc(row['location']),axis=1)
ref_stats['N_exons'] = ref_stats.apply(lambda row: len(row['ls_loc'])/2,axis=1)
gene_stats = ref_stats.groupby('gene').agg({'len':['min','median','max'],'Accession':['count'],
                                           'forward':['sum'],'reverse':['sum'],'join':['sum'],'N_exons':['mean']})
gene_stats.columns = ['_'.join(col).strip() for col in gene_stats.columns.values]
gene_stats.reset_index().to_csv(ref_dir + 'gene_stats.csv',index=False)
print(gene_stats.shape[0])
gene_stats[:2]

144


Unnamed: 0_level_0,len_min,len_median,len_max,Accession_count,forward_sum,reverse_sum,join_sum,N_exons_mean
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
accD,746,1476.0,1893,112,101,11,1,1.008929
atpA,882,1524.0,1545,124,14,110,0,1.0


In [87]:
gene_stats_clean = gene_stats[gene_stats.Accession_count>(gene_stats.Accession_count.max()*0.33)].astype({'len_median':'int'})
gene_stats_clean['N_exons_mean'] = gene_stats_clean['N_exons_mean'].round(2)
gene_stats_clean.reset_index().to_csv(ref_dir + 'cleaned_gene_stats.csv',index=False)
print(gene_stats_clean.shape[0],gene_stats_clean.Accession_count.sort_values(ascending=False).to_dict())
gene_stats_clean

113 {'psaI': 124, 'psaB': 124, 'rps19': 124, 'psaJ': 124, 'psbA': 124, 'psbB': 124, 'psbC': 124, 'psbD': 124, 'psbE': 124, 'psbF': 124, 'psbH': 124, 'psbI': 124, 'psbK': 124, 'psbL': 124, 'psbM': 124, 'psbT': 124, 'psbZ': 124, 'rbcL': 124, 'rpl14': 124, 'rrn4.5': 124, 'rrn23': 124, 'rpl20': 124, 'atpA': 124, 'rrn16': 124, 'rps8': 124, 'rpoA': 124, 'rpoC1': 124, 'rpoC2': 124, 'rps4': 124, 'rps14': 124, 'rps3': 124, 'psaC': 124, 'rps18': 124, 'psaA': 124, 'trnE_UUC': 124, 'atpB': 124, 'atpE': 124, 'atpF': 124, 'atpH': 124, 'atpI': 124, 'ccsA': 124, 'trnY_GUA': 124, 'trnW_CCA': 124, 'trnR_ACG': 124, 'petN': 124, 'trnM_CAU': 124, 'trnQ_UUG': 124, 'pbf1': 124, 'petG': 124, 'trnD_GUC': 124, 'trnC_GCA': 124, 'petD': 124, 'petB': 124, 'rrn5': 124, 'pafII': 124, 'trnF_GAA': 123, 'rps2': 123, 'petL': 123, 'rps12': 123, 'petA': 123, 'rps7': 123, 'rpl33': 123, 'rpl36': 123, 'trnL_UAG': 123, 'trnS_GCU': 123, 'trnR_UCU': 123, 'trnP_UGG': 123, 'trnN_GUU': 123, 'psbJ': 123, 'trnS_GGA': 122, 'rpl22': 1

Unnamed: 0_level_0,len_min,len_median,len_max,Accession_count,forward_sum,reverse_sum,join_sum,N_exons_mean
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
accD,746,1476,1893,112,101,11,1,1.01
atpA,882,1524,1545,124,14,110,0,1.00
atpB,849,1497,1520,124,12,112,0,1.00
atpE,396,402,450,124,13,111,0,1.00
atpF,424,555,576,124,13,111,123,1.99
...,...,...,...,...,...,...,...,...
trnV_UAC,37,74,119,113,9,104,104,1.92
trnW_CCA,74,74,76,124,11,113,1,1.01
trnY_GUA,82,86,89,124,16,108,0,1.00
ycf1,3164,5538,6951,113,29,84,1,1.01


In [88]:
# Remove genes not kept
for idx, record in enumerate(all_seqs):
    if (record.id.split('-')[1] not in gene_stats_clean.index):
        print(record.id,'removed',end=', ')
        del all_seqs[idx]
print(len(all_seqs))

psaM removed, rpl21 removed, trnF_AAA removed, trnP_GGG removed, trnR_CCG removed, trnT_CGU removed, psb30 removed, trnS_AGA removed, psb30 removed, trnG_ACC removed, trnP_GGG removed, trnR_CCG removed, chlB removed, chlN removed, psaM removed, trnP_GGG removed, trnR_CCG removed, trnY_AUA removed, trnfM_CAU removed, chlB removed, chlN removed, psaM removed, rpl21 removed, trnP_GGG removed, trnR_UCG removed, trnT_AGU removed, chlB removed, chlN removed, psaM removed, trnP_GGG removed, trnR_CCG removed, trnY_AUA removed, trnfM_CAU removed, chlB removed, chlN removed, psaM removed, rpl21 removed, trnP_GGG removed, trnR_CCG removed, chlB removed, chlN removed, psaM removed, rpl21 removed, trnD_AUC removed, trnP_GGG removed, trnR_CCG removed, chlB removed, chlN removed, psaM removed, rpl21 removed, trnA_AGC removed, trnP_GGG removed, chlB removed, chlN removed, psaM removed, rpl21 removed, trnK_CUU removed, trnP_GGG removed, trnR_CCG removed, trnT_CGU removed, chlB removed, chlN removed, ps

In [96]:
# Remove duplicates
ls_id = []
for idx, record in enumerate(all_seqs):
    if record.id in ls_id:
        print(record.id,'removed',end=', ')
        del all_seqs[idx]
    else:
        ls_id.append(record.id)
print(len(all_seqs))

13632


In [97]:
SeqIO.write(all_seqs,ref_dir + 'cpDNA_RefCDS.fasta',format='fasta')

13632

In [94]:
Multi_Exons = gene_stats_clean[(gene_stats_clean.N_exons_mean>1.33)]
print(Multi_Exons.index)
Multi_Exons

Index(['atpF', 'clpP1', 'ndhA', 'ndhB', 'pafI', 'petB', 'petD', 'rpl16',
       'rpl2', 'rpoC1', 'rps12', 'rps16', 'trnC_ACA', 'trnG_UCC', 'trnK_UUU',
       'trnL_UAA', 'trnS_CGA', 'trnV_UAC'],
      dtype='object', name='gene')


Unnamed: 0_level_0,len_min,len_median,len_max,Accession_count,forward_sum,reverse_sum,join_sum,N_exons_mean
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
atpF,424,555,576,124,13,111,123,1.99
clpP1,323,591,948,120,10,110,106,2.74
ndhA,574,1092,1119,114,22,92,113,1.99
ndhB,773,1533,1569,116,44,72,113,1.97
pafI,283,507,543,121,11,110,121,2.99
petB,489,648,708,124,116,8,113,1.91
petD,479,483,539,124,116,8,90,1.73
rpl16,392,408,492,120,7,113,102,1.85
rpl2,462,825,849,122,57,65,117,1.97
rpoC1,1359,2049,2118,124,14,110,118,1.95


Run 'makeblastdb -in References/cpDNA_RefCDS.fasta -dbtype nucl -parse_seqids'

blastn  -query ../GetOrganelles/PAFTOL/fasta_pt/PAFTOL_016977_pt.fasta  -db References/cpDNA_RefGenes.fasta -outfmt "6 qseqid sseqid pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qseq" -out PAFTOL_016977.blast -max_target_seqs 5 -num_threads 1 -gapopen 0 -gapextend 0

python targets_from_blast.py PAFTOL_016977.blast PAFTOL_016977_ptGenes.fasta