# Downloading and parsing GenBank files

In [6]:
from genbankpy.parser import GenBankFastaWriter, GBK

import pandas as pd

# Load the tab-separated assembly metadata table.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the repository (or configurable) so the notebook runs elsewhere.
metadata_path = '/home/robaina/Documents/GenBankpy/metadata.txt'
meta = pd.read_csv(metadata_path, delimiter='\t')

# Bare last expression: render the DataFrame as the cell output.
meta

Unnamed: 0,assembly_accession,bioproject,biosample,wgs_master,excluded_from_refseq,refseq_category,relation_to_type_material,taxid,species_taxid,organism_name,...,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,local_filename
0,GCF_000372725.1,PRJNA222302,SAMN02744062,AHAL00000000.1,,representative genome,,280463,2903,Emiliania huxleyi CCMP1516,...,Scaffold,Major,Full,2013/05/02,Emiliana huxleyi CCMP1516 main genome assembly...,JGI,GCA_000372725.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,./GCF_000372725.1_Emiliana_huxleyi_CCMP1516_ma...
1,GCF_000865825.1,PRJNA485481,,,,na,ICTV species exemplar,181082,181082,Emiliania huxleyi virus 86,...,Complete Genome,Major,Full,2005/08/26,ViralProj15618,,GCA_000865825.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,./GCF_000865825.1_ViralProj15618_genomic.gbff.gz


In [13]:
# Environment used for this cell: conda activate ncbi
from genbankpy.parser import GenBankFastaWriter, GBK

"""
This package requires:

pip install ncbi-acc-download
"""

# NCBI accession IDs of the entries whose GenBank records will be downloaded.
entry_ids = [
    'AE001863.1', 'AF000579.1', 'AF242489.1',
    'AP003593.1', 'NC_000911.1', 'NC_007288.1',
]

# Build the writer from the accession IDs (the saved cell output shows the
# entries being downloaded one by one).
gbkwriter = GenBankFastaWriter.fromAccessionIDs(entry_ids=entry_ids)
# Alternative constructor, kept for reference: build the writer from a directory.
# gbkwriter = GenBankFastaWriter.fromGBKdirectory('gbk_data')

Downloading GenBank files
Downloading entry: NC_007288.1 (6 / 6) (5 / 6)

In [5]:
# Accession IDs of the two organisms targeted by the organism-specific exports
# below. Defined once so that every "two organisms" export uses the same pair.
two_organisms = ['AE001863.1', 'AP003593.1']

# Write fasta containing all peptide sequences of these two organisms
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['any']},
    output_fasta='results/allPeptides.faa',
    sequence='protein',
    entry_ids=two_organisms
)

# Write fasta containing all nucleotide sequences of these two organisms
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['any']},
    output_fasta='results/allNucleotides.fasta',
    sequence='nucleotide',
    entry_ids=two_organisms
)

# Write fasta containing nucleotide sequences of the two organisms corresponding to Urease alpha.
# Fix: entry_ids was previously omitted here, so this export was not restricted
# to the two organisms, unlike its peptide counterpart (results/ureC.faa) below.
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['urease', 'alpha']},
    output_fasta='results/ureC.fasta',
    sequence='nucleotide',
    entry_ids=two_organisms
)

# Write fasta containing peptide sequences of the two organisms corresponding to Urease alpha
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['urease', 'alpha']},
    output_fasta='results/ureC.faa',
    sequence='protein',
    entry_ids=two_organisms
)

# Write fasta containing the 16S nucleotide sequences of all entries.
# NOTE(review): entry_ids=None presumably means "no restriction", i.e. every
# entry held by gbkwriter -- confirm against the writeSequencesInFasta docs.
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['16S']},
    output_fasta='results/16s.fasta',
    sequence='nucleotide',
    entry_ids=None
)

# Parsing GenBank files

In [3]:
# Build a GBK object from a single GenBank file on disk.
gbk_path = 'gbk_data/AE001863.1.gbk'
gbk = GBK(gbk_path)

In [4]:
# Look up the CDS feature(s) for gene id 'DRA0303'; the bare last expression
# is rendered as the cell output (a list of SeqFeature objects).
cds_features = gbk.cds
cds_features.get_by_gene_id('DRA0303')

[SeqFeature(FeatureLocation(ExactPosition(113558), ExactPosition(113924), strand=-1), type='CDS')]