# Downloading and parsing GenBank files from Python

## Installation
1. Fork the repository on GitHub (click on "Fork") and clone your fork to your local machine, or simply clone the main branch with:
```
git clone https://github.com/Robaina/GenBankpy.git
```
2. Change into the project directory and create the conda environment if it does not exist yet:
```
conda env create -n genbankpy -f environment.yml
```

3. Activate environment:
```
conda activate genbankpy
```

In [1]:
# conda activate ncbi
from genbankpy.parser import GenBankFastaWriter, GBK

# This package requires:
#
#     pip install ncbi-acc-download

# NCBI entry ids of the records to download
entry_ids = [
    'AE001863.1',
    'AF000579.1',
    'AF242489.1',
    'AP003593.1',
    'NC_000911.1',
    'NC_007288.1',
]

# Build the writer from the accession ids listed above
gbkwriter = GenBankFastaWriter.fromAccessionIDs(entry_ids=entry_ids)
# gbkwriter = GenBankFastaWriter.fromGBKdirectory('gbk_data')

Downloading GenBank files
Skipping donwloaded entry: NC_007288.1 (6 / 6)

In [79]:
from genbankpy.parser import GenBankFastaWriter, GBK


# Download a single record. It is bound to its own name so that it does not
# replace the six-entry `gbkwriter` built in the first cell, which the export
# cell right below still queries for 'AE001863.1' and 'AP003593.1'.
single_entry_writer = GenBankFastaWriter.fromAccessionIDs(entry_ids=["1755381"])

Downloading GenBank files
Initializing parser...5381 (1 / 1)
Done!


In [2]:
# Write fasta containing all peptide sequences of these two organisms
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['any']},
    output_fasta='results/allPeptides.faa',
    sequence='protein',
    entry_ids=['AE001863.1', 'AP003593.1']
)

# Write fasta containing all nucleotide sequences of these two organisms
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['any']},
    output_fasta='results/allNucleotides.fasta',
    sequence='nucleotide',
    entry_ids=['AE001863.1', 'AP003593.1']
)

# Write fasta containing nucleotide sequences of the two organisms corresponding to Urease alpha.
# entry_ids is passed explicitly so this export is restricted to the same two
# organisms as the peptide export below (it was previously omitted here).
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['urease', 'alpha']},
    output_fasta='results/ureC.fasta',
    sequence='nucleotide',
    entry_ids=['AE001863.1', 'AP003593.1']
)

# Write fasta containing peptide sequences of the two organisms corresponding to Urease alpha
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['urease', 'alpha']},
    output_fasta='results/ureC.faa',
    sequence='protein',
    entry_ids=['AE001863.1', 'AP003593.1']
)

# Write fasta containing nucleotide sequences corresponding to 16S.
# entry_ids=None: no restriction to a subset of the writer's entries.
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['16S']},
    output_fasta='results/16s.fasta',
    sequence='nucleotide',
    entry_ids=None
)

# Initializing from list of species names

Check whether genomes are available for download before actually downloading them, thus avoiding the error messages that would otherwise follow:

In [1]:
from pathlib import Path
from genbankpy.download import NCBIdownloader


# Create a downloader that uses the local directory "ncbi_test2" as its data directory.
downloader = NCBIdownloader(data_directory=Path("ncbi_test2"))
# NOTE(review): the text above this cell describes checking availability before
# downloading, yet dry_run=False is passed here — presumably dry_run=True is the
# check-only mode; confirm against NCBIdownloader.fromSpecies.
downloader.fromSpecies(species="Escherichia coli", dry_run=False)

New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets


In [2]:
from pathlib import Path
from genbankpy.download import NCBIdownloader


# Same data directory as the previous cell; this time the request is made by
# explicit accession ids instead of a species name.
downloader = NCBIdownloader(data_directory=Path("ncbi_test2"))
downloader.fromAccessionIDs(ids=["GCF_000005845.2", "GCF_000008865.2"])

New version of client (13.36.0) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets


In [1]:
from pathlib import Path
from genbankpy.parser import GenBankFastaWriter, GBK


# Species names handed to GenBankFastaWriter.fromSpecies in the next cell.
sp_list = [
    'Halobacterium salinarum',
    'Escherichia coli',
    'Pseudomonas aeruginosa',
    'Proteus mirabilis',
    'Klebsiella pneumoniae',
    'Prochlorococcus marinus',
    'Pelagibacter ubique'
]

In [2]:
# Build the writer from the species names in sp_list, using "demo_data" as data directory
gbkwriter = GenBankFastaWriter.fromSpecies(
    species_list=sp_list,
    only_latest=True,
    data_dir=Path("demo_data"),
)

New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets
New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets
New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets
New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets
New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets
New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets
New version of client (13.37.1) available at https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets


Done!
Done!
Done!
Done!
Done!
Done!
Done!


In [3]:
# Write fasta containing the peptide sequences corresponding to pyruvate kinase
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['pyruvate kinase']},
    output_fasta='results/pyruvate_kinase_demo.faa',
    sequence='protein',
)

In [49]:
import os

from Bio import Entrez

# Contact e-mail sent along with Entrez requests. It can be overridden through
# the ENTREZ_EMAIL environment variable; the notebook's original address is the fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "srobaina@ull.edu.es")

# Credentials must not live in the notebook: read the API key from the
# NCBI_API_KEY environment variable. When it is unset, Entrez.api_key is left
# untouched and requests are sent without a key (subject to NCBI's lower rate limit).
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be revoked in the NCBI account settings.
ncbi_api_key = os.environ.get("NCBI_API_KEY")
if ncbi_api_key:
    Entrez.api_key = ncbi_api_key

database = "genome"
query = "Escherichia coli"

# Parse the full esearch result (not only its "IdList") and make sure the
# network handle is closed even if parsing fails.
search_handle = Entrez.esearch(db=database, retmax=3, term=query)
try:
    IDs = Entrez.read(search_handle)
finally:
    search_handle.close()

# for ID in IDs["IdList"]:
#     print(Entrez.efetch(db="nucleotide", id=ID, rettype="genbank", retmode="text").read())
IDs

{'Count': '1', 'RetMax': '1', 'RetStart': '0', 'IdList': ['167'], 'TranslationSet': [{'From': 'Escherichia coli', 'To': '"Escherichia coli"[Organism]'}], 'TranslationStack': [{'Term': '"Escherichia coli"[Organism]', 'Field': 'Organism', 'Count': '1', 'Explode': 'Y'}, 'GROUP'], 'QueryTranslation': '"Escherichia coli"[Organism]'}

In [4]:
# Try to build the writer from the accession 'GCF_000157115.2'.
# The recorded output below shows NCBI Entrez answering with error code 400 and
# the download failing for this id.
# NOTE(review): this rebinds `gbkwriter`, so later cells use whatever this call returned.
gbkwriter = GenBankFastaWriter.fromAccessionIDs(entry_ids=['GCF_000157115.2'])

Downloading GenBank files
NCBI Entrez returned error code 400, are ID(s) GCF_000157115.2 valid?


Failed to download file with id GCF_000157115.2 from NCBI


In [None]:
# Same pyruvate kinase peptide export as earlier in the notebook, run against the
# current `gbkwriter`; it writes to the same output path.
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['pyruvate kinase']},
    output_fasta='results/pyruvate_kinase_demo.faa',
    sequence='protein',
)

In [3]:
# Single-species run: Emiliania huxleyi
sp_list = ['Emiliania huxleyi']

# Build the writer from the species name, passing only_representatives=True
gbkwriter = GenBankFastaWriter.fromSpecies(
    species_list=sp_list,
    only_representatives=True,
)

# Write fasta containing all peptide sequences
gbkwriter.writeSequencesInFasta(
    gene_keywords={'product': ['any']},
    output_fasta='results/allPeptidesEmiliania.faa',
    sequence='protein',
)

Initializing parser...pecies: Emiliania huxleyi (1 / 1)
Done!


# Parsing GenBank files

In [3]:
# Open one GenBank file with the GBK parser class imported from genbankpy.parser.
gbk = GBK('gbk_data/AE001863.1.gbk')

In [4]:
# Look up CDS features by gene id; the recorded output is a list holding one
# SeqFeature of type 'CDS'.
gbk.cds.get_by_gene_id('DRA0303')

[SeqFeature(FeatureLocation(ExactPosition(113558), ExactPosition(113924), strand=-1), type='CDS')]