# Get viral sequences

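Load a GenBank viral sequence record downloaded from NCBI (the SARS-CoV-2 RefSeq genome, NC_045512) from a local file and inspect it using the Biopython-based helpers in `process_genbank`.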

In [2]:
import os
from process_genbank import GenBankRecord, GenBankFeature
# Resolve the home directory portably: os.path.expanduser("~") uses $HOME when
# it is set and otherwise falls back to the platform's own lookup, whereas
# os.environ["HOME"] raises KeyError when the variable is missing (e.g. Windows).
home_dir = os.path.expanduser("~")
# Local GenBank file for the SARS-CoV-2 RefSeq record (NC_045512).
gb_filepath = os.path.join(home_dir, "VIRAL_BIX", "data", "NC_045512_SarsCov2_refseq.gb")
# Hand the path to the project's GenBankRecord wrapper; every later cell reads from gb_record.
gb_record = GenBankRecord(gb_filepath)

In [3]:
# Pull the sequence out of the loaded record; later cells measure its length
# and pass it to GenBankFeature.
sequence = gb_record.get_sequence()

In [4]:
# Sanity check: print the length of the loaded sequence.
print(len(sequence))

29903


In [5]:
# Show the first accession reported for the record.
gb_record.get_first_accession()

'NC_045512'

In [6]:
# Show the record's id.
# NOTE(review): no output was recorded for this cell, so get_id() presumably
# returned None (or the output was cleared) -- confirm against process_genbank.
gb_record.get_id()

In [7]:
# Display the record's annotation map; the recorded output includes molecule
# type, topology, date, accessions, organism, taxonomy and references.
gb_record.get_annotation_map()

{'molecule_type': 'ss-RNA',
 'topology': 'linear',
 'data_file_division': 'VRL',
 'date': '18-JUL-2020',
 'accessions': ['NC_045512'],
 'sequence_version': 2,
 'keywords': ['RefSeq'],
 'source': 'Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)',
 'organism': 'Severe acute respiratory syndrome coronavirus 2',
 'taxonomy': ['Viruses',
  'Riboviria',
  'Orthornavirae',
  'Pisuviricota',
  'Pisoniviricetes',
  'Nidovirales',
  'Cornidovirineae',
  'Coronaviridae',
  'Orthocoronavirinae',
  'Betacoronavirus',
  'Sarbecovirus'],
 'references': [Reference(title='A new coronavirus associated with human respiratory disease in China', ...),
  Reference(title='Programmed ribosomal frameshifting in decoding the SARS-CoV genome', ...),
  Reference(title='The structure of a rigorously conserved RNA element within the SARS virus genome', ...),
  Reference(title="A phylogenetically conserved hairpin-type 3' untranslated region pseudoknot functions in coronavirus RNA replication", ...),
  

In [8]:
# Take the first entry stored under the "gene" key of the record's features map.
first_gene_feature = gb_record.get_features_map()["gene"][0]

In [9]:
# Wrap the selected gene feature together with the full sequence in the
# project's GenBankFeature helper.
gf = GenBankFeature(first_gene_feature, sequence)

In [10]:
# Display the feature map; in the recorded output it maps the gene name
# ('ORF1ab') to a nucleotide string.
# NOTE(review): the displayed value is very long -- consider showing only its
# length or a short slice to keep the notebook readable.
gf.get_feature_map()

{'ORF1ab': 'ATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTT

In [11]:
# Display the feature's qualifiers (gene, locus_tag and db_xref in the
# recorded output).
gf.get_qualifiers()

OrderedDict([('gene', ['ORF1ab']),
             ('locus_tag', ['GU280_gp01']),
             ('db_xref', ['GeneID:43740578'])])