# DNA juggler

This notebook provides a quick way of translating DNA using the Biopython library (Cock et al., 2009).

In [15]:
from Bio import SeqIO, Align, SeqRecord
from Bio.Seq import Seq, IUPAC
from Bio.Alphabet import _verify_alphabet
from biomart import BiomartServer
from biomart import BiomartServer
from io import StringIO
import pandas as pd
import pprint

In [2]:
def validate_sequence(sequence_str):
    """Check that a string is unambiguous DNA made of whole codons.

    The check is a plain membership test against the four unambiguous DNA
    letters, so it does not rely on Biopython's private ``_verify_alphabet``
    helper or on an alphabet object.

    Parameters
    ----------
    sequence_str : str
        Candidate DNA sequence. The comparison is case-sensitive: only
        upper-case ``A``, ``C``, ``G`` and ``T`` are accepted, so callers
        should upper-case user input before validating it.

    Raises
    ------
    ValueError
        If the sequence contains any character other than ``ACGT``, or if
        its length is not a multiple of three.
    """
    # Subset test: every distinct character must be one of the four bases.
    if not set(sequence_str) <= set("ACGT"):
        raise ValueError("The provided sequence does not correspond to unambiguous DNA")
    if len(sequence_str) % 3 != 0:
        raise ValueError("The provided sequence must represent whole codons")

In [9]:
# Connect to the Ensembl BioMart endpoint for the GRCh37 assembly.
# For GRCh38, point BiomartServer at "http://ensembl.org/biomart" instead.
server = BiomartServer("http://grch37.ensembl.org/biomart")

# Dataset handle used by the transcript lookup below.
gene_ensembl = server.datasets['hsapiens_gene_ensembl']

In [5]:
def get_transcript_definition(transcript_id):
    """Look up one transcript in BioMart and return the answer as a table.

    Runs a single search on the module-level ``gene_ensembl`` dataset,
    filtered on ``ensembl_transcript_id``, and parses the tab-separated
    response text into a pandas DataFrame.

    Parameters
    ----------
    transcript_id : str
        Transcript identifier used as the only filter value.

    Returns
    -------
    pandas.DataFrame
        The parsed response, one column per requested attribute.
    """
    # Exon and CDS coordinate attributes ('ensembl_exon_id', 'cds_start',
    # 'cds_end', 'exon_chrom_start', 'exon_chrom_end',
    # 'genomic_coding_start', 'genomic_coding_end') are not requested.
    requested_attributes = [
        'ensembl_gene_id',
        'ensembl_transcript_id',
        'external_gene_name',
        'chromosome_name',
        'strand',
        'coding',
    ]
    query = {
        'filters': {'ensembl_transcript_id': [transcript_id]},
        'attributes': requested_attributes,
    }
    # header=1 asks for a header row, which becomes the DataFrame columns.
    reply = gene_ensembl.search(query, header=1)
    return pd.read_csv(StringIO(reply.text), sep="\t")

## Input your DNA sequence or an Ensembl transcript ID

e.g.: CGTCAAGGCGCTCTTGCCTACGCCACCAGCTCCAACCAC or ENST00000311936

**NOTE**: for genes on the negative strand, the coding sequence retrieved for a transcript ID is already reverse-complemented.

In [16]:
# Read either a raw DNA string or an Ensembl transcript ID and build
# `seq_record`, the SeqRecord used by every cell below.
print("Input your DNA or transcript ID sequence:")
# Strip surrounding whitespace so a pasted value with a stray space or
# newline is still recognised and validated correctly.
sequence_str = input().strip()
if sequence_str.startswith('ENST'):
    # Transcript ID: fetch its definition (including the coding sequence).
    # The ID typed by the user is the value to look up.
    transcript = get_transcript_definition(transcript_id=sequence_str)
    if transcript.empty:
        # Fail with a clear message instead of an IndexError on .iloc[0].
        raise ValueError("No transcript found for ID {}".format(sequence_str))
    seq_record = SeqRecord.SeqRecord(Seq(transcript['Coding sequence'].iloc[0]))
    pprint.pprint(transcript)
else:
    # Raw DNA: validate the upper-cased text and store exactly the text
    # that was validated, so downstream cells never see unvalidated input.
    dna_str = sequence_str.upper()
    validate_sequence(dna_str)
    seq_record = SeqRecord.SeqRecord(Seq(dna_str))

# Display the sequence that the following cells will operate on.
str(seq_record.seq)

Input your DNA or transcript ID sequence:


 ENST00000311936


                                     Coding sequence   Gene stable ID  \
0  ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCA...  ENSG00000133703   

  Transcript stable ID Gene name  Chromosome/scaffold name  Strand  
0      ENST00000311936      KRAS                        12      -1  


'ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCAAGAGTGCCTTGACGATACAGCTAATTCAGAATCATTTTGTGGACGAATATGATCCAACAATAGAGGATTCCTACAGGAAGCAAGTAGTAATTGATGGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAGAGGAGTACAGTGCAATGAGGGACCAGTACATGAGGACTGGGGAGGGCTTTCTTTGTGTATTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATTAAAAGAGTTAAGGACTCTGAAGATGTACCTATGGTCCTAGTAGGAAATAAATGTGATTTGCCTTCTAGAACAGTAGACACAAAACAGGCTCAGGACTTAGCAAGAAGTTATGGAATTCCTTTTATTGAAACATCAGCAAAGACAAGACAGGGTGTTGATGATGCCTTCTATACATTAGTTCGAGAAATTCGAAAACATAAAGAAAAGATGAGCAAAGATGGTAAAAAGAAGAAAAAGAAGTCAAAGACAAAGTGTGTAATTATGTAA'

## Translated sequence

In [7]:
# Translate the record built in the input cell and display the resulting
# sequence as a plain string (translate() is called with its defaults).
str(seq_record.translate().seq)

'MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKKSKTKCVIM*'

## Reversed and complemented DNA sequence

In [38]:
# Reverse-complement the record built in the input cell and display the
# resulting sequence as a plain string; `seq_record` itself is not reassigned.
str(seq_record.reverse_complement().seq)

'TTACATAATTACACACTTTGTCTTTGACTTCTTTTTCTTCTTTTTACCATCTTTGCTCATCTTTTCTTTATGTTTTCGAATTTCTCGAACTAATGTATAGAAGGCATCATCAACACCCTGTCTTGTCTTTGCTGATGTTTCAATAAAAGGAATTCCATAACTTCTTGCTAAGTCCTGAGCCTGTTTTGTGTCTACTGTTCTAGAAGGCAAATCACATTTATTTCCTACTAGGACCATAGGTACATCTTCAGAGTCCTTAACTCTTTTAATTTGTTCTCTATAATGGTGAATATCTTCAAATGATTTAGTATTATTTATGGCAAATACACAAAGAAAGCCCTCCCCAGTCCTCATGTACTGGTCCCTCATTGCACTGTACTCCTCTTGACCTGCTGTGTCGAGAATATCCAAGAGACAGGTTTCTCCATCAATTACTACTTGCTTCCTGTAGGAATCCTCTATTGTTGGATCATATTCGTCCACAAAATGATTCTGAATTAGCTGTATCGTCAAGGCACTCTTGCCTACGCCACCAGCTCCAACTACCACAAGTTTATATTCAGTCAT'

## Reversed, complemented and translated sequence

In [39]:
# Reverse-complement the record first, then translate that result (with
# translate()'s defaults) and display the translated sequence as a string.
str(seq_record.reverse_complement().translate().seq)

'LHNYTLCL*LLFLLFTIFAHLFFMFSNFSN*CIEGIINTLSCLC*CFNKRNSITSC*VLSLFCVYCSRRQITFISY*DHRYIFRVLNSFNLFSIMVNIFK*FSIIYGKYTKKALPSPHVLVPHCTVLLLTCCVENIQETGFSINYYLLPVGILYCWIIFVHKMILN*LYRQGTLAYATSSNYHKFIFSH'