# Python Translate Script - Jupyter Notebook

In [1]:
##Import BioPython if not already installed##

#Import these packages
from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd

# Functions 

#1. Document Dr. X's function with comments and with markdown text.


In [7]:
def get_sequences_from_file(fasta_fn): #def is the keyword for defining our function, "get_sequences_from_file"
    
    """Description: Takes a fasta file and reduces the amount of information to just genus/species name and sequence information

    Arguments:
        fasta_fn: the name of the fasta file in which you're interested in

    Must Create before "for loop": empty dictonary for information to be stored in

    For loop - For each record within the fasta file:
            1. Splits each description,
            2. Takes 2nd and 3rd positions of record and adds a space inbetween, create new object "species name"
            3. Obtain the sequence information 
    
    Return: A dictonary of records with genus name, species name and sequence information
   
    Example of usage:
        >>> penguins_sequence_data = get_sequences_from_file("penguins_cytb.fasta")
        >>> print(penguins_sequence_data)
    
    """
    sequence_data_dict = {}
    for record in SeqIO.parse(fasta_fn, "fasta"): #creating a for loop    
        description = record.description.split() 
        species_name = description[1] + " " + description[2]
        sequence_data_dict[species_name] = record.seq
    return(sequence_data_dict) #ends the execution of a function and "returns" the value(s)

In [11]:
#Show that our function documentation works!
help(get_sequences_from_file)
#Let's also load the example into our environment for later...
penguins_sequence_data = get_sequences_from_file("penguins_cytb.fasta")

Help on function get_sequences_from_file in module __main__:

get_sequences_from_file(fasta_fn)
    Description: Takes a fasta file and reduces the amount of information to just genus/species name and sequence information
    
    Arguments:
        fasta_fn: the name of the fasta file in which you're interested in
    
    Must Create before "for loop": empty dictonary for information to be stored in
    
    For loop - For each record within the fasta file:
            1. Splits each description,
            2. Takes 2nd and 3rd positions of record and adds a space inbetween, create new object "species name"
            3. Obtain the sequence information 
    
    Return: A dictonary of records with genus name, species name and sequence information
    
    Example of usage:
        >>> penguins_sequence_data = get_sequences_from_file("penguins_cytb.fasta")
        >>> print(penguins_sequence_data)



#2. Write a function that translates a string of nucleotides to amino acids based on Dr. X's pseudo-code suggestion.


In [46]:
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   

In [47]:
print(penguins_sequence_data)

{'Aptenodytes forsteri': Seq('ATGGCCCCAAATCTCCGAAAATCCCATCCCCTCCTAAAAATAATTAATAACTCC...TAA'), 'Aptenodytes patagonicus': Seq('ATGGCCCCAAACCTCCGAAAATCCCATCCTCTCCTAAAAATAATTAATAACTCC...TAA'), 'Eudyptes chrysocome': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCCCTCCTAAAAACAATCAATAACTCC...TAA'), 'Eudyptes chrysolophus': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCCCTCCTAAAAACAATCAATAACTCC...TAA'), 'Eudyptes sclateri': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCCCTCCTAAAAACAATCAATAACTCC...TAA'), 'Eudyptula minor': Seq('ATGGCCCCCAACCTCCGAAAATCTCACCCCCTCCTAAAAATAATCAACAACTCT...TAA'), 'Pygoscelis adeliae': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCTCTCCTAAAAATAATTAACAACTCC...TAA'), 'Pygoscelis antarctica': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCTCTCCTAAAAATAATCAACAACTCC...TAG'), 'Pygoscelis papua': Seq('ATGGCCCCCAACCTTCGAAAATCCCACCCTCTCCTAAAAATAATCAACAAATCC...TAG'), 'Spheniscus demersus': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCTCTCCTAAAAACAATCAACAACTCC...TAA'), 'Spheniscus humboldti': Seq('ATGGCCCCCAACCTCCGAAAATCCCACCCTCTCCTAAAAACAATCAACAAC

#3. Write an alternative translation function.

In [None]:
from Bio.Seq import Seq

In [55]:
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
coding_dna = penguins_sequence_data['Aptenodytes forsteri']
print(coding_dna.translate(table=mito_table, cds=True)[1:])

APNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGFYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWTWGGFSVDNPTLTRFFALHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDILGFALMLLPLTTLALFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPLLHKSKQRTMAFRPLSQLLFWALVANLIILTWVGSQPVEHPFIIIGQLASLTYFTTLLILFPIAGALENKMLNH


In [81]:
translation_string = ''
for i in list(penguins_sequence_data.values()):
    translation_string += i.translate(table=mito_table, cds=True)[1:]

In [82]:
print(translation_string)

APNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGFYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWTWGGFSVDNPTLTRFFALHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDILGFALMLLPLTTLALFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPLLHKSKQRTMAFRPLSQLLFWALVANLIILTWVGSQPVEHPFIIIGQLASLTYFTTLLILFPIAGALENKMLNHAPNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGFYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWAWGGFSVDNPTLTRFFALHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDTLGFALMLLPLTTLALFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPLLHKSKQRTMTFRPLSQLLFWTLVANLTILTWIGSQPVEHPFIIIGQLASLTYFTILLILFPLIGTLENKMLNHAPNLRKSHPLLKTINNSLIDLPTPSNISAWWNFGSLLGICLATQILTGLLLAAHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGLYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWAWGGFSVDNPTLTRFFTLHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDILGFILLLLPLTTL

In [56]:
def new_translation(nucleotides): 
    translation_data_dict = {}
    for record in SeqIO.parse(nucleotides, "fasta"):    
        description = record.description.split() 
        species_name = description[1] + " " + description[2]
        translation_data_dict[species_name] = record.seq.translate(table=mito_table, cds=True)[1:]
    return(translation_data_dict) 

In [57]:
penguins_translation = new_translation("penguins_cytb.fasta")
print(penguins_translation)

{'Aptenodytes forsteri': Seq('APNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMH...LNH'), 'Aptenodytes patagonicus': Seq('APNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMH...LNH'), 'Eudyptes chrysocome': Seq('APNLRKSHPLLKTINNSLIDLPTPSNISAWWNFGSLLGICLATQILTGLLLAAH...LNH'), 'Eudyptes chrysolophus': Seq('APNLRKSHPLLKTINNSLIDLPTPSNISAWWNFGSLLGICLATQILTGLLLAAH...LNH'), 'Eudyptes sclateri': Seq('APNLRKSHPLLKTINNSLIDLPTPSNISAWWNFGSLLGICLATQILTGLLLAAH...LNH'), 'Eudyptula minor': Seq('APNLRKSHPLLKMINNSLIDLPTPSNISTWWNFGSLLGICLITQILTGLLLAAH...LSH'), 'Pygoscelis adeliae': Seq('APNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMH...LNH'), 'Pygoscelis antarctica': Seq('APNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMH...LNF'), 'Pygoscelis papua': Seq('APNLRKSHPLLKMINKSLIDLPTPPNISAWWNFGSLLGICLITQILTGLLLAMH...LNF'), 'Spheniscus demersus': Seq('APNLRKSHPLLKTINNSLIDLPTPSNISAWWNFGSLLGICLATQILTGLLLAAH...LNH'), 'Spheniscus humboldti': Seq('APNLRKSHPLLKTINNSLIDLPTPSNISAWWNFGSLLSICLATQILTGLLL

#4. Write a function that calculates the molecular weight of each 3 amino acid sequence.

#5. Write a function that computes the GC-content of each DNA sequence.