# Python Translate Script - Jupyter Notebook

# Required Packages

In [1]:
##Install BioPython if not already installed##

#Import these packages
from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd
from Bio.Seq import Seq

# Functions 

#1. Document Dr. X's function with comments and with markdown text.


In [2]:
def get_sequences_from_file(fasta_fn): #def is the keyword for defining our function, "get_sequences_from_file"
    
    """Description: Takes a fasta file and reduces the amount of information to just genus/species name and sequence information

    Arguments:
        fasta_fn: the name of the fasta file in which you're interested in

    Must Create before "for loop": empty dictonary for information to be stored in

    For loop - For each record within the fasta file:
            1. Splits each description,
            2. Takes 2nd and 3rd positions of record and adds a space inbetween, create new object "species name"
            3. Obtain the sequence information 
    
    Return: A dictonary of records with genus name, species name and sequence information
   
    Example of usage:
        >>> penguins_sequence_data = get_sequences_from_file("penguins_cytb.fasta")
        >>> print(penguins_sequence_data)
    
    """
    sequence_data_dict = {}
    for record in SeqIO.parse(fasta_fn, "fasta"): #creating a for loop    
        description = record.description.split() 
        species_name = description[1] + " " + description[2]
        sequence_data_dict[species_name] = record.seq
    return(sequence_data_dict) #ends the execution of a function and "returns" the value(s)

In [3]:
#Show that our function documentation works!
help(get_sequences_from_file)
#Let's also load the example into our environment for later...
penguins_sequence_data = get_sequences_from_file("penguins_cytb.fasta")

Help on function get_sequences_from_file in module __main__:

get_sequences_from_file(fasta_fn)
    Description: Takes a fasta file and reduces the amount of information to just genus/species name and sequence information
    
    Arguments:
        fasta_fn: the name of the fasta file in which you're interested in
    
    Must Create before "for loop": empty dictonary for information to be stored in
    
    For loop - For each record within the fasta file:
            1. Splits each description,
            2. Takes 2nd and 3rd positions of record and adds a space inbetween, create new object "species name"
            3. Obtain the sequence information 
    
    Return: A dictonary of records with genus name, species name and sequence information
    
    Example of usage:
        >>> penguins_sequence_data = get_sequences_from_file("penguins_cytb.fasta")
        >>> print(penguins_sequence_data)



#2. Write a function that translates a string of nucleotides to amino acids based on Dr. X's pseudo-code suggestion.


In [4]:
def translate_function(string_nucleotides):
    mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
    codons = [(string_nucleotides[i:i + 3]) for i in range(0, len(string_nucleotides), 3)] # make codons a list iterate over every codon
    aa_seq_list = [] # empty list we'll need to put amino acids into
    stop_codons = ["TAA", "TAG", "AGA", "AGG"] 
    for codon in codons: 
        if codon not in stop_codons:
            aa_seq_list.append(mito_table.forward_table[codon])
        else:
           aa_seq_list.append("")
    #print(mito_table.forward_table[codon]) # take every amino acid from codon table and append    
    #print(aa_seq_string)
    #aa_seq_list.pop()
    aa_seq_string = ''.join(aa_seq_list)
    return(aa_seq_string)
    
print(translate_function(penguins_sequence_data["Aptenodytes forsteri"]))

MAPNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGFYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWTWGGFSVDNPTLTRFFALHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDILGFALMLLPLTTLALFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPLLHKSKQRTMAFRPLSQLLFWALVANLIILTWVGSQPVEHPFIIIGQLASLTYFTTLLILFPIAGALENKMLNH


#3. Write an alternative translation function.

In [5]:
def alternate_translate_function(string_nucleotides2):
    alternate_mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
    alternate_aa_seq = string_nucleotides2.translate(table = alternate_mito_table, cds = True)
    alternate_aa_seq_string = str(alternate_aa_seq)
    return(alternate_aa_seq_string)
print(alternate_translate_function(penguins_sequence_data["Aptenodytes forsteri"]))

MAPNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGFYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWTWGGFSVDNPTLTRFFALHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDILGFALMLLPLTTLALFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPLLHKSKQRTMAFRPLSQLLFWALVANLIILTWVGSQPVEHPFIIIGQLASLTYFTTLLILFPIAGALENKMLNH


#4. Write a function that calculates the molecular weight of each amino acid sequence.

In [6]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [7]:
translate_A_forsteri = alternate_translate_function(penguins_sequence_data["Aptenodytes forsteri"])
print(translate_A_forsteri)

MAPNLRKSHPLLKMINNSLIDLPTPSNISAWWNFGSLLGICLTTQILTGLLLAMHYTADTTLAFSSVAHTCRNVQYGWLIRNLHANGASFFFICIYLHIGRGFYYGSYLYKETWNTGIILLLTLMATAFVGYVLPWGQMSFWGATVITNLFSAIPYIGQTLVEWTWGGFSVDNPTLTRFFALHFLLPFMIAGLTLIHLTFLHESGSNNPLGIVANSDKIPFHPYYSTKDILGFALMLLPLTTLALFSPNLLGDPENFTPANPLVTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAASVLILFLIPLLHKSKQRTMAFRPLSQLLFWALVANLIILTWVGSQPVEHPFIIIGQLASLTYFTTLLILFPIAGALENKMLNH


In [8]:
def compute_molecular_weight(nucleotide_seq_string2):
    prot_analysis = ProteinAnalysis(nucleotide_seq_string2)
    mol_weight = prot_analysis.molecular_weight()
    return(mol_weight)

In [9]:
mol_weight_A_forsteri = compute_molecular_weight(translate_A_forsteri)
print(mol_weight_A_forsteri)

42459.602100000004


#5. Write a function that computes the GC-content of each DNA sequence.

In [11]:
from Bio.SeqUtils import GC

In [41]:
def GC(sequence):
    """Calculate G+C content, return percentage (as float between 0 and 100).
    Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
    when counting the G and C content.  The percentage is calculated against
    the full length, e.g.:
    >>> from Bio.SeqUtils import GC
    >>> GC("ACTGN")
    40.0
    Note that this will return zero for an empty sequence.
    """
    gc = sum(sequence.count(x) for x in ["G", "C", "g", "c", "S", "s"])
    try:
        return gc / len(sequence)
    except ZeroDivisionError:
        return 0.0

In [42]:
nucleotide_A_forsteri = penguins_sequence_data["Aptenodytes forsteri"]
nucleotide_A_forsteri = str(nucleotide_A_forsteri)
print(nucleotide_A_forsteri)

ATGGCCCCAAATCTCCGAAAATCCCATCCCCTCCTAAAAATAATTAATAACTCCCTAATCGACCTGCCCACCCCATCAAACATCTCTGCCTGATGAAACTTCGGATCTCTCCTAGGCATCTGCCTAACTACACAAATTTTAACCGGCCTCCTACTAGCTATACACTACACTGCAGACACAACCCTAGCCTTCTCCTCAGTCGCCCACACATGCCGAAACGTACAGTACGGCTGACTGATCCGCAACCTACATGCAAACGGAGCATCATTCTTCTTCATCTGCATCTATCTCCACATTGGCCGTGGATTTTACTATGGCTCCTATCTATACAAAGAAACCTGAAACACAGGCATTATCCTCCTACTCACCCTCATGGCAACCGCCTTCGTAGGCTACGTCCTACCATGAGGACAAATATCTTTCTGAGGAGCCACAGTCATTACCAACTTATTCTCAGCCATCCCTTACATTGGCCAAACCCTCGTAGAATGGACCTGAGGTGGCTTTTCAGTAGACAACCCCACATTAACCCGATTTTTCGCACTACACTTCCTCCTTCCCTTCATAATCGCAGGCCTCACCCTCATCCACCTCACCTTCCTCCACGAATCAGGCTCAAATAACCCACTGGGCATCGTAGCTAACTCCGATAAAATCCCATTCCACCCCTACTACTCCACAAAAGACATCCTAGGATTCGCACTCATACTTCTCCCACTAACAACCCTTGCCCTATTCTCCCCCAACCTACTAGGAGACCCAGAAAACTTCACCCCAGCAAACCCACTAGTCACACCCCCACACATCAAACCAGAATGATACTTCCTATTTGCATACGCTATCCTACGCTCAATCCCCAACAAACTAGGAGGAGTCCTTGCCCTAGCAGCATCCGTGCTAATCCTATTCCTAATCCCTCTCCTCCACAAATCCAAGCAACGCACAATAGCTTTCCGCCCTCTCTCCCAACTCCTATTCTGAGCCCTAGTAGCCAATCTCA

In [43]:
GC(nucleotide_A_forsteri)

0.48381452318460194

In [None]:
translation_string = ''
for i in list(penguins_sequence_data.values()):
    translation_string += i.translate(table=mito_table, cds=True)

In [10]:
def new_translation(codons): 
    translation_data_dict = {}
    for record in SeqIO.parse(codons, "fasta"):    
        description = record.description.split() 
        species_name = description[1] + " " + description[2]
        translation_data_dict[species_name] = record.seq.translate(table=mito_table, cds=True)[1:]
    return(translation_data_dict) 

In [None]:
penguins_translation = new_translation("penguins_cytb.fasta")
print(penguins_translation)