In [37]:
import numpy as np
import scipy.stats

# Predicting Mutations


### mutation(sequence)

`mutation(sequence)` uses the idea that, when a randomly chosen position mutates, the resulting letter depends on the original letter and on how likely that letter is to undergo each type of transition or transversion. The substitution probabilities are rounded percentages taken from page 302 of *Cell Biology by the Numbers* (2016).



In [36]:
def mutation(sequence, mut_rate=10 ** -10):
    """
        Input:
            sequence- a biological sequence
            mut_rate- probability that any single position mutates. The
                      default, 10**-10, is the standard mutation rate for
                      E. Coli.
        Output:
            sequence- the inputted sequence after a mutation has occurred.
                      Replacement letters are written in lowercase. Positions
                      holding anything other than an uppercase A, C, G or T
                      are never changed.
    """
    # Possible replacement letters, and their probabilities, per original letter
    substitutions = {
        "A": (["g", "c", "t"], [0.54, 0.23, 0.23]),
        "G": (["a", "c", "t"], [0.74, 0.13, 0.13]),
        "T": (["a", "c", "g"], [0.23, 0.54, 0.23]),
        "C": (["a", "t", "g"], [0.13, 0.74, 0.13]),
    }

    # Work on a list of letters so a mutation is a single in-place swap
    # instead of rebuilding the whole string each time.
    mutated = list(sequence)

    # A mutation may occur, checking by base pair
    for position, letter in enumerate(sequence):
        flip = np.random.random()

        # This position does not mutate
        if flip >= mut_rate:
            continue

        # Letters without a substitution table (e.g. "N" or an already
        # lowercase letter) are skipped; there is no replacement defined
        # for them, so the position keeps its letter.
        if letter not in substitutions:
            continue

        # Determining which mutation will occur and placing it in the sequence
        choices, probabilities = substitutions[letter]
        mutated[position] = str(np.random.choice(choices, p=probabilities))

    return "".join(mutated)

### sequence_change(sequence)

A function that performs one mutation on a sequence under the assumption that the likelihood of a position mutating depends on its letter, a departure from the assumptions in mutation(sequence). The mutated position is a G 40% of the time, a C 40% of the time, an A 10% of the time, and a T 10% of the time.

In [26]:
def sequence_change(sequence):
    """
        Input:
            sequence- a biological sequence
        Output:
            sequence- the inputted sequence after a mutation has occurred.
                      Exactly one position holding G, C, A or T is replaced
                      by a different one of those letters. If the sequence
                      contains none of those letters it is returned unchanged.
    """
    # Probability that the mutated position holds each letter
    letter_weights = {"G": 0.40, "C": 0.40, "A": 0.10, "T": 0.10}

    # Letters that can replace each original letter (chosen uniformly)
    replacements = {
        "G": ["A", "C", "T"],
        "C": ["A", "T", "G"],
        "A": ["G", "C", "T"],
        "T": ["A", "C", "G"],
    }

    # Only letters that actually occur can mutate. Drawing a letter that is
    # absent would leave no position to choose from.
    candidates = [letter for letter in letter_weights if letter in sequence]
    if not candidates:
        return sequence

    # Renormalise so the weights of the letters present sum to 1. When all
    # four letters occur this is the 40/40/10/10 split.
    weights = np.array([letter_weights[letter] for letter in candidates])
    target = str(np.random.choice(candidates, p=weights / weights.sum()))

    # Pick one occurrence of the chosen letter and the letter replacing it
    positions = [index for index, letter in enumerate(sequence) if letter == target]
    chosen_position = int(np.random.choice(positions))
    new_letter = str(np.random.choice(replacements[target]))

    return sequence[:chosen_position] + new_letter + sequence[chosen_position + 1 :]

### generation_dict(population, mutation_rate=10**-10)

The generation_dict(population, mutation_rate=10**-10) function replicates the process of duplication while factoring in the probability that a mutation occurs, given by the mutation rate. Note that the default is $10^{-10}$ mutations per base pair per replication for E. Coli, taken from *Cell Biology by the Numbers* (2016). We use sequence_change(sequence) to mutate any sequence from the dictionary that is selected to undergo a mutation. Note that all sequences in the dictionary must have the same base pair length for this function to give correct results.

In [76]:
def generation_dict(population, mutation_rate=10 ** -10):
    """
    Inputs:
    population- a dictionary containing the biological sequences associated with cells
                (sequence -> number of cells); all sequences are expected to have
                the same length. The dictionary is updated in place.
    mutation_rate = 10**-10 is default, could be changed if needed
    Outputs:
    population- the same dictionary after undergoing its first replication:
                every count is doubled, then each mutation moves one cell from
                its parent sequence to the sequence returned by sequence_change.
    """
    # Nothing to replicate
    if not population:
        return population

    # Snapshot of the parent generation; parents are drawn from these counts
    parents = list(population.keys())
    parent_counts = np.array([population[parent] for parent in parents], dtype=float)

    # Calculates the expected number of mutations. It is kept as a float here
    # so that a value below 1 can still be used as a probability further down.
    number_of_sequences = parent_counts.sum()
    number_of_bases = len(parents[0])
    expected_mutations = number_of_bases * number_of_sequences * mutation_rate

    # Doubles the population of cells
    for sequence in parents:
        population[sequence] = 2 * population[sequence]

    # If the expected number is in [0,1), flip to see if one mutation happens;
    # otherwise the whole part of the expected number is performed.
    if expected_mutations < 1:
        number_of_mutations = int(np.random.random() < expected_mutations)
    else:
        number_of_mutations = int(expected_mutations)

    # No mutations occur
    if number_of_mutations == 0:
        return population

    # A parent is drawn in proportion to its share of the original population
    weights = parent_counts / number_of_sequences

    # Performs mutations for number_of_mutations
    for _ in range(number_of_mutations):
        original_sequence = parents[np.random.choice(len(parents), p=weights)]

        # A sequence with no cells left cannot give up another cell;
        # skipping keeps the counts from going negative.
        if population[original_sequence] <= 0:
            continue

        mutated_sequence = sequence_change(original_sequence)

        # Move one cell from the parent to the mutant. The mutant count is
        # accumulated so an already existing sequence keeps its cells.
        population[original_sequence] -= 1
        population[mutated_sequence] = population.get(mutated_sequence, 0) + 1

    return population

We will test our function generation_dict(population, mutation_rate=10**-10) on the example dictionary shown below. We will adjust the mutation_rate to 1/100 to ensure that mutations will occur in our example.

In [84]:
# Three starting sequences of equal length, mapped to their cell counts
dic = {
    "ACGCAACTCGACTATACGACTCATTACCGA": 12,
    "ACGCAACTCGACTATGCGACTCATAACCGA": 8,
    "ACGCAACTCGCCTATGCGACTCATAACCGA": 4,
}

# Inflated mutation rate so that mutations show up after one replication
generation_dict(dic, mutation_rate=1 / 100)

{'ACGCAACTCGACTATACGACTCATTACCGA': 21,
 'ACGCAACTCGACTATGCGACTCATAACCGA': 13,
 'ACGCAACTCGCCTATGCGACTCATAACCGA': 7,
 'ACGCAACTCGACCATGCGACTCATAACCGA': 1,
 'ACTCAACTCGACTATACGACTCATTACCGA': 1,
 'ACGCAACTCGACTACGCGACTCATAACCGA': 1,
 'ACGCAACTCGACTATATGACTCATTACCGA': 1,
 'ACGCAACTCGACTATACTACTCATTACCGA': 1,
 'ACACAACTCGACTATGCGACTCATAACCGA': 1,
 'ACGCAACTCGCCTATGAGACTCATAACCGA': 1}

In [85]:
# Total number of cells in dic after the replication above
np.sum(list(dic.values()))

48