# Reading Energy Matrix

In [91]:
import pandas as pd
import numpy as np 

In [138]:
# read in the energy matrix
data = pd.read_csv("../../data/brewster_matrixS2.txt", sep=" ", comment="#", header=None)
data = data[5: -6] #trimming matrix to 30 bp
data = data.reset_index(drop=True)
data.columns = ['A','C','G','T']
data.head()

Unnamed: 0,A,C,G,T
0,0.305961,0.681616,0.36014,-0.313427
1,0.122283,0.247441,0.171605,-0.313427
2,1.500683,1.490967,-0.313427,0.633869
3,-0.313427,1.032246,-0.138758,0.699062
4,1.064641,-0.214039,1.119622,-0.313427


In [127]:
RandSeq1 = "ATAGGAGCGTCATCAAACGCGCCGTTCAGGTTCTGGTTCTCCATGCTATAGTTAAGCCGCACAACGGGTACTACCACTCCCTGTAGTCCGCTTTACCGTTCTC"
RandSeq1_trimmed='CGTTCAGGTTCTGGTTCTCCATGCCATAGT'

### energy(sequence)

The energy(sequence) function returns the energy for a provided biological string. It utilizes the data from the energy matrix above.

In [130]:
def energy(sequence):
    """
    Input:
         sequence: 30 bp for the promoter region
    Output:
        total_energy: the total energy for the given sequence in K_bT"""
    #Initializing the counter for the total energy.
    total_energy = 0
    #Adds the energy value for each base together for the entire sequence
    for position, letter in enumerate(sequence):
        #Determines the energy for a given position and base pair location using the energy matrix
        energy_of_base = data.loc[position,letter]
        total_energy += energy_of_base
        
    return(total_energy)

For the example, example_sequence, the function yields:

In [141]:
energy(RandSeq1_trimmed)

-1.8063224473818504

### binding_site(sequence)

The binding_site function shows the location along with the lowest energy matrix for a provided biological sequence. This function is useful for determining a potential binding site for RNA polymerase.

In [100]:
def binding_site(sequence):
    """
    Input: 
        sequence: string, biological sequence
    Outputs:
        position: the base pair with the lowest energy matrix
        lowest_energy: the lowest energy for a base pair within the sequence"""
    # Initializes the energy of position and lowest energy to zero.
    energy_of_position = 0 
    lowest_energy = 0
    #Loops for the position within the range of the sequence to check every 30 base pairs
    for position in range(len(sequence)-29):
        energy_of_position = energy(sequence[position:position+30])
    #Checks if the energy of the position is less than the lowest_energy, substitutes that value so that it could be compared when the loop runs again    
        if energy_of_position < lowest_energy:
            lowest_energy= energy_of_position
            position_of_lowest_energy = sequence[position:position+30]
            
    return(position,lowest_energy)

For RandSeq1 and RandSe1_trimmed:

In [118]:
binding_site(RandSeq1)

(73, -2.9115778711106644)

In [122]:
binding_site(RandSeq1_trimmed)

(0, -1.8063224473818504)

### firstmutation(sequence)

The firstmutation(sequence) predicts the first mutation within a given promoter region through applying the principle that the base pair most likely to change will be the one that shows the greatest decrease in energy, and it will apply this mutation to yield a new sequence, the altered letter, and the change in energy.

In [136]:
def firstmutation(sequence):
    """
    Input: 
        string, promoter region of a biological sequence (30 bp)
    Outputs:
        Final_sequence: the input sequence after its first mutation
        Letter_change: the letter that the mutated sequence displays
        Energy_difference: the difference between the change of letter and its original base pair in K_bT.
        """
    #Sets best position to the first one as it will be updated later
    #best energy is set to an infintely high number so that it could be changed for comparison reasons
    #Energy difference is set to zero, it will record the highest decrease between changing a base pair's letter and its original letter
    bestposition = 0
    bestenergy = np.inf
    energy_difference = 0
    
    #The first loop sets the energy equal to a known value in the energy matrix, loops for every base pair.
    for position,let in enumerate(sequence):
        energy=data.loc[position, letter]
        #Scans all the possibilities for letters within the column of the energy matrix
        for current_letter in data.columns:
            #Checks for the largest decrease in energy and save the location as the bestposition and letter_change for the energy with the lower energy
            if energy_difference < (energy-data.loc[position, current_letter]):
                energy_difference = (energy-data.loc[position, current_letter])
                bestenergy = data.loc[position, current_letter]
                letter_change = current_letter
                bestposition = position
   #Converts the sequence to a list to make it mutable             
    list_sequence = list(sequence)
    #Changes the letter for the bestposition in the list to letter_change
    list_sequence[bestposition] = letter_change
    #Converts the list back to a sequence
    final_sequence = "".join(list_sequence)
    return(final_sequence,letter_change,bestposition, energy_difference)


In [134]:
firstmutation(RandSeq1_trimmed)

('CGTTCAGGTTCTGGTTCTCCATGCCATAGT', 'A', 25, 1.5605388418079094)