In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

random.seed(10)

In [2]:
# Hidden state space
S = [0, 1]
# Emission space
V = [1, 2, 3, 4, 5]
# Transition matrix
A = np.matrix([[0.8, 0.2], 
               [0.1, 0.9]])
# Initial distribution 
mu_0 = [0.5, 0.5]
# Emission matrix
B = np.matrix([[0.2, 0.5, 0.2, 0.1, 0],
               [0, 0.1, 0.4, 0.4, 0.1]])

In [3]:
def writeParametersToFile(S, V, A, mu_0, B, file):
    # Write parameters to file
    f = open(file, "w")
    f.write("State space: " + " ".join(str(s) for s in S) + "\n")
    f.write("Emission space: " + " ".join(str(v) for v in V) + "\n")
    f.write("Transition matrix: " + " ".join(str(a) for a in np.array(A).flatten()) + "\n")
    f.write("Initial distribution: " + " ".join(str(m) for m in mu_0) + "\n")
    f.write("Emission matrix: " + " ".join(str(b) for b in np.array(B).flatten()))
    f.close()

In [6]:
chromosome_file = open("s_cerevisiae_chromosome_III.fa", "r")

bases = "".join([line.strip() for line in chromosome_file.readlines()[1:]])
print("Number of bases: " + str(len(bases)))

Number of bases: 316620


In [21]:
# Split into 100 base pair windows
window_size = 100

split_bases = [bases[i : i + window_size] for i in range(0, len(bases), window_size)]

['CCCACACACCACACCCACACCACACCCACACACCACACACACCACACCCACACACCCACACCACACCACACCCACACCACACCCACACACCCACACCCAC',
 'ACACCACACCCACACACACCACACCCACACACACCCACACCCACACACCACACCCACACACACACCACACCCACACACACCACACCACACCCACACCACA',
 'CCCACACCCACACACCACACCCACACCCACACCCCACACCCACACACCACACCCACACACACCACACCCACACACACCCACACCACACCCACACACCACA',
 'CCCACACACCCACACCCACACACACCACACCCACACCACACCCACACCCACACACCCACACCCTAACACTACCCTAACACTACCCTATTCTAACCCTGAT',
 'TTTACCTGTCTCCCAACTTACTCTCCATTACCCTACCTCTCCACTCGTTACCCTGTCTCATTCAACCGTACCACTCCCAACCACCATCCATCTCTCTACT',
 'TACTACCACCAACCCACCGTCCACCATAACCGTTACCCTCCAACTACCCGTATCCAACTCCACTACCGCTTACCCTACCATCGACCATGTCCTACTCACT',
 'GTACTGTTGTTCACCCACCATATTGAAACGTCTACAAATGATCGTAAATAATACACATATACTTATCCTACCACTCTAATCCCACTACCACATGCCATAC',
 'TCACCTTCACTTGTATTCTGATCGGTCATACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTTTCATATTCCACTCCATCACCCATC',
 'TCTCACCATCAGTACCAAATGCACTCGCATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACGCATAACGCCCATCATTATCCA',
 'CATTTTAATATCTATATCTCATTCGGCGACACCAAATATTGTATAACTGCCCT

In [49]:
# Calculate the % of GC in each window
gc_content = [window.count("G") + window.count("C") / len(window) for window in split_bases]

decimal_places = 3

print("GC Statistics")
print("-------------")
print("Mean: " + str(round(np.mean(gc_content), decimal_places)) + "%")
print("Median: " + str(round(np.median(gc_content), decimal_places)) + "%")
print("Standard deviation: " + str(round(np.std(gc_content), decimal_places)) + "%")
print("Minimum: " + str(round(min(gc_content), decimal_places)) + "%")
print("Maximum: " + str(round(max(gc_content), decimal_places)) + "%")
print("-------------")

GC Statistics
-------------
Mean: 19.029%
Median: 18.34%
Standard deviation: 5.547%
Minimum: 0.6%
Maximum: 58.0%
-------------


In [63]:
chromosome_emission = np.zeros(len(gc_content))

for i in range(len(gc_content)):
    if 15 < gc_content[i] < 20:
        chromosome_emission[i] = 1
    elif 20 <= gc_content[i] < 25:
        chromosome_emission[i] = 2
    elif 25 <= gc_content[i]:
        chromosome_emission[i] = 3

In [64]:
chromosome_emission

array([0., 0., 0., ..., 3., 3., 0.])

In [350]:
class HMM():
    """
    S: state space
    V: emission space
    A: transition matrix
    mu_0: initial distribution
    B: emission matrix
    N: markov chain length
    """
    
    def __init__(self, S = None, V = None, A = None, mu_0 = None, B = None,
                 N = 100, file = None):
        
        if isinstance(file, type(None)):
            # If no file given, directly set parameters
            self.S = S
            self.V = V
            self.A = A
            self.mu_0 = mu_0
            self.B = B
            
        else:
            # Otherwise read parameters from a file
            f = open(file, "r")
            
            readArray = lambda x: np.array(x.readline().split()[2:]).astype(float)
            readMatrix = lambda x: np.matrix(x.readline().split()[2:]).astype(float)

            self.S = readArray(f)
            self.V = readArray(f)
            self.A = readMatrix(f)
            self.A.shape = (len(self.S), len(self.S))
            self.mu_0 = readArray(f)
            self.B = readMatrix(f)
            self.B.shape = (len(self.S), len(self.V))
            
        # Set the chain length
        self.N = N

        # Map emitted state values to numerical indexes
        self.emit_idxs = dict(zip(self.V, [i for i in range(len(self.V))]))
        
        # Create the markov chain
        self.markov_chain = self.markovChain()
        # Get the emitted states
        self.emission = self.emitStates()
        
        self.forwardAlgorithm(self.emission)
        
    def markovChain(self):
        # Create markov chain
        chain = np.zeros(self.N)
        # Set the first state by sampling from the initial distribution
        chain[0] = np.random.choice(self.S, p = self.mu_0)
        
        # Set the remaining states in the chain
        for i in range(1, self.N):
            # Get the probability row from the transition matrix
            state_prob = np.array(self.A[int(chain[i - 1])]).flatten()
            # Sample from the state space
            chain[i] = np.random.choice(self.S, p = state_prob)
        
        return(chain)
    
    def emitStates(self):
        emission = np.zeros(self.N)
        
        for i in range(self.N):
            # Get the probability row from the emission matrix
            emit_prob = np.array(self.B[int(self.markov_chain[i])]).flatten()
            # Sample from the emission matrix
            emission[i] = np.random.choice(self.V, p = emit_prob)
            
        return(emission)
    
    # Scaled forward algorithm
    def forwardAlgorithm(self, emitted_sequence):
        # Initialise alpha values and scale
        a_hat = np.matrix(np.zeros((len(self.S), len(emitted_sequence))))
        c = np.zeros(len(emitted_sequence))
        
        # Map emitted sequence values to numerical indexes
        sequence_idxs = [self.emit_idxs[x] for x in emitted_sequence]

        # Set initial values
        for i in range(len(self.S)):
            a_0 = self.B[i, int(sequence_idxs[0])] * self.mu_0[i]
            c[0] += a_0
            a_hat[i,0] = (1 / c[0]) * a_0
            
        for t in range(1, len(emitted_sequence)):
            for j in range(len(self.S)):
                a_hat[i,t] = a_hat[i,t-1] * 
            
        print(a)
        print(a_hat)
        print(c)

In [351]:
hmm = HMM(file = "hmm_params.txt", N = 115)
states = hmm.markov_chain
values = hmm.emission

[[0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.  ]
 [0.05 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  

In [None]:
colour_dict = {0: "red", 1: "darkorange", 2: "yellow", 3: "greenyellow", 4: "purple"}
colour_dict = {0: "mediumpurple", 1: "paleturquoise", 2: "yellow", 3: "greenyellow", 4: "purple"}

fig = plt.figure()
ax = fig.add_axes([0,0,2,1])
ax.bar([i for i in range(len(values))], values, color = [colour_dict[int(s)] for s in states])
plt.grid(color = '#95a5a6', linestyle = '--', linewidth = 2, axis = 'y', alpha = 0.2)
plt.title('HMM States and Emission')
plt.xlabel('N')
plt.ylabel('Emission')

plt.legend(handles = [mpatches.Patch(color = colour_dict[i],
                                     label = np.unique(states)[i]) for i in range(len(np.unique(states)))],
           title = "State")
#plt.savefig(".pdf", bbox_inches = 'tight')
plt.show()