### Problem 23
HMM Parameter Estimation Problem

Given: A sequence of emitted symbols $x = x_1 \ldots x_n$ in an alphabet $\Sigma$ and a path $\pi = \pi_1 \ldots \pi_n$ generated by a $k$-state HMM with unknown transition and emission probabilities.

Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that maximize $\Pr(x, \pi)$ over all possible matrices of transition and emission probabilities.

In [51]:
import numpy as np
class ProbString():
    """
    Estimate HMM transition and emission matrices from an emitted string
    and its known hidden path, both read from a text file.
    """
    def __init__(self, infile):
        '''
        constructor: saves attributes 
        
        Parameters
        ----------
            infile : str
                name of the input file
                
        '''
        self.file=infile
    
    def readHMM(self):
        """
        Read the HMM input file.
        
        Only lines 0, 2, 4 and 6 of the file are used (emitted string,
        emission alphabet, hidden path, state list); the lines in between
        are treated as separators and ignored. Alphabet and state symbols
        are expected to be single characters separated by any whitespace
        (tabs or spaces).
        
        Return
        ----------
        indexString:list of int
            for each emitted symbol, its index in the emission alphabet
        state:list of str
            all the states
        indexPath:list of int
            for each position of the hidden path, the index of its state
        emission:list of str
            all the emission symbols
            
        Raises
        ----------
        IndexError
            if the file has fewer than 7 lines
        ValueError
            if the string or the path contains a symbol that is missing
            from the alphabet / state list
        """
        with open(self.file) as rawData:
            data=rawData.readlines()
        string=data[0].rstrip() #the first row is the observable string
        # split() without a separator accepts tabs as well as spaces
        emission=data[2].split() #emission list
        indexString=[emission.index(i) for i in string] #the index of string
        path=data[4].rstrip() #hidden path
        state=data[6].split() #state list
        indexPath=[state.index(i) for i in path] #the index of path
        return indexString,state,indexPath,emission
    
    @staticmethod
    def _normalizeRows(counts):
        """
        Turn a matrix of counts into row-wise probabilities.
        
        A row whose counts are all zero becomes a uniform distribution over
        the columns of the matrix, so every row sums to 1 (before rounding).
        
        Parameters
        ----------
        counts:ndarray
            2-D array of non-negative counts
        
        Return
        ----------
        ndarray
            a new float array with the same shape as counts
        """
        probs=counts.astype(float) #astype returns a copy, counts is left untouched
        for row in probs: #each row is a view, so in-place updates reach probs
            total=row.sum()
            if total!=0: #if there are such pairs
                row/=total
            elif row.size: #nothing observed: uniform over the columns
                row[:]=1/row.size
        return probs
    
    def estimateEmit(self,string,state,path,emission):
        """
        Estimate emission Matrix 
        
        Parameters
        ----------
        string:list of int
            the index (in emission) of every emitted symbol
        state:list
            all the states
        path:list of int
            the index (in state) of every position of the hidden path
        emission:list
            all the emission 
        
        Return
        ----------
        np.around(emissionMatrix, decimals=3):ndarray
            The emission Matrix, one row per state and one column per
            symbol. A state that never occurs in the path gets the uniform
            row 1/len(emission). Values are rounded to 3 decimals, so a row
            may sum to slightly more or less than 1.
        """
        emissionMatrix=np.zeros([len(state),len(emission)]) #initialize the emissionMatrix with 0
        for i in range(len(path)): #count the (state, symbol) pairs
            emissionMatrix[path[i],string[i]]+=1
        return np.around(self._normalizeRows(emissionMatrix), decimals=3)
    
    def estimateTrans(self,string,state,path,emission):
        """
        Estimate transition Matrix 
        
        Parameters
        ----------
        string:list of int
            the index (in emission) of every emitted symbol; not used here,
            kept so both estimators share one signature
        state:list
            all the states
        path:list of int
            the index (in state) of every position of the hidden path
        emission:list
            all the emission; not used here
        
        Return
        ----------
        np.around(transMatrix, decimals=3):ndarray
            The transition Matrix, one row and one column per state. A
            state with no outgoing transition in the path gets the uniform
            row 1/len(state). Values are rounded to 3 decimals.
        """
        transMatrix=np.zeros([len(state),len(state)])#initialize the transMatrix with 0
        for i in range(len(path)-1):#calculate the counts of 2-mer states
            transMatrix[path[i],path[i+1]]+=1
        return np.around(self._normalizeRows(transMatrix), decimals=3)

### Main

In [52]:
def main(infile):
    '''
    Estimate the transition and emission matrices of an HMM from the given
    file and print them as tab-separated tables.
    
    Parameters
        ----------
        infile : str 
            the filename  

        Returns
        -------
        STDOUT
            the transition table, a '--------' separator line, then the
            emission table; each table has a header row and one labelled
            row per state
    '''
    def formatRow(label, values):
        # row label, then every value, all separated by tabs
        return label+'\t'+'\t'.join([str(value) for value in values])

    model=ProbString(infile) #instantiation
    symbolIndex,stateNames,pathIndex,alphabet=model.readHMM() #parse the input file
    transTable=model.estimateTrans(symbolIndex,stateNames,pathIndex,alphabet) #estimated transition matrix
    emitTable=model.estimateEmit(symbolIndex,stateNames,pathIndex,alphabet) #estimated emission matrix

    # transition table: states label both the rows and the columns
    print(formatRow('',stateNames))
    for rowNumber,label in enumerate(stateNames):
        print(formatRow(label,transTable[rowNumber]))

    print('--------')#separate two matrices

    # emission table: states label the rows, symbols label the columns
    print(formatRow('',alphabet))
    for rowNumber,label in enumerate(stateNames):
        print(formatRow(label,emitTable[rowNumber]))

### Run the program here

In [53]:
# Entry point: estimate and print both matrices for the input file named below.
if __name__ == "__main__":
    main('rosalind_ba10h.txt')

	A	B	C	D
A	0.16	0.24	0.32	0.28
B	0.231	0.231	0.231	0.308
C	0.28	0.32	0.28	0.12
D	0.304	0.261	0.174	0.261
--------
	x	y	z
A	0.4	0.2	0.4
B	0.308	0.423	0.269
C	0.2	0.4	0.4
D	0.083	0.5	0.417
