# A Two-State HMM: Vowels and Consonants
## Part 1.
1. Code for setting up the states.
2. Initialize A,B,Pi as distributions. 
3. Calculate alpha and beta.
4. Calculate the probability of each word in two ways so they match

## Part 2. Expectation & Part 3. Maximization
### Training objective: minimize total plog (negative base-2 log)
1. Calculate the soft (or expected) count of each letter.
2. Set up tables for soft counts in general, and word-initially
3. Recomputing A, B, and Π.
4. Putting expectation and maximization into a loop sensibly, with a stop condition that makes sense.
5. Show the preference of each letter for one of the states by computing the log ratios of the emission proabilities
6. Determine the values of A for the best analysis (highest probability) of the English data.

## Part 4. Viterbi

In [19]:
import numpy as np
import pandas as pd
np.random.seed(25020)

In [46]:
def read_corpus(file):
    with open(file, 'rt') as f:
        lines = f.readlines()
    lines = [line.rstrip().lower() + '#' for line in lines]
    return lines

def create_lookup(corpus):
    alphabet = set()
    for word in corpus:
        alphabet.update(list(word))
    idx_to_char_dict = sorted(alphabet)
    char_to_idx_dict = {char: idx for idx, char in enumerate(idx_to_char_dict)}
    return idx_to_char_dict, char_to_idx_dict

# encode and decode sequence
def encode(lookup, seq):
    ret = np.array([lookup[char] for char in seq]) # an idx array
    return ret

In [47]:
class HMM:
    def __init__(self, num_states, num_emissions):
        self.num_states = num_states
        self.num_emissions = num_emissions
        
        self.pi = np.random.random(num_states).astype('float128')
        self.A = np.random.random((num_states, num_states)).astype('float128')
        self.B = np.random.random((num_emissions, num_states)).astype('float128')
        # normalize probabilities
        self.pi /= self.pi.sum() # sum up to 1
        self.A /= self.A.sum(axis=1) # row summing to 1
        self.B /= self.B.sum(axis=0) # col summing to 1
        
        self.alpha = None
        self.beta = None
        self.xi = None
        
        self.plog = 0
        
    def train(self, seqs):
        self.plog = 0
        self.pi_update = np.zeros(self.num_states)
        self.A_update = np.zeros((self.num_states, self.num_states)) 
        self.B_update = np.zeros((self.num_emissions, self.num_states))
        num_seq = 0
        for seq in seqs:
            self.expectation(seq)
            self.maximization(seq)
            num_seq += 1
        # acutally apply the maximization updates
        self.pi = self.pi_update / num_seq
        self.A = self.A_update / num_seq
        self.B = self.B_update / num_seq

        return self.plog
        
    def forward(self, seq):
        T = seq.shape[0]
        self.alpha = np.empty((T, self.num_states), dtype='float128') # T * num_states
        self.alpha[0] = self.pi * self.B[seq[0]]
        for t in range(1, T):
            self.alpha[t] = self.alpha[t - 1] @ self.A * self.B[seq[t]]
    
    def backward(self, seq):
        num_states = self.A.shape[0]
        T = seq.shape[0]
        self.beta = np.empty((T, self.num_states), dtype='float128')
        self.beta[T - 1] = 1
        for t in range(T - 2, -1, -1):
            self.beta[t] = self.A * self.B[seq[t + 1]] @ self.beta[t + 1]
            
    def alpha_prob(self):
        return sum(self.alpha[-1])
    
    def beta_prob(self, seq):
        return sum(self.pi * self.B[seq[0]] * self.beta[0])
    
    def expectation(self, seq):
        self.forward(seq)
        self.backward(seq)
        self.plog += -np.log2(self.alpha_prob())

        T = seq.shape[0]
        alpha_p = self.alpha_prob()
        self.xi = np.empty((T - 1, self.num_states, self.num_states))
        for t in range(T - 1):
            for i in range(self.num_states):
                for j in range(self.num_states):
                    numerator = self.alpha[t, i] * self.A[i, j] * \
                    self.B[seq[t + 1], j] * self.beta[t + 1, j]
                    self.xi[t, i, j] = numerator
        self.xi /= self.alpha_prob()
        
        self.gamma = self.alpha * self.beta / alpha_p
    
    def maximization(self, seq):
        # record updates based on expectations
        # update pi
        self.pi_update += self.gamma[0] # soft counts of each state at time 1
        # update A
        numerator = self.xi.sum(axis=0) # soft counts of transitions from i to j
        denom = self.gamma[:-1].sum(axis=0) # soft counts of transitions out of i
        self.A_update += numerator / denom[:, None]
        # update B
        temp = np.empty((self.num_emissions, self.num_states), dtype='float128')
        for k in range(self.num_emissions):
            temp[k] = self.gamma[seq == k].sum(axis=0)
        self.B_update += temp / denom
        
    def check_probabilities(self):
        assert np.isclose(self.pi.sum(), 1)
        assert np.allclose(self.A.sum(axis=1), 1)
        assert np.allclose(self.B.sum(axis=0), 1)

# Training

In [74]:
corpus = read_corpus('english1000.txt')
idx_to_char_dict, char_to_idx_dict = create_lookup(corpus)
num_states = 2
hmm = HMM(num_states, len(idx_to_char_dict))
seqs = [encode(char_to_idx_dict, word) for word in corpus]

In [79]:
for i in range(50):
    plog = hmm.train(seqs)
    if i % 10 == 0:
        print(i, plog)

0 21681.110577283260897
10 21578.03338733745411
20 21468.360200198870347
30 21351.992836152168897
40 21228.982993432943045


In [89]:
# analyze log ratios of emissions from the two states
np.set_printoptions(precision=3, suppress=True)
error = 1e-20 # prevent division by 0
logs = np.log(hmm.B[:, 0] / hmm.B[:, 1] + error)
letters = [idx_to_char_dict[idx] for idx in range(logs.shape[0])]
df = pd.DataFrame.from_records({'Letter': letters, 'Log Ratio': logs})

In [98]:
df[df['Log Ratio'] > 0].sort_values(by='Log Ratio', ascending=False)

Unnamed: 0,Letter,Log Ratio
19,q,553.387894
12,j,3.468864
4,b,2.634073
25,w,1.623673
8,f,1.600722
21,s,1.196283
5,c,1.056815
18,p,0.884228
15,m,0.862484
9,g,0.625687


In [99]:
df[df['Log Ratio'] <= 0].sort_values(by='Log Ratio')

Unnamed: 0,Letter,Log Ratio
0,#,-46.051702
26,x,-46.051702
28,z,-46.051702
1,',-46.051702
2,.,-46.051702
7,e,-1.486292
23,u,-1.378457
17,o,-1.344148
24,v,-1.317596
16,n,-1.092313
