Hidden Markov Model (HMM) for phoneme transitions in the word 'speech'

Hidden states (phonemes): /s/, /p/, /i:/, /tʃ/
Observations (acoustic features): Energy, Pitch, Duration

In [1]:
# (a) Represent HMM parameters


# Hidden states are the phonemes of 'speech'; observations are acoustic features.
states = ["/s/", "/p/", "/i:/", "/tʃ/"]
observations = ["Energy", "Pitch", "Duration"]

# Initial distribution π: the word always starts with /s/, so that state
# carries all of the probability mass and every other state gets 0.0.
initial_prob = {
    phoneme: (1.0 if phoneme == "/s/" else 0.0) for phoneme in states
}


In [2]:
# Transition probabilities: transition_prob[current][next] = P(next | current).
# Outer key = state we are leaving (row), inner key = state we move to (column).
transition_prob = {
    "/s/": {"/s/": 0.1, "/p/": 0.8, "/i:/": 0.1, "/tʃ/": 0.0},
    "/p/": {"/s/": 0.0, "/p/": 0.1, "/i:/": 0.8, "/tʃ/": 0.1},
    "/i:/": {"/s/": 0.0, "/p/": 0.0, "/i:/": 0.2, "/tʃ/": 0.8},
    "/tʃ/": {"/s/": 0.2, "/p/": 0.0, "/i:/": 0.0, "/tʃ/": 0.8}
}

# Each row is a conditional distribution over next states, so it must sum to 1.
# The tolerance absorbs floating-point rounding in sums such as 0.1 + 0.8 + 0.1.
assert all(
    abs(sum(row.values()) - 1.0) < 1e-9 for row in transition_prob.values()
), "every row of transition_prob must sum to 1"

In [3]:
# Emission probabilities: emission_prob[state][observation] = P(observation | state).
# Outer key = hidden state (row), inner key = acoustic feature (column).
emission_prob = {
    "/s/":  {"Energy": 0.7, "Pitch": 0.2, "Duration": 0.1},
    "/p/":  {"Energy": 0.5, "Pitch": 0.3, "Duration": 0.2},
    "/i:/": {"Energy": 0.3, "Pitch": 0.5, "Duration": 0.2},
    "/tʃ/": {"Energy": 0.4, "Pitch": 0.4, "Duration": 0.2}
}

# Each row is a distribution over observations for one state, so it must sum
# to 1. The tolerance absorbs floating-point rounding (0.7 + 0.2 + 0.1 != 1.0
# exactly in binary floating point).
assert all(
    abs(sum(row.values()) - 1.0) < 1e-9 for row in emission_prob.values()
), "every row of emission_prob must sum to 1"

In [4]:
# ---------------------------------------------------
# (b) Neatly display the parameters of the HMM
# ---------------------------------------------------

def print_initial_prob(initial_prob):
    """Print the initial state distribution (π), one state per line.

    Args:
        initial_prob: Mapping from state label to its start probability.

    The ``P(start = ...)`` labels are padded to the longest one so that the
    ``=`` signs line up even when state labels differ in length
    (e.g. ``/s/`` vs ``/i:/``).
    """
    print("Initial State Probabilities (π):")
    labels = [f"P(start = {state})" for state in initial_prob]
    # default=0 keeps an empty distribution from raising inside max().
    width = max(map(len, labels), default=0)
    for label, prob in zip(labels, initial_prob.values()):
        print(f"  {label:<{width}} = {prob:.2f}")
    print()


def print_transition_matrix(states, transition_prob):
    """Print the state transition matrix (A) as an aligned text table.

    Args:
        states: Ordered state labels; used for both the rows (from-state)
            and the columns (to-state).
        transition_prob: Nested mapping indexed as
            ``transition_prob[from_state][to_state]``.

    Column widths are derived from the labels so the ``|`` separator and the
    cells line up between the header and every row. Header cells are
    right-aligned to match the right-aligned numbers beneath them.
    """
    print("State Transition Matrix (A):")
    corner = "From/To"
    labels = [str(s) for s in states]
    # List form so max() still works when `states` is empty.
    label_width = max([len(corner), *map(len, labels)])
    # 4 is the original minimum cell width; widen only if a label needs it.
    cell_width = max([4, *map(len, labels)])

    header = f"{corner:<{label_width}} | " + "  ".join(
        f"{label:>{cell_width}}" for label in labels
    )
    print(header)
    print("-" * len(header))
    for s_from, label in zip(states, labels):
        cells = "  ".join(
            f"{transition_prob[s_from][s_to]:>{cell_width}.1f}" for s_to in states
        )
        print(f"{label:<{label_width}} | {cells}")
    print()


def print_emission_matrix(states, observations, emission_prob):
    """Print the emission probability matrix (B) as an aligned text table.

    Args:
        states: Ordered state labels, one table row each.
        observations: Ordered observation labels, one table column each.
        emission_prob: Nested mapping indexed as
            ``emission_prob[state][observation]``.

    Column widths are derived from the labels so the ``|`` separator and the
    cells line up between the header and every row. Header cells are
    right-aligned to match the right-aligned numbers beneath them.
    """
    print("Emission Probability Matrix (B):")
    corner = "State"
    state_labels = [str(s) for s in states]
    obs_labels = [str(o) for o in observations]
    # List form so max() still works when either sequence is empty.
    label_width = max([len(corner), *map(len, state_labels)])
    # 8 is the original minimum cell width; widen only if a label needs it.
    cell_width = max([8, *map(len, obs_labels)])

    header = f"{corner:<{label_width}} | " + "  ".join(
        f"{label:>{cell_width}}" for label in obs_labels
    )
    print(header)
    print("-" * len(header))
    for s, label in zip(states, state_labels):
        cells = "  ".join(
            f"{emission_prob[s][o]:>{cell_width}.1f}" for o in observations
        )
        print(f"{label:<{label_width}} | {cells}")
    print()


In [5]:
# ---------------------------------------------------
# (c) Generate phoneme and observation sequences
# ---------------------------------------------------

def most_likely_next_state(current_state, transitions=None):
    """Return the next state with maximum transition probability.

    Args:
        current_state: State whose outgoing transitions are inspected.
        transitions: Optional nested mapping ``transitions[from][to]``.
            Defaults to the notebook-level ``transition_prob`` table, so
            existing one-argument calls behave as before.

    Returns:
        The key of the largest value in ``transitions[current_state]``. On
        ties, ``max`` returns the first maximal key in the row's iteration
        order.

    Raises:
        KeyError: If ``current_state`` has no row in the table.
    """
    if transitions is None:
        transitions = transition_prob
    outgoing = transitions[current_state]
    return max(outgoing, key=outgoing.get)


def most_likely_observation(state, emissions=None):
    """Return the observation with maximum emission probability.

    Args:
        state: Hidden state whose emission row is inspected.
        emissions: Optional nested mapping ``emissions[state][observation]``.
            Defaults to the notebook-level ``emission_prob`` table, so
            existing one-argument calls behave as before.

    Returns:
        The key of the largest value in ``emissions[state]``. On ties,
        ``max`` returns the first maximal key in the row's iteration order.

    Raises:
        KeyError: If ``state`` has no row in the table.
    """
    if emissions is None:
        emissions = emission_prob
    row = emissions[state]
    return max(row, key=row.get)


def generate_sequence(length=4):
    """
    Greedily generate one phoneme sequence and its observation sequence.

    Starts from the state with the highest initial probability, then
    repeatedly follows the most probable transition. Each visited state
    contributes its most probable observation.

    For the word 'speech' we expect 4 phonemes:
    ['/s/', '/p/', '/i:/', '/tʃ/']

    Args:
        length: Number of phonemes to generate. Values <= 0 yield two
            empty lists.

    Returns:
        A ``(phoneme_sequence, observation_sequence)`` tuple of two lists
        with the same number of elements.
    """
    # Without this guard the initial state would be emitted unconditionally,
    # so length=0 would wrongly return one-element sequences.
    if length <= 0:
        return [], []

    # Start from the state with highest initial probability (here, /s/)
    current_state = max(initial_prob, key=initial_prob.get)
    phoneme_sequence = [current_state]
    observation_sequence = [most_likely_observation(current_state)]

    # Generate remaining states
    while len(phoneme_sequence) < length:
        current_state = most_likely_next_state(current_state)
        phoneme_sequence.append(current_state)
        observation_sequence.append(most_likely_observation(current_state))

    return phoneme_sequence, observation_sequence


In [6]:
# ---------------------------------------------------
# (d) Main + simple inference
# ---------------------------------------------------

if __name__ == "__main__":
    # Show π, A and B before generating anything from them.
    print_initial_prob(initial_prob)
    print_transition_matrix(states, transition_prob)
    print_emission_matrix(states, observations, emission_prob)

    # Greedy pass through the model: one phoneme per step, one feature each.
    phonemes, observations_seq = generate_sequence(length=4)

    # sep="\n" places each list on the line after its caption.
    print("Generated phoneme sequence:", phonemes, sep="\n")
    print(
        "\nCorresponding acoustic observation sequence:",
        observations_seq,
        sep="\n",
    )

    # Written conclusion drawn from the tables and the generated sequences.
    print(
        "\nInference:",
        (
            "From the defined HMM, the most probable phoneme sequence for the word "
            "'speech' is /s/ → /p/ → /i:/ → /tʃ/. "
            "The associated acoustic observations show that the early consonants "
            "(/s/ and /p/) are most strongly linked with high Energy, "
            "while the vowel /i:/ is more strongly associated with Pitch. "
            "This simple HMM therefore captures both the typical phoneme order "
            "of the word and a plausible pattern of acoustic features."
        ),
        sep="\n",
    )

Initial State Probabilities (π):
  P(start = /s/) = 1.00
  P(start = /p/) = 0.00
  P(start = /i:/) = 0.00
  P(start = /tʃ/) = 0.00

State Transition Matrix (A):
From/To | /s/   /p/   /i:/  /tʃ/
--------------------------------
/s/    |  0.1   0.8   0.1   0.0
/p/    |  0.0   0.1   0.8   0.1
/i:/   |  0.0   0.0   0.2   0.8
/tʃ/   |  0.2   0.0   0.0   0.8

Emission Probability Matrix (B):
State  | Energy    Pitch     Duration
-------------------------------------
/s/   |      0.7       0.2       0.1
/p/   |      0.5       0.3       0.2
/i:/  |      0.3       0.5       0.2
/tʃ/  |      0.4       0.4       0.2

Generated phoneme sequence:
['/s/', '/p/', '/i:/', '/tʃ/']

Corresponding acoustic observation sequence:
['Energy', 'Energy', 'Pitch', 'Energy']

Inference:
From the defined HMM, the most probable phoneme sequence for the word 'speech' is /s/ → /p/ → /i:/ → /tʃ/. The associated acoustic observations show that the early consonants (/s/ and /p/) are most strongly linked with high Energ