In [1]:
import numpy as np
import itertools

# this is decode.py

def generate_mcombs(alphabet, m):
    """
    Enumerate every length-m sequence whose elements are drawn from alphabet.

    Elements may repeat, so an alphabet of size n yields n ** m sequences.
    Sequences are ordered so that the first position varies fastest and the
    last position varies slowest.

    :param alphabet: Iterable of the elements a position may take.
    :param m: Number of positions in each generated sequence.
    :return: List of sequences, each one a list of m elements.
    """
    # Start from the single empty sequence and grow it one position at a time.
    sequences = [[]]
    for _ in range(m):
        extended = []
        # Symbol is the outer loop so the newest position is the slowest one.
        for symbol in alphabet:
            for prefix in sequences:
                extended.append(prefix + [symbol])
        sequences = extended

    # Debug: report how many sequences were produced
    print(f"Generated {len(sequences)} combinations of length {m}")

    return sequences

def compute_prob(x, y, W, T):
    """
    Score a label sequence y against an observation sequence x.

    The score is the sum, over positions, of the dot product between the
    observation row and the weight row of its label, plus the transition
    entry for every pair of adjacent labels. The value is an unnormalized
    score rather than a normalized probability.

    :param x: 2-D array of observations, one row per position.
    :param y: Sequence of label indices, one per row of x.
    :param W: Weight matrix indexed by label along its first axis.
    :param T: Transition matrix indexed as T[label, next_label].
    :return: Emission total plus transition total for the sequence.
    """
    last = len(x) - 1
    emission_total = 0
    transition_total = 0

    # Every position except the last has an outgoing transition.
    for pos in range(last):
        label = y[pos]
        emission_total += np.dot(x[pos, :], W[label, :])
        transition_total += T[label, y[pos + 1]]

    # The final position only contributes its emission term.
    emission_total += np.dot(x[last, :], W[y[last], :])

    return emission_total + transition_total

In [2]:

def find_max(x, combinations, W, T):
    """
    Exhaustively search the candidate label sequences for the best-scoring one.

    Each candidate is scored with compute_prob; the first candidate that
    attains the highest score is kept. If no candidate scores above negative
    infinity (for example when combinations is empty), the result is
    (None, -inf).

    :param x: Sequence of observations.
    :param combinations: Iterable of candidate label sequences.
    :param W: Weight matrix.
    :param T: Transition matrix.
    :return: Tuple of (best label sequence, its score).
    """
    best_score = float('-inf')
    best_labels = None

    for candidate in combinations:
        score = compute_prob(x, candidate, W, T)
        # Strict comparison keeps the earliest candidate on ties.
        if score > best_score:
            best_labels, best_score = candidate, score

    # Debug: report the winning sequence and its score
    print(f"Most likely sequence: {best_labels}, Score: {best_score}")

    return best_labels, best_score

def max_sum(X, W, T):
    """
    Decode the highest-scoring label sequence for one observation sequence
    using the max-sum (Viterbi-style) dynamic program.

    The number of labels is taken from the number of rows of W, so the
    function works for any alphabet size rather than only 26 labels.

    :param X: 2-D array of observations, shape (sequence_length, num_features).
    :param W: Weight matrix, shape (num_labels, num_features).
    :param T: Transition matrix, shape (num_labels, num_labels), indexed as
        T[label, next_label].
    :return: Integer array of length sequence_length holding the decoded
        label index for each position (empty if X has no rows).
    """
    X = np.asarray(X)
    W = np.asarray(W)
    T = np.asarray(T)

    seq_len = X.shape[0]
    y_star = np.zeros(seq_len, dtype=int)

    # Nothing to decode: avoid indexing X[-1] on an empty sequence.
    if seq_len == 0:
        print("Decoded sequence:", y_star)
        return y_star

    num_labels = W.shape[0]

    # emission[i, k] = W[k] . X[i]; computed once instead of inside the loops.
    emission = np.dot(X, W.T)

    # trellis[i, j] is the best score of any labelling of positions 0..i-1,
    # including the transition into label j at position i.
    trellis = np.zeros((seq_len, num_labels))
    for i in range(1, seq_len):
        # scores[k, j]: previous label k (rows) moving to current label j (cols).
        scores = (emission[i - 1][:, None] + T) + trellis[i - 1][:, None]
        trellis[i] = scores.max(axis=0)

    # Pick the best final label, then walk backwards re-deriving each
    # predecessor given the label already chosen for the following position.
    y_star[-1] = np.argmax(emission[-1] + trellis[-1])
    for i in range(seq_len - 1, 0, -1):
        candidates = (emission[i - 1] + T[:, y_star[i]]) + trellis[i - 1]
        y_star[i - 1] = np.argmax(candidates)

    # Debug: print the final most likely label sequence
    print("Decoded sequence:", y_star)

    return y_star

These functions implement decoding for a linear-chain CRF (Conditional Random Field), a model used for sequence labeling tasks. The generate_mcombs function generates all possible label sequences of length m from a given alphabet. The compute_prob function computes the unnormalized score of a given label sequence for an observation sequence (the sum of the per-position feature scores and the transition scores between adjacent labels), not a normalized probability. The find_max function iterates over all candidate label sequences by brute force to find the one with the highest score. Finally, the max_sum function decodes the highest-scoring label sequence for a given observation sequence using the max-sum algorithm, a dynamic programming approach that avoids enumerating every possible sequence.