In [13]:
import numpy

In [93]:
import numpy as np
import os

# All data files are resolved relative to the directory the kernel runs in.
current_directory = os.getcwd()

# Names of the individual data files (all live in the "data" sub-directory).
decode_input_file, train_struct_file, model_file = (
    "decode_input.txt",
    "train_struct.txt",
    "model.txt",
)
train_data_file, test_data_file = "train.txt", "test.txt"

# Full path of every data file: <current_directory>/data/<file name>.
(
    decode_input_path,
    train_struct_path,
    model_path,
    train_data_path,
    test_data_path,
) = (
    os.path.join(current_directory, "data", file_name)
    for file_name in (
        decode_input_file,
        train_struct_file,
        model_file,
        train_data_file,
        test_data_file,
    )
)

# Echo the resolved locations so a wrong working directory is noticed early.
print("File paths:")
print("Decode input path:", decode_input_path)
print("Train struct path:", train_struct_path)
print("Model path:", model_path)
print("Train data path:", train_data_path)
print("Test data path:", test_data_path)


def read_decode_input(file_path = decode_input_path):
    """
    Read the decoding inputs: observations X, label weights W and transitions T.

    The file is parsed as a flat list of whitespace-separated numbers laid out as
    100 * 128 values for X, then 26 * 128 values for W, then 26 * 26 values for T.
    Blank lines are ignored, so the file may or may not end with a newline.

    The T section is reshaped to 26x26 and then transposed, i.e. the value at
    offset 26 * j + i of the T section ends up in T[i, j].

    :param file_path: Path of the decode-input text file.
    :return: Tuple (X, W, T) with shapes (100, 128), (26, 128) and (26, 26).
    :raises ValueError: If a token is not a number or the number of values does
        not match the expected layout.
    """
    n_observations, n_features, n_labels = 100, 128, 26

    with open(file_path, "r") as f:
        # split() without arguments drops empty tokens, so a missing or repeated
        # trailing newline can neither lose the last value nor break parsing.
        values = np.array(f.read().split(), dtype=float)

    x_end = n_observations * n_features
    w_end = x_end + n_labels * n_features
    expected = w_end + n_labels * n_labels
    if values.size != expected:
        raise ValueError(
            f"{file_path}: expected {expected} values, found {values.size}"
        )

    X = values[:x_end].reshape(n_observations, n_features)
    W = values[x_end:w_end].reshape(n_labels, n_features)
    T = np.swapaxes(values[w_end:].reshape(n_labels, n_labels), 0, 1)

    print("Shapes of decode input:")
    print("X:", X.shape, "W:", W.shape, "T:", T.shape)

    return X, W, T

def read_train_struct(file_path = train_struct_path):
    """
    Read the train_struct data: one labelled, sparsely encoded example per line.

    Each non-blank line is split on whitespace into
    ``<label> <4-char prefix><id> <index>:<value> ...``.
    The label and the feature indices are 1-based in the file and are shifted to
    0-based here. The first four characters of the second field are discarded
    (presumably a ``qid:`` prefix -- confirm against the data format) and the
    rest is parsed as an integer id.

    :param file_path: Path of the train_struct text file.
    :return: Tuple (dataX, dataY) where dataX is an int array of shape
        (n_examples, 128) and dataY is a list of [label, id] pairs.
    :raises ValueError: If a field cannot be parsed as an integer.
    """
    dataX, dataY = [], []
    with open(file_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines explicitly rather than assuming that exactly
                # the last line is empty (which would drop a real example when
                # the file has no trailing newline).
                continue

            dataY.append([int(fields[0]) - 1, int(fields[1][4:])])

            features = np.zeros(128, dtype=int)
            for item in fields[2:]:
                idx, val = item.split(":")
                features[int(idx) - 1] = int(val)
            dataX.append(features)

    dataX_np = np.array(dataX, dtype=int)
    print("Shapes of train_struct:")
    print("dataX:", dataX_np.shape, "dataY length:", len(dataY))

    return dataX_np, dataY

def read_model(file_path = model_path):
    """
    Read the model parameters: label weights W and transition matrix T.

    The file is parsed as a flat list of whitespace-separated numbers laid out as
    26 * 128 values for W followed by 26 * 26 values for T. Blank lines are
    ignored, so the file may or may not end with a newline.

    The T section is reshaped to 26x26 and then transposed, i.e. the value at
    offset 26 * j + i of the T section ends up in T[i, j].

    :param file_path: Path of the model text file.
    :return: Tuple (W, T) with shapes (26, 128) and (26, 26).
    :raises ValueError: If a token is not a number or the number of values does
        not match the expected layout.
    """
    n_labels, n_features = 26, 128

    with open(file_path, "r") as f:
        # split() without arguments drops empty tokens, so a missing or repeated
        # trailing newline can neither lose the last value nor break parsing.
        values = np.array(f.read().split(), dtype=float)

    w_end = n_labels * n_features
    expected = w_end + n_labels * n_labels
    if values.size != expected:
        raise ValueError(
            f"{file_path}: expected {expected} values, found {values.size}"
        )

    W = values[:w_end].reshape(n_labels, n_features)
    T = np.swapaxes(values[w_end:].reshape(n_labels, n_labels), 0, 1)

    print("Shapes of model data:")
    print("W:", W.shape, "T:", T.shape)

    return W, T

def read_train(file_path = train_data_path):
    """
    Read the training data and group its rows into sequences.

    Each non-blank row is split on whitespace. Field 1 is a lowercase letter that
    is mapped to a label index (a -> 0, ..., z -> 25), field 2 is an integer whose
    negative value marks the last row of a sequence, and fields 5 onwards are the
    feature values. Fields 0, 3 and 4 are not used.

    Rows that follow the last end-of-sequence marker are not returned.

    :param file_path: Path of the training text file.
    :return: List of (features, labels) tuples, one per sequence, where features
        is a float array with one row per letter and labels is an int array.
    :raises KeyError: If field 1 is not a lowercase ASCII letter.
    :raises ValueError: If a numeric field cannot be parsed.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    dataX, dataY = [], []
    tempX, tempY = [], []
    with open(file_path, "r") as f:
        for line in f:
            row = line.split()
            if not row:
                # Skip blank lines explicitly instead of unconditionally dropping
                # the last row, which loses data when there is no trailing newline.
                continue

            tempY.append(mapping[row[1]])
            tempX.append(np.array(row[5:], dtype=float))
            if int(row[2]) < 0:  # End of sequence
                dataX.append(np.array(tempX))
                dataY.append(np.array(tempY, dtype=int))
                tempX, tempY = [], []  # Reset for the next sequence

    print("Number of training sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])

    return list(zip(dataX, dataY))

import numpy as np

def read_test(file_path = test_data_path):
    """
    Read the test data and group its rows into sequences.

    Each non-blank row is split on whitespace. Field 1 is a lowercase letter that
    is mapped to a label index (a -> 0, ..., z -> 25), field 2 is an integer whose
    negative value marks the last row of a sequence, and fields 5 onwards are the
    feature values. Fields 0, 3 and 4 are not used.

    Rows that follow the last end-of-sequence marker are not returned.

    :param file_path: Path of the test text file.
    :return: List of (features, labels) tuples, one per sequence, where features
        is a float array with one row per letter and labels is an int array.
    :raises KeyError: If field 1 is not a lowercase ASCII letter.
    :raises ValueError: If a numeric field cannot be parsed.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    dataX, dataY = [], []
    tempX, tempY = [], []
    with open(file_path, "r") as f:
        for line in f:
            row = line.split()
            if not row:
                # Only genuinely blank lines are skipped; the last row is kept
                # when the file does not end with a newline.
                continue

            tempY.append(mapping[row[1]])
            tempX.append(np.array(row[5:], dtype=float))
            if int(row[2]) < 0:  # Check for the end of a sequence
                dataX.append(np.array(tempX))
                dataY.append(np.array(tempY, dtype=int))
                tempX, tempY = [], []  # Reset for the next sequence

    print("Number of test sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])

    return list(zip(dataX, dataY))

def read_test_decoder_modified(file_path = test_data_path):
    """
    Load only the feature vectors of the test file as one 2-D array.

    Every non-empty line contributes one row made of its space-separated fields
    from position 5 onwards, converted to float. Labels and sequence boundaries
    are ignored, so all letters end up stacked in file order.
    """
    with open(file_path, 'r') as handle:
        lines = handle.read().strip().split('\n')

    # One list of floats per non-empty line; the first five fields are metadata.
    feature_rows = [
        [float(token) for token in line.split(' ')[5:]]
        for line in lines
        if line
    ]

    stacked = np.array(feature_rows)

    print("Shape of test data for decoder:", stacked.shape)
    print("Top 5 feature vectors:\n", stacked[:5, :])

    return stacked

File paths:
Decode input path: C:\Users\prana\Desktop\Fall 23\Adv ML\LAB1\data\decode_input.txt
Train struct path: C:\Users\prana\Desktop\Fall 23\Adv ML\LAB1\data\train_struct.txt
Model path: C:\Users\prana\Desktop\Fall 23\Adv ML\LAB1\data\model.txt
Train data path: C:\Users\prana\Desktop\Fall 23\Adv ML\LAB1\data\train.txt
Test data path: C:\Users\prana\Desktop\Fall 23\Adv ML\LAB1\data\test.txt


In [80]:
# Load the decoding inputs from the default path: observations X, per-label
# weight vectors W and the transition matrix T.
X, W, T = read_decode_input()

Shapes of decode input:
X: (100, 128) W: (26, 128) T: (26, 26)


In [81]:
# Sanity check: number of rows in W, i.e. one weight vector per label.
len(W)

26

In [82]:
import numpy as np
import itertools

def generate_mcombs(alphabet, m):
    """
    Enumerate every length-m sequence that can be built from the alphabet.

    Sequences are grown one position at a time; within the result the first
    position varies fastest and the last position slowest.

    :param alphabet: List or array of possible elements.
    :param m: Length of the sequences to generate.
    :return: List of lists, one per sequence (``[[]]`` when m is 0).
    """
    sequences = [[]]
    for _ in range(m):
        # Extend every sequence built so far by each symbol in turn.
        extended = []
        for symbol in alphabet:
            for prefix in sequences:
                extended.append(prefix + [symbol])
        sequences = extended

    # Debug: report how many sequences were produced
    print(f"Generated {len(sequences)} combinations of length {m}")

    return sequences

def compute_prob(x, y, W, T):
    """
    Score a label sequence y against an observation sequence x.

    The score is the sum of the per-position terms <x[i], W[y[i]]> plus the sum
    of the transition terms T[y[i], y[i + 1]] between consecutive positions.

    :param x: 2-D array of observations, one row per position.
    :param y: Sequence of label indices, one per position of x.
    :param W: Weight matrix, one row per label.
    :param T: Transition matrix indexed as T[current label, next label].
    :return: The (unnormalised) score of the labelling.
    """
    last = len(x) - 1
    emission_score = 0
    transition_score = 0

    # Every position except the last has an emission and an outgoing transition.
    for pos in range(last):
        label, next_label = y[pos], y[pos + 1]
        emission_score = emission_score + np.dot(x[pos, :], W[label, :])
        transition_score = transition_score + T[label, next_label]

    # The final position contributes an emission only.
    emission_score = emission_score + np.dot(x[last, :], W[y[last], :])

    return emission_score + transition_score


def find_max(x, combinations, W, T):
    """
    Exhaustively search the candidate labellings for the highest-scoring one.

    Each candidate is scored with compute_prob; on ties the earliest candidate
    wins because only a strictly better score replaces the current best.

    :param x: Sequence of observations.
    :param combinations: Iterable of candidate label sequences.
    :param W: Weight matrix.
    :param T: Transition matrix.
    :return: Tuple (best label sequence, its score); (None, -inf) when no
        candidate scores above -inf.
    """
    best_score = float('-inf')
    best_labels = None

    for candidate in combinations:
        score = compute_prob(x, candidate, W, T)
        if score > best_score:
            best_labels, best_score = candidate, score

    # Debug: report the winning labelling and its score
    print(f"Most likely sequence: {best_labels}, Score: {best_score}")

    return best_labels, best_score



def max_sum(X, W, T):
    """
    Decode the highest-scoring label sequence for X with the max-sum algorithm.

    The score of a labelling y is sum_i <W[y[i]], X[i]> + sum_i T[y[i], y[i + 1]].
    A forward pass fills ``trellis[i, j]`` with the best score of any labelling
    of positions 0..i-1 followed by a transition into label j at position i
    (the emission at position i itself is not included). The labels are then
    recovered by walking backwards from the best final label.

    The score of the best labelling is printed as a side effect.

    :param X: 2-D array of observations, one row per position.
    :param W: Weight matrix, one row per label; its row count defines the number
        of labels.
    :param T: Square transition matrix indexed as T[previous label, next label].
    :return: Int array with one label index per row of X (empty when X has no
        rows).
    """
    W = np.asarray(W)
    T = np.asarray(T)
    n_steps = X.shape[0]
    n_labels = W.shape[0]

    y_star = np.zeros(n_steps, dtype=int)
    if n_steps == 0:
        # Nothing to decode; avoids indexing X[-1] on an empty sequence.
        return y_star

    # emission[i, k] = <W[k], X[i]>, computed once instead of once per (j, k) pair.
    emission = np.dot(X, W.T)

    # Forward pass.
    trellis = np.zeros((n_steps, n_labels))
    for i in range(1, n_steps):
        # incoming[k, j]: best score ending in label k at i-1, then moving to j.
        incoming = emission[i - 1][:, None] + T + trellis[i - 1][:, None]
        trellis[i] = incoming.max(axis=0)

    # Best final label: add the emission of the last position.
    final_scores = emission[-1] + trellis[-1]
    y_star[-1] = np.argmax(final_scores)

    # Print the score of the optimal path
    print(final_scores[y_star[-1]])

    # Backward pass: pick the predecessor that explains the already chosen label.
    for i in range(n_steps - 1, 0, -1):
        scores = emission[i - 1] + T[:, y_star[i]] + trellis[i - 1]
        y_star[i - 1] = np.argmax(scores)

    return y_star

In [83]:
# Decode the most likely label index for every row of X; max_sum also prints
# the score of the decoded path.
y_star = max_sum(X, W, T)

199.41772558210562


In [84]:
from string import ascii_lowercase
# Lookup table from 0-based label index to letter: 0 -> 'a', ..., 25 -> 'z'.
mapping = dict(enumerate(ascii_lowercase))


In [85]:
# Display the decoded label indices (0-based).
y_star

array([17, 10, 22,  8, 18, 14, 11, 21, 22,  4, 10, 10, 19,  6, 18, 13,  7,
       14,  8,  8,  4,  4, 15,  2, 20,  5, 19, 20, 20, 21,  3, 25, 10, 20,
       10, 17,  0, 15,  4, 20,  1,  8,  4,  4, 16,  2, 25,  9, 15,  2, 25,
       12, 11,  3, 21,  9, 19, 21, 10, 10, 22, 15, 24,  3, 14, 25, 18,  7,
       12,  1, 11,  4,  9,  2, 11,  0,  7,  0,  2,  7, 19,  3, 15, 19,  5,
       19, 12, 13, 16, 23, 16, 19,  7, 11,  0, 10, 16,  0, 16, 22])

In [86]:
# Print one decoded label per line, shifted from the 0-based indices used
# internally to 1-based label numbers.
for i in range(y_star.shape[0]):
    print(y_star[i]+1) # +1 offset: y_star holds 0-based indices (0 to 25)

18
11
23
9
19
15
12
22
23
5
11
11
20
7
19
14
8
15
9
9
5
5
16
3
21
6
20
21
21
22
4
26
11
21
11
18
1
16
5
21
2
9
5
5
17
3
26
10
16
3
26
13
12
4
22
10
20
22
11
11
23
16
25
4
15
26
19
8
13
2
12
5
10
3
12
1
8
1
3
8
20
4
16
20
6
20
13
14
17
24
17
20
8
12
1
11
17
1
17
23


In [87]:
# Brute-force decoding: score every possible labelling of a short sequence and
# keep the best one.

# Reload the parameters and the train_struct examples. Only W and T from the
# decode input are used below; X is rebound but not read in this cell.
X, W, T = read_decode_input()
dataX, dataY = read_train_struct()

# Label indices 0..25 and the length of the sequences to enumerate.
alphabet = [ i for i in range(26) ]
m = 3

# All 26**m candidate labellings.
combinations = generate_mcombs(alphabet, m)

# Treat the first three train_struct rows as one 3-position sequence.
# find_max prints its result itself, and the returned tuple is printed again.
print(find_max(dataX[:3], combinations, W, T))
# The reported labels are 0-based indices; add one to each to get 1-based
# label numbers.


Shapes of decode input:
X: (100, 128) W: (26, 128) T: (26, 26)
Shapes of train_struct:
dataX: (25953, 128) dataY length: 25953
Generated 17576 combinations of length 3
Most likely sequence: [16, 5, 12], Score: 9.643693053917156
([16, 5, 12], 9.643693053917156)


In [92]:
import os
from string import ascii_lowercase

import numpy

# Learned parameters written by the CRF-master code. The path is resolved
# relative to the working directory (the same convention as the data file
# paths) instead of a machine-specific absolute path, so the notebook also runs
# from another checkout location.
best_params_path = os.path.join(os.getcwd(), "CRF-master", "result", "best_params")
f = numpy.loadtxt(best_params_path)

# The flat parameter vector holds 26 * 128 weights followed by 26 * 26
# transition values.
# NOTE(review): unlike read_model, T is not transposed after reshaping here --
# confirm that best_params stores T in the orientation max_sum expects.
W, T = f[:26*128].reshape((26, 128)), f[26*128:].reshape((26, 26))

# Feature vectors of every letter in the test file, stacked into one array.
X = read_test_decoder_modified()

# Decode all test letters as a single long sequence.
y_star = max_sum(X, W, T)

# Lookup table from 0-based label index to letter.
mapping = dict(enumerate(ascii_lowercase))

# Print one decoded label per line as a 1-based label number.
for i in range(y_star.shape[0]):
    print(y_star[i]+1)


Shape of test data for decoder: (26198, 128)
Top 5 feature vectors:
 [[0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
  0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0

In [29]:
# 3a and 3b is SVMhmm and SVMMC for plots