In [10]:
import numpy
import numpy as np
import os

# Base path where your data resides, relative to your code directory
base_data_path = os.path.join("..", "data")

# Define file names
decode_input_file = "decode_input.txt"
train_struct_file = "train_struct.txt"
model_file = "model.txt"
train_data_file = "train.txt"
test_data_file = "test.txt"
parameter_file = "Parameters"
solution_file = "solution.txt"

# Build full paths by appending file names to the base data path
decode_input_path = os.path.join(base_data_path, decode_input_file)
train_struct_path = os.path.join(base_data_path, train_struct_file)
model_path = os.path.join(base_data_path, model_file)
train_data_path = os.path.join(base_data_path, train_data_file)
test_data_path = os.path.join(base_data_path, test_data_file)
parameter_path = os.path.join("..", "results", parameter_file)
solution_path = os.path.join("..", "results", solution_file)

# Now you can use these paths in your code
print("File paths:")
print("Decode input path:", decode_input_path)
print("Train struct path:", train_struct_path)
print("Model path:", model_path)
print("Train data path:", train_data_path)
print("Test data path:", test_data_path)
print("Parameter file path:", parameter_path)
print("Solution file path:", solution_path)



def read_decode_input(file_path = decode_input_path):
    """
    Reads the decode_input data from the file.
    Each line represents one letter with 128 elements.
    There are 26 weight vectors each with 128 elements and a transition matrix T with size 26x26.
    The transition matrix T is in row-major order.
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    X = np.array(raw_data[:100 * 128], dtype=float).reshape(100, 128)
    W = np.array(raw_data[100 * 128:100 * 128 + 26 * 128], dtype=float).reshape(26, 128)
    T = np.array(raw_data[100 * 128 + 26 * 128:-1], dtype=float).reshape(26, 26)
    T = np.swapaxes(T, 0, 1)
    
    print("Shapes of decode input:")
    print("X:", X.shape, "W:", W.shape, "T:", T.shape)
    #print("Top 5 rows of X:\n", X[:5])

    return X, W, T

def read_train_struct(file_path = train_struct_path):
    """
    Reads the train_struct data from the file.
    Each line represents a label and a feature vector (in a sparse representation).
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    for line in raw_data[:-1]:  # The last element is empty
        line = line.split(" ")
        dataY.append([int(line[0]) - 1, int(line[1][4:])])
        datax = np.zeros(128, dtype=int)
        for f1 in line[2:]:
            idx, val = f1.split(":")
            datax[int(idx) - 1] = int(val)
        dataX.append(datax)
    
    dataX_np = np.array(dataX, dtype=int)
    print("Shapes of train_struct:")
    print("dataX:", dataX_np.shape, "dataY length:", len(dataY))
    #print("Top 5 rows of dataX:\n", dataX_np[:5])
    
    return dataX_np, dataY

def read_model(file_path = model_path):
    """
    Reads the model data from the file.
    The data consists of weight vectors for each label and a transition matrix T.
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    W = np.array(raw_data[:26 * 128], dtype=float).reshape(26, 128)
    T = np.array(raw_data[26 * 128:-1], dtype=float).reshape(26, 26)
    T = np.swapaxes(T, 0, 1)
    
    print("Shapes of model data:")
    print("W:", W.shape, "T:", T.shape)
    #print("Top 5 rows of W:\n", W[:5])
    
    return W, T

def read_train(file_path = train_data_path):
    """
    Reads the train data from the file.
    Each row corresponds to an example and is split into the label and the feature vector.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:  # End of sequence
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []  # Reset for the next sequence

    print("Number of training sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])
    
    return list(zip(dataX, dataY))

import numpy as np

def read_test(file_path = test_data_path):
    """
    Reads the test data from the file.
    Each row corresponds to an example and is split into the label and the feature vector.
    The function assumes that each example ends when a row with the third column less than 0 is encountered.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:  # Skip the last empty line if it exists
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:  # Check for the end of a sequence
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []  # Reset for the next sequence

    print("Number of test sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])

    return list(zip(dataX, dataY))

def read_test_decoder_modified(file_path = test_data_path):
    """
    Reads the test data for decoding and returns a NumPy array
    where each sub-array from the list becomes a row in the final
    two-dimensional array. This function only extracts the features
    and does not deal with the labels.
    """
    with open(file_path, 'r') as file:
        raw_data = file.read().strip().split('\n')

    # Initialize an empty list to store all feature vectors
    dataX = []
    
    for row in raw_data:
        if row:  # Skip any empty lines
            features = row.split(' ')[5:]  # Features start from the 6th element in the row
            feature_vector = list(map(float, features))  # Convert string features to float
            dataX.append(feature_vector)

    # Convert the list of lists (features for each word) into a 2D NumPy array
    dataX_np = np.array(dataX)

    print("Shape of test data for decoder:", dataX_np.shape)
    #print("Top 5 feature vectors:\n", dataX_np[:5, :])

    return dataX_np

File paths:
Decode input path: ..\data\decode_input.txt
Train struct path: ..\data\train_struct.txt
Model path: ..\data\model.txt
Train data path: ..\data\train.txt
Test data path: ..\data\test.txt
Parameter file path: ..\results\Parameters
Solution file path: ..\results\solution.txt


In [3]:
import math
import numpy as np


def compute_log_p(X, y, W, T):
    """
    Computes the log probability of a sequence of labels given inputs X and parameters W, T.
    
    Parameters:
    X : 2D array where each row is the feature vector for one observation.
    y : 1D array of labels corresponding to the observations in X.
    W : Weight matrix where each row corresponds to the weights for one label.
    T : Transition matrix where T[i, j] is the transition weight from label i to label j.
    
    Returns:
    log probability of the label sequence given the inputs and parameters.
    """

    alpha_len = 26  # Alphabet size; ideally passed as a parameter for flexibility
    
    # Initialize the sum of numerator terms for log probability calculation
    sum_num = numpy.dot(W[y[0]], X[0])  # Initial state score
    for i in range(1, X.shape[0]):
        # Add scores for observed states and transitions
        sum_num += numpy.dot(W[y[i]], X[i]) + T[y[i-1], y[i]]
    
    # Initialize the forward trellis for dynamic programming
    trellisfw = numpy.zeros((X.shape[0], alpha_len))
    # Temporary storage for computations at each step
    interior = numpy.zeros(alpha_len)
    # Messages used in the forward pass for dynamic programming
    messages = numpy.zeros((26, 26))

    # Compute forward messages
    for i in range(1, X.shape[0]):
        # Compute interior scores based on current observation and previous states
        numpy.matmul(W, X[i-1], out=interior)
        numpy.add(interior, trellisfw[i-1], out=interior)
        # Compute messages for all transitions
        numpy.add(T, interior[:, numpy.newaxis], out=messages)
        # Normalize to avoid numerical instability
        maxes = messages.max(axis=0)
        numpy.add(messages, -1*maxes, out=messages)
        numpy.exp(messages, out=messages)
        # Sum messages to compute new state values
        numpy.sum(messages, axis=0, out=interior)
        numpy.log(interior, out=interior)
        # Update trellis with log-sums
        numpy.add(maxes, interior, out=trellisfw[i])

    # Compute final scores
    dots = numpy.matmul(W, X[-1])
    numpy.add(dots, trellisfw[-1], out=interior)

    # Normalize final log-sum to prevent underflow
    M = numpy.max(interior)
    numpy.add(interior, -1*M, out=interior)
    numpy.exp(interior, out=interior)
    
    # Calculate log partition function (log Z)
    log_z = M + math.log(numpy.sum(interior))

    # Return the log probability as difference of scores and log partition function
    return sum_num - log_z

import numpy as np
import math

def fb_prob(X, W, T):
    """
    Forward-backward algorithm to compute probabilities over label sequences.
    
    Parameters:
    - X: Input features for each observation in the sequence. Shape: [sequence_length, num_features]
    - W: Weight matrix for label features. Shape: [num_labels, num_features]
    - T: Transition matrix between labels. Shape: [num_labels, num_labels]
    
    Returns:
    - trellisfw: Forward probabilities. Shape: [sequence_length, num_labels]
    - trellisbw: Backward probabilities. Shape: [sequence_length, num_labels]
    - log_z: Log partition function, scalar.
    """
    sequence_length, num_features = X.shape
    num_labels = W.shape[0]
    
    # Initialize forward and backward trellises
    trellisfw = np.zeros((sequence_length, num_labels))
    trellisbw = np.zeros_like(trellisfw)
    
    # Forward pass
    for i in range(1, sequence_length):
        # Compute the weighted input features for all labels at this step
        weighted_inputs = np.dot(W, X[i-1])
        # Update the trellis with contributions from transitions and previous states
        for j in range(num_labels):
            transition_scores = T[:, j] + trellisfw[i-1]
            trellisfw[i, j] = log_sum_exp(weighted_inputs + transition_scores)
    
    # Backward pass
    trellisbw[-1, :] = 0  # Log-probability of 1 at the end of the sequence
    for i in range(sequence_length - 2, -1, -1):
        # Similar to forward pass but in reverse
        weighted_inputs = np.dot(W, X[i+1])
        for j in range(num_labels):
            transition_scores = T[j, :] + weighted_inputs
            trellisbw[i, j] = log_sum_exp(transition_scores + trellisbw[i+1])
    
    # Compute log partition function using the forward trellis
    final_forward_scores = np.dot(W, X[-1]) + trellisfw[-1]
    log_z = log_sum_exp(final_forward_scores)
    
    return trellisfw, trellisbw, log_z

def log_sum_exp(scores):
    """
    Numerically stable computation of log-sum-exp.
    
    Parameters:
    - scores: Input array of scores to be summed in log-space.
    
    Returns:
    - result: Log-sum-exp of input scores.
    """
    max_score = np.max(scores)
    return max_score + np.log(np.sum(np.exp(scores - max_score)))


# The following functions compute gradients for the weight matrix W and transition matrix T respectively
# given a single example (X, y), where X is the feature matrix for the sequence and y is the corresponding label sequence.

def log_p_wgrad(W, X, y, T):
    """
    Computes the gradient of the log probability with respect to the weight matrix W.
    
    Parameters:
    W : Weight matrix where each row corresponds to the weights for one label.
    X : 2D array where each row is the feature vector for one observation.
    y : 1D array of labels corresponding to the observations in X.
    T : Transition matrix where T[i, j] is the transition weight from label i to label j.
    
    Returns:
    Gradient of the log probability with respect to W.
    """
    grad_W = np.zeros(W.shape)  # Gradient matrix for W
    trellisfw, trellisbw, log_z = fb_prob(X, W, T)

    # Iterate over the sequence
    for i in range(X.shape[0]):
        # Combine forward and backward messages
        marginal = trellisfw[i] + trellisbw[i]
        # Incorporate the evidence from input features
        evidence = np.matmul(W, X[i])
        # Subtract the log partition function
        marginal -= log_z
        # Normalize to get probabilities
        marginal = np.exp(marginal)

        # Calculate the gradient for the current position
        for j in range(26):  # Iterate over all possible labels
            if j == y[i]:
                grad_W[j] += X[i]  # Add the feature vector for the true label
            grad_W[j] -= marginal[j] * X[i]  # Subtract the expected feature vector

    return grad_W


def log_p_tgrad(T, X, y, W):
    """
    Computes the gradient of the log probability with respect to the transition matrix T.
    
    Parameters:
    - T: Transition matrix where T[i, j] is the transition weight from label i to label j.
    - X: 2D array where each row is the feature vector for one observation.
    - y: 1D array of labels corresponding to the observations in X.
    - W: Weight matrix where each row corresponds to the weights for one label.
    
    Returns:
    - grad: Gradient of the log probability with respect to T.
    """
    num_labels = T.shape[0]
    grad = np.zeros_like(T)  # Initialize the gradient matrix for T with zeros
    
    # Compute the forward and backward probabilities and the log partition function (log Z)
    trellisfw, trellisbw, log_z = fb_prob(X, W, T)

    for i in range(X.shape[0] - 1):
        potential = np.zeros_like(T)  # Potential for transitions
        
        # Calculate potential scores for transitions considering features and transition scores
        for j in range(num_labels):
            for k in range(num_labels):
                potential[j, k] = np.dot(W[j], X[i]) + np.dot(W[k], X[i+1]) + T[j, k]
        
        potential += trellisfw[i][:, np.newaxis] + trellisbw[i+1]
        potential -= log_z  # Normalize by subtracting log partition function
        potential = np.exp(potential)  # Convert to probabilities
        
        # Update the gradient
        grad[y[i], y[i+1]] += 1  # Increment gradient for observed transition
        grad -= potential  # Subtract expected transition probabilities
        
    return grad



# Example usage:
# Assuming X, y, W, and T are already loaded
# grad_W = log_p_wgrad(W, X, y, T)
# grad_T = log_p_tgrad(T, X, y, W)
# The gradients are used in optimization algorithm to update W and T


In [4]:
import math, numpy
import sys
numpy.set_printoptions(threshold=sys.maxsize)

W, T = read_model()
data = read_train()

# Initialize gradient array
grad = numpy.zeros((26, 128))

import time

# Start timing
t0 = time.time()

# Loop through each data point
for i in range(len(data)):
    # Compute log probability gradient and add to grad array
    numpy.add(log_p_wgrad(W, data[i][0], data[i][1], T), grad, out=grad)

# End timing
t1 = time.time()

# Print elapsed time
print("Time taken:", t1 - t0, "seconds")



Shapes of model data:
W: (26, 128) T: (26, 26)
Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Time taken: 24.778865098953247 seconds


In [6]:
# Print the elapsed time
print(f"Time: {t1-t0}")

# Compute the average gradient
avg = numpy.divide(grad, len(data))

# Iterate through each element of the average gradient array
for i in range(26):
    for j in range(128):
        # Print the value of each element
        print(avg[i, j])
        
# this is 2a 

Time: 24.778865098953247
-0.0024057277392203046
-0.0016008939031424052
-0.001905491453634534
-0.004921188619842632
-0.003988676163585031
-0.001675510274691985
0.0026504655468429865
0.0016382586381457833
-0.001528609547161226
0.006119165589023254
0.010330882820046145
0.010867212999460554
0.015283817170994708
0.020957696643504716
0.016333767362104946
0.008902608558900042
-0.0003049464369496
0.016081468867792496
0.027510381274201298
0.046181168794575
0.0603529802631012
0.057255963015412865
0.03991871092933131
0.029852104586305158
0.008756802051789398
0.0429344640804203
0.08557242441731572
0.11783143684326965
0.12261465395740662
0.10156499094709183
0.07510711069893662
0.050523933798109
0.03658378230705551
0.138087013968371
0.20354518966288918
0.22695919517422497
0.21535792428392886
0.15927249231082932
0.11530874669150004
0.08439035556617416
0.13460084207348536
0.26447358416072475
0.2694624537218465
0.22369191743267886
0.225267695976453
0.19250434798768834
0.1475483083007004
0.0904403284108

In [7]:
# Set print options to display full arrays
numpy.set_printoptions(threshold=sys.maxsize)

# Read the model parameters
W, T = read_model()

# Read the training data
data = read_train()

# Initialize gradient array
grad = numpy.zeros((26, 26))

import time

# Start timing
t0 = time.time()

# Iterate through each data point
for i in range(len(data)):
    # Compute T gradient and add to grad array
    numpy.add(log_p_tgrad(T, data[i][0], data[i][1], W), grad, out=grad)

# End timing
t1 = time.time()

# Print the elapsed time
print(f"Time: {t1-t0}")

# Compute the average gradient
avg = numpy.divide(grad, len(data))

# Iterate through each element of the average gradient array
for i in range(26):
    for j in range(26):
        # Print the value of each element
        print(avg[j, i])

Shapes of model data:
W: (26, 128) T: (26, 26)
Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Time: 112.03873634338379
-0.005492964900728397
-0.0028287329084289163
0.05886267446828188
-0.009323359179155643
0.0139611182892585
-0.006350847583301298
-0.0032146103055229578
0.017048216581379918
0.040357692497905
-0.003165344715108468
0.01953569436231517
-0.00360237881130604
0.02574192474633251
-0.015763876924504615
0.016142268166404142
0.01574890267778991
-0.012400399686420155
0.07147637775023508
-0.012419645099838719
0.021639441409350105
0.010467479609014235
-0.008483937592805432
0.01790912403699967
-0.004559641604206523
-0.004560320114666668
0.007061935299392682
0.10119101894295401
0.01967391384773764
-0.0017461965501729684
-0.010848470577878067
-0.0027786535448625963
-0.00418174524406869
-0.0025440121

In [8]:
import itertools

def generate_mcombs(alphabet, m):
    """
    Generate all possible combinations of length m from the given alphabet.

    :param alphabet: List or array of possible elements.
    :param m: Length of the combinations to generate.
    :return: List of all combinations of the given length.
    """
    # Initialize a list to hold combinations
    combinations = [[]]
    for i in range(m): 
        # For each position in the combination, append all possible elements
        combinations = [j + [k] for k in alphabet for j in combinations]
    
    # Debug: print the number of combinations generated
    print(f"Generated {len(combinations)} combinations of length {m}")
    
    return combinations

def compute_prob(x, y, W, T):
    """
    Compute the score for a given sequence of observations x and label sequence y.

    :param x: Sequence of observations.
    :param y: Sequence of labels.
    :param W: Weight matrix.
    :param T: Transition matrix.
    :return: The score of the sequence.
    """
    # Initialize sums for features and transitions
    x_sum, t_sum = 0, 0
    # Calculate the score based on features and transitions
    for i in range(len(x) - 1):
        x_sum += np.dot(x[i, :], W[y[i], :])
        t_sum += T[y[i], y[i + 1]]
    # Add the score for the last observation
    x_sum += np.dot(x[len(x) - 1, :], W[y[len(x) - 1], :])

    return x_sum + t_sum


def find_max(x, combinations, W, T):
    """
    Find the most likely label sequence for a given observation sequence x.

    :param x: Sequence of observations.
    :param combinations: All possible label combinations.
    :param W: Weight matrix.
    :param T: Transition matrix.
    :return: The most likely label sequence and its corresponding score.
    """
    # Initialize the max score and corresponding label sequence
    max_val, likely_y = float('-inf'), None
    # Iterate through all combinations to find the best one
    for y in combinations:
        val = compute_prob(x, y, W, T)
        if max_val < val:
            max_val = val
            likely_y = y
    
    # Debug: print the most likely label sequence and its score
    print(f"Most likely sequence: {likely_y}, Score: {max_val}")
    
    return likely_y, max_val

def max_sum(X, W, T):
    """
    Decode the most likely label sequence for each observation sequence in X using the max-sum algorithm.

    :param X: Array of observation sequences.
    :param W: Weight matrix.
    :param T: Transition matrix.
    :return: Array of most likely label sequences for each observation sequence.
    """
    # Initialize the trellis for dynamic programming
    alpha_len = 26
    trellis = np.zeros((X.shape[0], alpha_len))
    interior = np.zeros(alpha_len)
    y_star = np.zeros(X.shape[0], dtype=int)

    # Fill in the trellis
    for i in range(1, X.shape[0]):
        for j in range(alpha_len):
            for k in range(alpha_len):
                interior[k] = np.dot(W[k], X[i - 1]) + T[k, j] + trellis[i - 1, k]
            trellis[i, j] = np.max(interior)

    # Backtrack to find the most likely sequence
    for i in range(alpha_len):
        interior[i] = np.dot(W[i], X[-1]) + trellis[-1, i]
    y_star[-1] = np.argmax(interior)

    for i in range(X.shape[0] - 1, 0, -1):
        for j in range(alpha_len):
            interior[j] = np.dot(W[j], X[i - 1]) + T[j, y_star[i]] + trellis[i - 1, j]
        y_star[i - 1] = np.argmax(interior)
    
    # Debug: print the final most likely label sequence
    print("Decoded sequence:", y_star)
    
    return y_star



In [9]:
# Read input for decoding
X, W, T = read_decode_input()

# Read training data for structured prediction
dataX, dataY = read_train_struct()

# Define the alphabet as a list of integers representing letters
alphabet = [i for i in range(26)]

# Define the size of combinations
m = 3

# Generate all combinations of length m from the alphabet
combinations = generate_mcombs(alphabet, m)

# Find the maximum probability sequence for the first 3 dataX samples
# by searching through the combinations using the Viterbi algorithm
print(find_max(dataX[:3], combinations, W, T))



Shapes of decode input:
X: (100, 128) W: (26, 128) T: (26, 26)
Shapes of train_struct:
dataX: (25953, 128) dataY length: 25953
Generated 17576 combinations of length 3
Most likely sequence: [16, 5, 12], Score: 9.643693053917156
([16, 5, 12], 9.643693053917156)


In [11]:
import numpy as np
from scipy.optimize import fmin_bfgs
import time
import pickle

# Reading training data and initial model parameters
X_y = read_train()  # Reading training data
W, T = read_model()  # Reading initial model parameters

def func(params, *args):
    """
    Objective function as specified in the handout. It calculates the negative log likelihood
    of the data given the parameters, regularized with L2 norm of the parameters.
    
    Parameters:
    params (numpy.ndarray): Flattened parameter vector containing both W and T.
    args (tuple): Additional arguments where args[0] is the dataset and args[1] is the regularization coefficient C.
    
    Returns:
    float: The value of the objective function.
    """
    # Reshape the first part of params to a 26x128 matrix for W, and the rest to a 26x26 matrix for T
    W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
    
    # Unpack the data and regularization parameter C from args
    data = args[0]  # The dataset, a list of tuples where each tuple contains X and y for a sample
    C = args[1]     # Regularization parameter
    
    # Initialize log_sum to accumulate the log probabilities of the sequence given the parameters
    log_sum = 0
    for example in data:  # Loop over each example in the dataset
        # Compute the log probability of the sequence given the model parameters, and add it to log_sum
        log_sum += compute_log_p(example[0], example[1], W, T)
    
    # Compute the L2 norm of each row in W and store in norm
    norm = numpy.zeros(26)  # Initialize a vector of zeros for storing the norms
    for i in range(26):     # Loop over each row of W
        norm[i] = numpy.linalg.norm(W[i])  # Compute L2 norm of the i-th row of W
    
    # Square the L2 norms in-place
    numpy.square(norm, out=norm)
    
    # Compute the average log probability of the sequences given the model parameters
    log_value = (1/len(data)) * log_sum
    # Print the average log probability for debugging or monitoring
    print("Provide the value of 1/ n∑nt=1 log p(yt|Xt) :", log_value)
    
    # Calculate the objective function value combining the negative average log probability, 
    # the L2 norm regularization for W, and the L2 norm regularization for T
    return -1 * (C / len(data)) * log_sum + 0.5 * numpy.sum(norm) + 0.5 * numpy.sum(numpy.square(T))


def func_prime(params, *args):
    """
    Derivative of the objective function as specified in the handout.
    
    Parameters:
    params (numpy.ndarray): Flattened parameter vector containing both W and T.
    args (tuple): Additional arguments where args[0] is the dataset and args[1] is the regularization coefficient C.
    
    Returns:
    numpy.ndarray: Gradient of the objective function with respect to the parameters.
    """
    # Reshape the parameters back into W and T matrices
    W = params[:26*128].reshape((26, 128))
    T = params[26*128:].reshape((26, 26))
    data = args[0]
    C = args[1]

    # Initialize gradients for W and T
    log_grad_w = np.zeros((26, 128))
    log_grad_t = np.zeros((26, 26))

    # Accumulate gradients from all examples
    for example in data:
        log_grad_w += log_p_wgrad(W, example[0], example[1], T)
        log_grad_t += log_p_tgrad(T, example[0], example[1], W)

    # Apply regularization term
    log_grad_w = -1*C/len(data)*log_grad_w + W
    log_grad_t = -1*C/len(data)*log_grad_t + T

    # Return the concatenated gradient vector
    return np.concatenate([log_grad_w.reshape(26*128), log_grad_t.reshape(26*26)])

# Initial guess for the optimization
initial_guess = np.zeros((26*128 + 26*26))

def get_params(x_y, C):
    """
    Optimizes the model parameters using the BFGS algorithm.
    
    Parameters:
    x_y (list): The training dataset.
    """
    t0 = time.time()
    ret = fmin_bfgs(func, initial_guess, fprime=func_prime, args=(x_y, C),
                    maxiter=1, retall=True, full_output=True)
    t1 = time.time()

    # Saving the optimized parameters
    with open("best_Weights_tampered", "wb") as f: # best iterations is stored in solution.txt in appt folder
        pickle.dump(ret, f)
    np.savetxt("best_Weights_tampered", ret[0]) # best_weights_c1, best_weights_c10 were saved from here similarly

    print(f"Time: {t1-t0}")


get_params(X_y, 1000) # change value of C from 1, 10, 100, 1000 for different evaluations for q3 plot


Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Shapes of model data:
W: (26, 128) T: (26, 26)
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -24.594932941033264
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -21.034947183274976
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -24.894957407087823
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -19.926267654347203
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -22.0480533547116
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -19.744879244259035
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -20.82348561980722
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -19.76141427804329
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -19.741730524495303
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -19.74918831694408
Provide the value of 1/ n∑nt=1 log p(yt|Xt) : -19.74147044115

In [12]:
# Define the path to your test.txt file
# Note: Uncomment and replace with your actual file path
# test_data_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/data/test.txt"  

# Load parameters from the file specified in parameter_path
f = numpy.loadtxt(parameter_path)

# Split the loaded parameters into W and T matrices
W, T = f[:26*128].reshape((26, 128)), f[26*128:].reshape((26, 26))

# Read test data for decoding (modified function)
X = read_test_decoder_modified(test_data_path)

# Find the most probable sequence using the Viterbi algorithm
y_star = max_sum(X, W, T)

# Import lowercase alphabet for mapping output indices to letters
from string import ascii_lowercase
# Create a dictionary mapping integers to lowercase letters
mapping = dict(enumerate(ascii_lowercase))

# Print the decoded sequence
for i in range(y_star.shape[0]):
    # Print the corresponding letter for each output index
    print(y_star[i]+1)  # Adding 1 to convert 0-indexed to 1-indexed


Shape of test data for decoder: (26198, 128)
Decoded sequence: [24 11 14 15  7 14 13  2 14 22 14 17 10  0  1 11  4  2  2 14 20 13 19  0
  1  8 11  8 19  8 13  8 11 24  4 17  0 11 24 11  4  2 14 12 15 17  4 18
 18  7 24  6  8 13 18  4 13  6 17  0 15  7 24 17  8 18  7  0  5 20 25 25
 25  0 17  4  4 17 14 13  3 20 18 19 17  8  0 11  8 25  4  3 10  8  8 13
  6 12  1 17  0  2  4 18  4  9 20 19  8 13  4 17  8 13  6 14 12 15 11  4
 23 20 12 15 17  8 11  8  3  3 20 11 11 24 14 12 12  0 13  3  8 13  6  6
  8  2 11 14  9  4  2 19  8 14 13 18 13  2 14 12 15 14 17 12  0  1 11 24
 11  0  3  8 13  6  4  4  3  3  4 17  0 11 24 11  8 13  6 14 17 12  0 11
  8 25  0 19  8 14 13 14 22  0 11  4 12  0 13 19  4 13 14 20 13  2  8 13
  6 22  0 10  4 17  8 13 14 13  2 14 12 12  0 13  3  8 13  6 13  8 14 15
  7 14 13 19  4 17 14 17  0 11  0 17  8 25  8 13  2 14 13 18  4 16 20  4
 13 19  8  0 19  7  0  3 14 22  0  1 20 11 14 20 18 11 24 17 14  9  4  0
 19  8 14 13 18 19  4 17  4 17  0 19  4 17 12  0 23 15 17 14 

In [13]:
import numpy

# Load parameters from the file specified in solution_path
f = numpy.loadtxt(solution_path)

# Split the loaded parameters into W and T matrices
W, T = f[:26*128].reshape((26, 128)), f[26*128:].reshape((26, 26))

# Print the elements of matrix W
for i in range(26):
    for j in range(128):
        # Print the value of each element in W
        print(W[i, j])

# Print the elements of matrix T
for i in range(26):
    for j in range(26):
        # Print the value of each element in T
        print(T[j, i])


-0.37715325660970006
-0.4640010087166905
0.36710565320023497
0.2962035896467515
0.38148741306783523
-0.2528992694046261
0.13119585802511494
-0.24143166210626335
-0.2599859143140288
0.36803576931027426
0.3488406886283731
0.21670609926593198
0.19649993073823147
0.1775542398158361
0.3472056824159796
0.08243289190402978
-0.4391312746803779
-0.01591521996303192
-0.07055446359780375
0.20893270551353885
0.18473930919491152
0.1413728863784083
-0.0767304469810371
0.23623401021112989
-0.3256785295083575
0.006452721142909051
0.164322747932267
0.2721206353169367
0.327358805239312
0.2393871085451655
0.12725391014237092
-0.122691001224319
-0.5936059366877564
0.2853926887666524
0.10757065362602301
0.233123435013137
0.3813313709375376
-0.031363032692504095
0.11369312550514413
0.026835516905808134
-0.50834863118473
0.21275240851148194
0.30335660586442326
0.16141188445657217
0.1384448482601517
0.10636403154154092
0.036121472239292095
-0.029507931766845172
-0.29951664205787126
-0.18090446180728212
0.2287