In [4]:
def objective_func(params, X, y, C):
    W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
    log_likelihood_sum = 0
    for i in range(len(X)):
        log_likelihood_sum += compute_log_likelihood(X[i], y[i], W, T)
    norm_W = np.linalg.norm(W, axis=1)**2
    norm_T = np.linalg.norm(T)**2
    regularization_term = 0.5 * (np.sum(norm_W) + np.sum(norm_T))
    return -C * log_likelihood_sum / len(X) + regularization_term

def gradient_func(params, X, y, C):
    W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
    grad_W = np.zeros_like(W)
    grad_T = np.zeros_like(T)
    for i in range(len(X)):
        grad_W += compute_grad_W(X[i], y[i], W, T)
        grad_T += compute_grad_T(X[i], y[i], W, T)
    grad_W = -C * grad_W / len(X) + W
    grad_T = -C * grad_T / len(X) + T
    return np.concatenate([grad_W.flatten(), grad_T.flatten()])

def stochastic_train(X_train, y_train, X_test, y_test, method='sgd', **kwargs):
    params = np.zeros((26*128 + 26*26))
    effective_passes = []
    train_objectives = []
    test_errors = []
    for i in range(num_iterations):
        batch_indices = np.random.choice(len(X_train), size=batch_size, replace=False)
        X_batch, y_batch = X_train[batch_indices], y_train[batch_indices]
        if method == 'sgd':
            params -= learning_rate * gradient_func(params, X_batch, y_batch, C)
        elif method == 'momentum':
            # Similar to sgd, but includes momentum update
        train_objectives.append(objective_func(params, X_train, y_train, C))
        test_errors.append(compute_test_error(params, X_test, y_test))
        effective_passes.append(i * batch_size / len(X_train))
    return effective_passes, train_objectives, test_errors


import matplotlib.pyplot as plt

def plot_results(effective_passes, train_objectives, test_errors, title):
    plt.figure(figsize=(10, 6))
    plt.subplot(1, 2, 1)
    plt.plot(effective_passes, train_objectives, label='Training Objective')
    plt.xlabel('Effective Passes')
    plt.ylabel('Objective Value')
    plt.title('Training Objective Over Time')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(effective_passes, test_errors, label='Test Error')
    plt.xlabel('Effective Passes')
    plt.ylabel('Test Error')
    plt.title('Test Error Over Time')
    plt.legend()

    plt.suptitle(title)
    plt.show()


IndentationError: expected an indented block after 'elif' statement on line 32 (166539856.py, line 34)

In [42]:
def compute_log_likelihood(X, y, W, T):
    """
    Compute the log likelihood for a given sequence and its labels.
    """
    log_likelihood = 0.0
    for i in range(len(y) - 1):
        feature_vector = X[i]
        transition_score = T[y[i], y[i+1]]
        emission_score = np.dot(W[y[i]], feature_vector)
        log_likelihood += (emission_score + transition_score)
    # Subtract log partition function (Z) for normalization
    # Assuming a function compute_log_z to compute log(Z)
    log_z = compute_log_z(X, W, T)
    log_likelihood -= log_z
    return log_likelihood


def compute_grad_W(X, y, W, T):
    """
    Compute the gradient of the log likelihood with respect to W.
    """
    grad_W = np.zeros_like(W)
    for i in range(len(y)):
        feature_vector = X[i]
        grad_W[y[i]] += feature_vector
        # Subtract expected feature counts
        # Assuming a function compute_expected_features that computes expected feature counts
        expected_features = compute_expected_features(X, W, T)
        grad_W -= expected_features
    return grad_W

def compute_grad_T(X, y, W, T):
    num_states = T.shape[0]
    grad_T = np.zeros_like(T)
    
    for sequence, labels in zip(X, y):
        seq_len = len(labels)
        for i in range(seq_len - 1):
            current_label = labels[i]
            next_label = labels[i + 1]
            grad_T[current_label, next_label] += 1
        
        # Here, add logic to compute the expected transitions based on the sequence
        # This might involve calling `compute_expected_transitions` or equivalent logic
        # Ensure this function accounts for the variable length of `sequence`
        expected_transitions = compute_expected_transitions([sequence], W, T)
        grad_T -= expected_transitions  # Adjust this line as necessary based on your implementation
    
    # Normalize or adjust grad_T as required for your model
    return grad_T


def compute_test_error(params, X_test, y_test):
    """
    Compute test error given the current parameters.
    """
    W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
    errors = 0
    total = 0
    for i in range(len(X_test)):
        # Assuming a function predict_labels to predict labels for a sequence
        predicted_labels = predict_labels(X_test[i], W, T)
        errors += np.sum(predicted_labels != y_test[i])
        total += len(y_test[i])
    return errors / total

def compute_expected_transitions(X, W, T):
    """
    Compute the expected number of transitions between states for the given data X.
    
    Args:
    - X: The input sequences, a list of 2D numpy arrays.
    - W: The weight matrix for the feature functions.
    - T: The transition matrix.
    
    Returns:
    - expected_transitions: A matrix of expected transition counts.
    """
    num_states = T.shape[0]  # Number of states
    expected_transitions = np.zeros_like(T)  # Initialize expected transitions matrix

    for sequence in X:  # Loop through each sequence
        # Initialize forward and backward matrices
        alpha = forward_pass(sequence, W, T)
        beta = backward_pass(sequence, W, T)

        # Compute the expected transitions for the current sequence
        for i in range(len(sequence) - 1):
            for s1 in range(num_states):
                for s2 in range(num_states):
                    # Probability of transitioning from state s1 to s2 at position i
                    transition_prob = np.exp(W[s2] @ sequence[i + 1] + T[s1, s2])
                    # Compute the expected count for this transition
                    expected_count = alpha[i, s1] * transition_prob * beta[i + 1, s2]
                    expected_transitions[s1, s2] += expected_count
    
    # Normalize the expected transitions to get probabilities
    expected_transitions /= np.sum(expected_transitions)

    return expected_transitions


def sgd(params, grad_func, X_train, y_train, C, learning_rate, epochs, batch_size):
    params_history = [params.copy()]
    for epoch in range(epochs):
        # Shuffle the data at the start of each epoch
        shuffled_indices = np.random.permutation(len(X_train))
        for start_idx in range(0, len(X_train), batch_size):
            # Indices for the current batch
            batch_indices = shuffled_indices[start_idx:start_idx+batch_size]

            # Since X_train and y_train are lists of arrays, we need to select the batches differently
            X_batch = [X_train[i] for i in batch_indices]
            y_batch = [y_train[i] for i in batch_indices]

            # Here you need to adjust how you calculate the gradient
            # This might involve iterating through X_batch and y_batch
            # Or altering grad_func to accept lists of sequences
            gradient = grad_func(params, X_batch, y_batch, C)

            # Update parameters
            params -= learning_rate * gradient
        
        params_history.append(params.copy())
    
    return params, params_history


def sgd_with_momentum(params, grad_func, X_train, y_train, C, learning_rate, momentum_coeff, epochs, batch_size):
    # Initialize velocity vector for momentum
    velocity = np.zeros_like(params)
    params_history = [params.copy()]
    
    for epoch in range(epochs):
        shuffled_indices = np.random.permutation(len(X_train))
        for start_idx in range(0, len(X_train), batch_size):
            batch_indices = shuffled_indices[start_idx:start_idx+batch_size]
            X_batch = [X_train[i] for i in batch_indices]
            y_batch = [y_train[i] for i in batch_indices]
            
            gradient = grad_func(params, X_batch, y_batch, C)
            
            # Update the velocity and parameters
            velocity = momentum_coeff * velocity - learning_rate * gradient
            params += velocity
        
        params_history.append(params.copy())

    return params, params_history

In [43]:
def read_train(file_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/train.txt"):
    """
    Reads the train data from the file.
    Each row corresponds to an example and is split into the label and the feature vector.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:  # End of sequence
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []  # Reset for the next sequence

    print("Number of training sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])
    
    return list(zip(dataX, dataY))

import numpy as np

def read_test(file_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/test.txt"):
    """
    Reads the test data from the file.
    Each row corresponds to an example and is split into the label and the feature vector.
    The function assumes that each example ends when a row with the third column less than 0 is encountered.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:  # Skip the last empty line if it exists
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:  # Check for the end of a sequence
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []  # Reset for the next sequence

    print("Number of test sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])

    return list(zip(dataX, dataY))

In [44]:
# Assuming read_train and read_test are already defined and loaded

# Call the functions
train_data = read_train()
test_data = read_test()

# Unpack the feature matrices and labels vectors from the list of tuples
X_train = [x for x, _ in train_data]
y_train = [y for _, y in train_data]

X_test = [x for x, _ in test_data]
y_test = [y for _, y in test_data]


Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Number of test sequences: 3439
First 5 sequences' labels:
 [array([24, 11, 14, 15,  7, 14, 13,  4]), array([13, 22, 14, 17, 10,  0,  1, 11,  4]), array([ 2,  2, 14, 20, 13, 19,  0,  1,  8, 11,  8, 19, 24]), array([17,  8,  6,  7, 19,  5, 20, 11, 11, 24]), array([ 4,  2, 14, 12, 15, 17,  4, 18, 18])]


In [45]:
num_labels = 26  # Number of labels
feature_size = 128  # Assuming each input feature vector has a dimension of 128
W_init = np.zeros((num_labels, feature_size))
T_init = np.zeros((num_labels, num_labels))
params_init = np.concatenate([W_init.flatten(), T_init.flatten()])


In [46]:
def gradient_func(params, X_batch, y_batch, C):
    W = params[:num_labels*feature_size].reshape((num_labels, feature_size))
    T = params[num_labels*feature_size:].reshape((num_labels, num_labels))
    grad_W = compute_grad_W(X_batch, y_batch, W, T)
    grad_T = compute_grad_T(X_batch, y_batch, W, T)
    return np.concatenate([grad_W.flatten(), grad_T.flatten()])

def compute_expected_features(X, W, T):
    """
    Computes expected feature counts for a given dataset X, weights W, and transition matrix T.
    This function is a placeholder and needs to be implemented according to the specific model.
    """
    # Placeholder for the expected features calculation
    expected_features = np.zeros_like(W)
    
    # Assuming here that each X[i] is a sequence of feature vectors
    for sequence in X:
        # Here you would compute the expected features for the sequence
        # This might involve forward-backward calculations for sequence models like HMMs or CRFs
        # For simplicity, let's assume we have a function that does this:
        sequence_expected_features = forward_backward(sequence, W, T)
        
        # Aggregate the expected features from each sequence
        expected_features += sequence_expected_features
        
    return expected_features

def forward_backward(sequence, W, T):
    # Initialize forward and backward matrices
    forward = np.zeros((len(sequence), len(W)))
    backward = np.zeros((len(sequence), len(W)))

    # Initialize expected feature counts
    expected_features = np.zeros_like(W)
    
    # Forward pass to fill the forward matrix
    forward[0] = initialize_forward(sequence, W)  # This needs to be defined based on your model
    for i in range(1, len(sequence)):
        for current_state in range(len(W)):
            for previous_state in range(len(W)):
                # Update forward[i][current_state] with the transition probability from previous_state to current_state
                # and the emission probability of sequence[i] given current_state
                pass  # The actual logic involves dynamic programming updates

    # Backward pass to fill the backward matrix
    backward[-1] = 1  # Typically initialized to 1 for all states
    for i in range(len(sequence) - 2, -1, -1):
        for current_state in range(len(W)):
            for next_state in range(len(W)):
                # Update backward[i][current_state] similar to the forward pass but in reverse
                pass  # The actual logic involves dynamic programming updates

    # Compute the expected features using forward and backward matrices
    for i in range(len(sequence)):
        for state in range(len(W)):
            # Update expected_features based on the forward and backward probabilities
            # and the feature vector of sequence[i]
            pass  # Incorporate logic to accumulate expected feature counts

    return expected_features

def initialize_forward(sequence, W):
    """
    Initialize the forward matrix for the first position in the sequence.
    
    Args:
    - sequence: The input sequence for which forward probabilities are computed.
    - W: The weight matrix of shape (num_states, num_features) for the CRF model.
    
    Returns:
    - forward_initial: The initialized forward probabilities for the first position.
    """
    num_states = W.shape[0]  # Assuming W's shape is (num_states, num_features)
    
    # Option 1: Uniform probabilities for the first position
    forward_initial = np.ones(num_states) / num_states
    
    return forward_initial



In [47]:
C = 1000  # The best value found in the previous section
learning_rate = 0.01  # Hyperparameter, tune accordingly
momentum_coeff = 0.9  # Momentum coefficient, tune accordingly
batch_size = 32  # Batch size for stochastic mini-batch
epochs = 100  # Number of epochs

# Run SGD
params_sgd = sgd(params_init, gradient_func, X_train, y_train, C, learning_rate, epochs, batch_size)

# Run SGD with Momentum
params_sgd_momentum = sgd_with_momentum(params_init, gradient_func, X_train, y_train, C, learning_rate, momentum_coeff, epochs, batch_size)


NameError: name 'forward_pass' is not defined

In [1]:
test_error_sgd = compute_test_error(params_sgd, X_test, y_test)
test_error_sgd_momentum = compute_test_error(params_sgd_momentum, X_test, y_test)
print(f"Test Error (SGD): {test_error_sgd}")
print(f"Test Error (SGD with Momentum): {test_error_sgd_momentum}")


In [2]:
import matplotlib.pyplot as plt

# Function to plot the decline of training objective values
def plot_training_objective(train_objectives, labels, title='Training Objective Decline'):
    plt.figure(figsize=(10, 6))
    for objectives, label in zip(train_objectives, labels):
        plt.plot(objectives, label=label)
    plt.xlabel('Effective Number of Passes')
    plt.ylabel('Training Objective Value')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

# Function to plot test errors
def plot_test_errors(test_errors, labels, title='Test Error'):
    plt.figure(figsize=(10, 6))
    for errors, label in zip(test_errors, labels):
        plt.plot(errors, label=label)
    plt.xlabel('Effective Number of Passes')
    plt.ylabel('Word-wise Test Error (%)')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

# Combine data for plotting
train_objectives = [train_objective_values_sgd, train_objective_values_sgd_momentum, train_objective_values_lbfgs]
test_errors = [test_errors_sgd, test_errors_sgd_momentum, test_errors_lbfgs]
labels = ['SGD', 'SGD with Momentum', 'L-BFGS']

# Plot
plot_training_objective(train_objectives, labels)
plot_test_errors(test_errors, labels)


# from here


In [3]:
import numpy as np
import os

# Define the paths for the train and test data files
train_data_path = "path_to_your_train_data.txt"  # Update this path
test_data_path = "path_to_your_test_data.txt"  # Update this path

# Adaptation of read_train function to load training data
def read_train(train_data_path):
    mapping = {letter: index for index, letter in enumerate("abcdefghijklmnopqrstuvwxyz")}
    with open(train_data_path, "r") as f:
        raw_data = f.read().split("\n")
    
    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []
    
    return zip(dataX, dataY)

# Adaptation of read_train function to load test data
def read_test(test_data_path):
    mapping = {letter: index for index, letter in enumerate("abcdefghijklmnopqrstuvwxyz")}
    with open(test_data_path, "r") as f:
        raw_data = f.read().split("\n")
    
    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []
    
    return zip(dataX, dataY)

# Now let's load the train and test data
train_dataset = read_train(train_data_path)
test_dataset = read_test(test_data_path)

# Implement the callback function to track the progress during optimization
function_evaluations = 0
def callback(params, *args):
    global function_evaluations
    function_evaluations += 1
    
    # Unpack arguments (assuming test dataset and possibly other args are passed)
    X_test, y_test = args[:2]
    
    # Implement or call your test error function here
    test_error = compute_test_error(params, X_test, y_test)
    
    # Implement or call your training objective function here
    training_objective = compute_training_objective(params, X_train, y_train)
    
    # Print out the current progress
    print(f"Iteration {function_evaluations}: Test Error = {test_error}, Training Objective = {training_objective}")
    
    # Optional: Store metrics for later analysis and plotting
    test_errors.append(test_error)
    train_objectives.append(training_objective)

# Initialize lists to store the progress
test_errors = []
train_objectives = []

# Note: You would also need to implement or provide the `compute_test_error` and `compute_training_objective` functions.
# Replace "X_train, y_train" with the actual variables containing your training data.


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_train_data.txt'

In [49]:
def read_model(file_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/model.txt"):
    """
    Reads the model data from the file.
    The data consists of weight vectors for each label and a transition matrix T.
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    W = np.array(raw_data[:26 * 128], dtype=float).reshape(26, 128)
    T = np.array(raw_data[26 * 128:-1], dtype=float).reshape(26, 26)
    T = np.swapaxes(T, 0, 1)
    
    print("Shapes of model data:")
    print("W:", W.shape, "T:", T.shape)
    #print("Top 5 rows of W:\n", W[:5])
    
    return W, T

In [65]:
import numpy, data_read, prob_grad
from scipy.optimize import fmin_bfgs
import time
import pickle


X_y = read_train()
W, T = read_model()

import numpy as np

def func(params, *args):
    """
    Objective function specified in the handout.

    Parameters:
    - params: Array containing model parameters (W and T)
    - args: Additional arguments (data and regularization parameter C)

    Returns:
    - Value of the objective function
    """
    # Unpack model parameters
    W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
    data = args[0]  # Training data
    C = args[1]     # Regularization parameter

    # Compute the log sum
    log_sum = 0
    for example in data:
        log_sum += prob_grad.compute_log_p(example[0], example[1], W, T)

    # Compute the L2 norm of each row of W
    norm = np.linalg.norm(W, axis=1) ** 2

    # Compute the objective function value
    objective_value = -1 * (C / len(data)) * log_sum + 0.5 * np.sum(norm) + 0.5 * np.sum(np.square(T))

    return objective_value

def func_prime(params, *args):
    """
    Derivative of the objective function specified in the handout.

    Parameters:
    - params: Array containing model parameters (W and T)
    - args: Additional arguments (data and regularization parameter C)

    Returns:
    - Gradient of the objective function
    """
    # Unpack model parameters
    W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
    data = args[0]  # Training data
    C = args[1]     # Regularization parameter

    # Initialize gradients
    log_grad_w = np.zeros((26, 128))
    log_grad_t = np.zeros((26, 26))

    # Compute the gradient of logP w.r.t. W and T
    for example in data:
        log_grad_w += prob_grad.log_p_wgrad(W, example[0], example[1], T)
        log_grad_t += prob_grad.log_p_tgrad(T, example[0], example[1], W)

    # Multiply by -C/N
    log_grad_w *= -1 * C / len(data)
    log_grad_t *= -1 * C / len(data)

    # Add gradient of norm
    log_grad_w += W

    # Add normalizing factor
    log_grad_t += T

    # Flatten and concatenate gradients
    gradient = np.concatenate([log_grad_w.reshape(26*128), log_grad_t.reshape(26*26)])

    return gradient


on = numpy.concatenate([W.reshape(26*128), T.reshape(26*26)])

res = func(on, X_y[:9], 1000)
result = func_prime(on, X_y[:9], 1000)

#need to flatten for the optimizer
initial_guess = numpy.zeros((26*128+26*26))

bounds = [(-10000000, 10000000)]*(28*128+26*26)


ret = fmin_bfgs(func, initial_guess, fprime=func_prime, args=(X_y[:5], 1000), maxiter=2, retall=True, full_output=True)

def get_params(x_y):
    t0 = time.time()
    ret=fmin_bfgs(func, initial_guess, fprime=func_prime, args=(x_y,1000), maxiter=1,retall=True, full_output=True)
    t1 = time.time()
    with open("best_Weights_tampered","+bw") as f :
        pickle.dump(ret,f)
    numpy.savetxt("best_Weights_tampered",ret[0])
    #numpy.savetxt("best_func_c_10",ret[1])
    
    print(f"Time: {t1-t0}")
    

Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Shapes of model data:
W: (26, 128) T: (26, 26)
         Current function value: 6331.055729
         Iterations: 2
         Function evaluations: 3
         Gradient evaluations: 3


In [89]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from scipy.optimize import fmin_bfgs
import prob_grad
import data_read

def sgd_step(params, X_batch, y_batch, C, learning_rate):
    # We need to pack X_batch and y_batch into data because func_prime expects it that way
    data = (X_batch, y_batch)
    gradient = func_prime(params, data, C)
    params -= learning_rate * gradient
    return params

def sgd_momentum_step(params, velocity, X_batch, y_batch, C, learning_rate, momentum_coeff):
    # Similar packing of X_batch and y_batch into data
    data = (X_batch, y_batch)
    gradient = func_prime(params, data, C)
    velocity = momentum_coeff * velocity - learning_rate * gradient
    params += velocity
    return params, velocity


# Implementation of the error computation function for a CRF
def compute_error(W, T, x_seq, y_seq):
    # Assuming that W and T are your model's parameters
    # x_seq is a single sequence of observations
    # y_seq is the corresponding sequence of true labels
    # We will calculate error as the number of incorrect label predictions
    error_count = 0
    # Assume you have a function that can give you the predicted sequence based on W, T, and x_seq
    predicted_seq = prob_grad.predict_sequence(W, T, x_seq)
    for true_label, predicted_label in zip(y_seq, predicted_seq):
        if true_label != predicted_label:
            error_count += 1
    return error_count

def compute_test_error(W, T, X_test, y_test):
    total_errors = 0
    total_labels = 0
    for x_seq, y_seq in zip(X_test, y_test):
        total_errors += compute_error(W, T, x_seq, y_seq)
        total_labels += len(y_seq)
    return total_errors / total_labels  # return the error rate

# You can use similar logic to compute the training objective in a loop for each sequence.
# For example:
def compute_training_objective(W, T, X_train, y_train, C):
    # Assuming that compute_log_p gives log probability for a sequence
    total_log_p = 0
    for x_seq, y_seq in zip(X_train, y_train):
        total_log_p += prob_grad.compute_log_p(W, T, x_seq, y_seq)
    # Add the regularization term (assuming L2 regularization)
    reg_term = (C / len(X_train)) * (np.sum(W**2) + np.sum(T**2))
    return -total_log_p / len(X_train) + reg_term

def run_sgd(X_train, y_train, X_test, y_test, C, num_iterations, batch_size, learning_rate, momentum_coeff, use_momentum=False):
    global params, velocity
    train_objectives = []
    test_errors = []

    # Initialize params and velocity if not already initialized
    if params.size == 0:
        params = np.zeros((26*128+26*26,))
    if velocity.size == 0:
        velocity = np.zeros((26*128+26*26,))

    for iteration in range(num_iterations):
        # Randomly sample indices for this iteration
        indices = np.random.choice(len(X_train), size=batch_size, replace=False)
        data_batch = [(X_train[i], y_train[i]) for i in indices]  # This is a list of tuples

        # Perform a single SGD step
        if use_momentum:
            params, velocity = sgd_momentum_step(params, velocity, data_batch, C, learning_rate, momentum_coeff)
        else:
            params = sgd_step(params, data_batch, C, learning_rate)

        # Compute training objective and test error after this step
        W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
        train_objective = compute_training_objective(W, T, X_train, y_train, C)
        test_error = compute_test_error(W, T, X_test, y_test)

        train_objectives.append(train_objective)
        test_errors.append(test_error)

    return train_objectives, test_errors



In [90]:
# For example:
C = 1000  # Replace with the actual value
learning_rate = 0.01  # Replace with the actual value
momentum_coeff = 0.9  # Replace with the actual value
batch_size = 10  # Replace with the actual value
num_iterations = 100  # Replace with the actual value


# Assuming that read_train and read_model are available
X_y = read_train()
W, T = read_model()
data = X_y

# Assuming parameter_shape is 26*128+26*26 based on your func_prime implementation
params = np.concatenate([W.flatten(), T.flatten()])  # Concatenate W and T into a single parameter vector
velocity = np.zeros_like(params)
velocity = np.zeros(parameter_shape)

# Call the functions
train_data = read_train()
test_data = read_test()

# Unpack the feature matrices and labels vectors from the list of tuples
X_train = [x for x, _ in train_data]
y_train = [y for _, y in train_data]

X_test = [x for x, _ in test_data]
y_test = [y for _, y in test_data]


Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Shapes of model data:
W: (26, 128) T: (26, 26)
Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Number of test sequences: 3439
First 5 sequences' labels:
 [array([24, 11, 14, 15,  7, 14, 13,  4]), array([13, 22, 14, 17, 10,  0,  1, 11,  4]), array([ 2,  2, 14, 20, 13, 19,  0,  1,  8, 11,  8, 19, 24]), array([17,  8,  6,  7, 19,  5, 20, 11, 11, 24]), array([ 4,  2, 14, 12, 15, 17,  4, 18, 18])]


In [92]:
# Run SGD with the updated compute_test_error function
train_objectives, test_errors = run_sgd(X_train, y_train, X_test, y_test, C, num_iterations, batch_size, learning_rate, momentum_coeff, use_momentum=False)

TypeError: sgd_step() missing 1 required positional argument: 'learning_rate'

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Assuming compute_training_objective and compute_test_error are available
# Assuming X_train, y_train, X_test, y_test are loaded and available
# Assuming parameter_shape is known and appropriate for the CRF model
# Assuming gradient_func and objective_func are implemented for the CRF model
# The learning_rate, momentum_coeff, batch_size, num_iterations, n, and C should be appropriately set

# Initialize parameters for the CRF model
params = np.zeros(parameter_shape)
velocity = np.zeros(parameter_shape)

# Initialize lists to store the progress of training
train_objectives_sgd = []
train_objectives_momentum = []
test_errors_sgd = []
test_errors_momentum = []

# Function for stochastic gradient descent with and without momentum
def run_sgd(X_train, y_train, X_test, y_test, use_momentum=False):
    global params, velocity
    train_objectives = []
    test_errors = []
    
    for iteration in range(num_iterations):
        # Stochastic mini-batch sampling
        indices = np.random.choice(len(X_train), size=batch_size, replace=False)
        X_batch, y_batch = X_train[indices], y_train[indices]

        # Compute the gradient using the sampled mini-batch
        gradient = gradient_func(params, X_batch, y_batch, C)
        
        # Update parameters
        if use_momentum:
            # Update rule for SGD with momentum
            velocity = momentum_coeff * velocity - learning_rate * gradient
            params += velocity
        else:
            # Update rule for simple SGD
            params -= learning_rate * gradient
        
        # Compute training objective value and test error
        train_objective = objective_func(params, X_train, y_train, C)
        test_error = compute_test_error(params, X_test, y_test)
        
        # Store the computed values
        train_objectives.append(train_objective)
        test_errors.append(test_error)
    
    return train_objectives, test_errors

# Run SGD and SGD with momentum
train_objectives_sgd, test_errors_sgd = run_sgd(X_train, y_train, X_test, y_test, use_momentum=False)
train_objectives_momentum, test_errors_momentum = run_sgd(X_train, y_train, X_test, y_test, use_momentum=True)





In [None]:
# Plot training objective decline
plt.figure(figsize=(10, 5))
plt.plot(np.arange(num_iterations) * batch_size / n, train_objectives_sgd, label='SGD Training Objective')
plt.plot(np.arange(num_iterations) * batch_size / n, train_objectives_momentum, label='SGD with Momentum Training Objective')
plt.xlabel('Effective number of passes')
plt.ylabel('Training Objective Value')
plt.legend()
plt.show()

# Plot test error decline
plt.figure(figsize=(10, 5))
plt.plot(np.arange(num_iterations) * batch_size / n, test_errors_sgd, label='SGD Test Error')
plt.plot(np.arange(num_iterations) * batch_size / n, test_errors_momentum, label='SGD with Momentum Test Error')
plt.xlabel('Effective number of passes')
plt.ylabel('Word-wise Test Error')
plt.legend()
plt.show()


# For LBFGS, you would use scipy.optimize.fmin_tnc or similar functions
# You would also implement a callback function similar to the one described earlier
# The callback would need to track function evaluations, test errors, and training objectives
# The plots for LBFGS would need to be added to the above plots for comparison