In [3]:
import numpy as np

# Define file paths for ease of access
decode_input_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/decode_input.txt"
train_struct_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/train_struct.txt"
model_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/model.txt"
train_data_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/train.txt"
test_data_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/test.txt"

def read_decode_input(file_path):
    """
    Reads the decode_input data from the file.
    Each line represents one letter with 128 elements.
    There are 26 weight vectors each with 128 elements and a transition matrix T with size 26x26.
    The transition matrix T is in row-major order.
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    X = np.array(raw_data[:100 * 128], dtype=float).reshape(100, 128)
    W = np.array(raw_data[100 * 128:100 * 128 + 26 * 128], dtype=float).reshape(26, 128)
    T = np.array(raw_data[100 * 128 + 26 * 128:-1], dtype=float).reshape(26, 26)
    T = np.swapaxes(T, 0, 1)
    
    print("Shapes of decode input:")
    print("X:", X.shape, "W:", W.shape, "T:", T.shape)
    #print("Top 5 rows of X:\n", X[:5])

    return X, W, T

def read_train_struct(file_path):
    """
    Reads the train_struct data from the file.
    Each line represents a label and a feature vector (in a sparse representation).
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    for line in raw_data[:-1]:  # The last element is empty
        line = line.split(" ")
        dataY.append([int(line[0]) - 1, int(line[1][4:])])
        datax = np.zeros(128, dtype=int)
        for f1 in line[2:]:
            idx, val = f1.split(":")
            datax[int(idx) - 1] = int(val)
        dataX.append(datax)
    
    dataX_np = np.array(dataX, dtype=int)
    print("Shapes of train_struct:")
    print("dataX:", dataX_np.shape, "dataY length:", len(dataY))
    #print("Top 5 rows of dataX:\n", dataX_np[:5])
    
    return dataX_np, dataY

def read_model(file_path):
    """
    Reads the model data from the file.
    The data consists of weight vectors for each label and a transition matrix T.
    """
    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    W = np.array(raw_data[:26 * 128], dtype=float).reshape(26, 128)
    T = np.array(raw_data[26 * 128:-1], dtype=float).reshape(26, 26)
    T = np.swapaxes(T, 0, 1)
    
    print("Shapes of model data:")
    print("W:", W.shape, "T:", T.shape)
    #print("Top 5 rows of W:\n", W[:5])
    
    return W, T

def read_train(file_path):
    """
    Reads the train data from the file.
    Each row corresponds to an example and is split into the label and the feature vector.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:  # End of sequence
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []  # Reset for the next sequence

    print("Number of training sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])
    
    return list(zip(dataX, dataY))

import numpy as np

def read_test(file_path):
    """
    Reads the test data from the file.
    Each row corresponds to an example and is split into the label and the feature vector.
    The function assumes that each example ends when a row with the third column less than 0 is encountered.
    """
    from string import ascii_lowercase
    mapping = {letter: idx for idx, letter in enumerate(ascii_lowercase)}

    with open(file_path, "r") as f:
        raw_data = f.read().split("\n")

    dataX, dataY = [], []
    tempX, tempY = [], []
    for row in raw_data[:-1]:  # Skip the last empty line if it exists
        row = row.split(" ")
        tempY.append(mapping[row[1]])
        tempX.append(np.array(row[5:], dtype=float))
        if int(row[2]) < 0:  # Check for the end of a sequence
            dataX.append(np.array(tempX))
            dataY.append(np.array(tempY, dtype=int))
            tempX, tempY = [], []  # Reset for the next sequence

    print("Number of test sequences:", len(dataX))
    print("First 5 sequences' labels:\n", dataY[:5])

    return list(zip(dataX, dataY))

def read_test_decoder_modified(file_path):
    """
    Reads the test data for decoding and returns a NumPy array
    where each sub-array from the list becomes a row in the final
    two-dimensional array. This function only extracts the features
    and does not deal with the labels.
    """
    with open(file_path, 'r') as file:
        raw_data = file.read().strip().split('\n')

    # Initialize an empty list to store all feature vectors
    dataX = []
    
    for row in raw_data:
        if row:  # Skip any empty lines
            features = row.split(' ')[5:]  # Features start from the 6th element in the row
            feature_vector = list(map(float, features))  # Convert string features to float
            dataX.append(feature_vector)

    # Convert the list of lists (features for each word) into a 2D NumPy array
    dataX_np = np.array(dataX)

    print("Shape of test data for decoder:", dataX_np.shape)
    print("Top 5 feature vectors:\n", dataX_np[:5, :])

    return dataX_np


# Example:
X, W, T = read_decode_input(decode_input_path)
train_struct_X, train_struct_Y = read_train_struct(train_struct_path)
W_model, T_model = read_model(model_path)
train_data = read_train(train_data_path)
test_data = read_test(test_data_path)
test_data_decoder = read_test_decoder_modified(test_data_path)


Shapes of decode input:
X: (100, 128) W: (26, 128) T: (26, 26)
Shapes of train_struct:
dataX: (25953, 128) dataY length: 25953
Shapes of model data:
W: (26, 128) T: (26, 26)
Number of training sequences: 3438
First 5 sequences' labels:
 [array([ 0, 10,  4]), array([14, 12, 12,  0, 13,  3,  8, 13,  6]), array([ 4, 17, 14]), array([13,  4, 23, 15,  4,  2, 19,  4,  3]), array([ 4,  2, 11,  0, 17,  8, 13,  6])]
Number of test sequences: 3439
First 5 sequences' labels:
 [array([24, 11, 14, 15,  7, 14, 13,  4]), array([13, 22, 14, 17, 10,  0,  1, 11,  4]), array([ 2,  2, 14, 20, 13, 19,  0,  1,  8, 11,  8, 19, 24]), array([17,  8,  6,  7, 19,  5, 20, 11, 11, 24]), array([ 4,  2, 14, 12, 15, 17,  4, 18, 18])]
Shape of test data for decoder: (26198, 128)
Top 5 feature vectors:
 [[0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
  0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 

In [10]:
import os
import numpy as np


# Defining paths for the required files
path_decode_input = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/decode_input.txt"
path_train_struct = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/train_struct.txt"
path_train = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/train.txt"
path_test = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/test_struct.txt"

def read_parameters():
    """
    Reads the decode_input file and extracts X, W, and T parameters.
    X: Feature vectors
    W: Weight vectors
    T: Transition matrix
    """
    raw_data = np.loadtxt(path_decode_input, ndmin=1)
    
    # Splitting raw_data into respective parts for X, W, T
    X = np.array(raw_data[0:100*128]).reshape(100,128)
    W = np.array(raw_data[100*128:100*128+26*128]).reshape(26,128)
    T = np.array(raw_data[100*128+26*128:]).reshape(26,26)
    
    # Debug: Print shapes to ensure correct reshaping
    print(f"Shapes - X: {X.shape}, W: {W.shape}, T: {T.shape}")
    return X, W, T

def read_word_indexes():
    """
    Reads the train.txt file to extract the word indexes.
    """
    # Using np.loadtxt to only load specific column
    word_indexes = np.loadtxt(path_train, usecols=(2,))
    
    # Debug: Print first 5 word indexes to check correctness
    print("First 5 word indexes:", word_indexes[:5])
    return word_indexes

def read_train_struct():
    """
    Reads the train_struct.txt file to extract training data X and labels Y.
    """
    with open(path_train_struct, "r") as f:
        raw_data = f.read().split("\n")
    
    dataX, dataY = [], []
    for line in raw_data[:-1]:
        line = line.split(" ")
        dataY.append([int(line[0])-1]) # Assuming labels are to be decremented by 1
        datax = [0]*128
        for feature in line[2:]:
            index, _ = feature.split(":")
            datax[int(index)-1] = 1  # Set the corresponding feature to 1
        dataX.append(datax)
    
    dataX, dataY = np.array(dataX, dtype=float), np.array(dataY, dtype=int)
    
    # Debug: Print shapes and first few labels for verification
    print(f"dataX shape: {dataX.shape}, dataY shape: {dataY.shape}")
    print(f"First 5 labels: {dataY[:5]}")
    return dataX, dataY

def read_test_struct():
    """
    Reads the test_struct.txt file to extract test data X and labels Y.
    """
    with open(path_test, "r") as f:
        raw_data = f.read().split("\n")
    
    dataX, dataY = [], []
    for line in raw_data[:-1]:
        line = line.split(" ")
        dataY.append([int(line[0])-1])  # Assuming labels are to be decremented by 1
        datax = [0]*128
        for feature in line[2:]:
            index, _ = feature.split(":")
            datax[int(index)-1] = 1  # Set the corresponding feature to 1
        dataX.append(datax)
    
    dataX, dataY = np.array(dataX, dtype=float), np.array(dataY, dtype=int)
    
    # Debug: Print shapes and first few labels for verification
    print(f"dataX shape: {dataX.shape}, dataY shape: {dataY.shape}")
    print(f"First 5 labels: {dataY[:5]}")
    return dataX, dataY


In [11]:
# Usage for read_parameters
X, W, T = read_parameters()
#print("X (Feature vectors):", X[:5], "\n")  # Print first 5 rows for brevity
#print("W (Weight vectors):", W[:5], "\n")  # Print first 5 rows for brevity
#print("T (Transition matrix):", T, "\n")

# Usage for read_word_indexes
word_indexes = read_word_indexes()
print("Word indexes (First 10):", word_indexes[:10], "\n")  # Print first 10 word indexes

# Usage for read_train_struct
dataX_train, dataY_train = read_train_struct()
print("Training dataX (First 5 examples):", dataX_train[:5], "\n")  # Print first 5 examples for brevity
print("Training dataY (First 5 labels):", dataY_train[:5], "\n")  # Print first 5 labels

# Usage for read_test_struct
dataX_test, dataY_test = read_test_struct()
print("Test dataX (First 5 examples):", dataX_test[:5], "\n")  # Print first 5 examples for brevity
print("Test dataY (First 5 labels):", dataY_test[:5], "\n")  # Print first 5 labels


Shapes - X: (100, 128), W: (26, 128), T: (26, 26)
First 5 word indexes: [ 2.  3. -1.  2.  3.]
Word indexes (First 10): [ 2.  3. -1.  2.  3.  4.  5.  6.  7.  8.] 

dataX shape: (25953, 128), dataY shape: (25953, 1)
First 5 labels: [[ 0]
 [10]
 [ 4]
 [14]
 [12]]
Training dataX (First 5 examples): [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1.
  1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.
  1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
  1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0.
  0. 1. 1. 

In [43]:
import numpy, data_read, prob_grad
from scipy.optimize import fmin_bfgs
import time
import pickle
X_y = data_read.read_train()
W, T = data_read.read_model()

def func(params, *args):
#objective function specified in the handout
	W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
	data = args[0]
	C = args[1]

	log_sum = 0
	for example in data:
		log_sum += prob_grad.compute_log_p(example[0], example[1], W, T)

	norm = numpy.zeros(26)
	for i in range(26):
		norm[i] = numpy.linalg.norm(W[i])

	numpy.square(norm, out=norm)
	
	return -1*(C/len(data))*log_sum + 0.5*numpy.sum(norm) + 0.5*numpy.sum(numpy.square(T))

def func_prime(params, *args):
#derivative of objective function specified in the handout
	W, T = params[:26*128].reshape((26, 128)), params[26*128:].reshape((26, 26))
	data = args[0]
	C = args[1]

	log_grad_w = numpy.zeros((26, 128))
	log_grad_t = numpy.zeros((26, 26))

	#gradient of logP w/ W
	for example in data:
		numpy.add(log_grad_w, prob_grad.log_p_wgrad(W,\
			example[0], example[1], T), out=log_grad_w)
		numpy.add(log_grad_t, prob_grad.log_p_tgrad(T,\
			example[0], example[1], W), out=log_grad_t)

	#multiply C/N
	numpy.multiply(log_grad_w, -1*C/len(data), out=log_grad_w)
	numpy.multiply(log_grad_t, -1*C/len(data), out=log_grad_t)

	#add gradient of norm
	numpy.add(log_grad_w, W, out=log_grad_w)

	#add normalizing factor
	numpy.add(log_grad_t, T, out=log_grad_t)

	return numpy.concatenate([log_grad_w.reshape(26*128),\
		log_grad_t.reshape(26*26)])

#on = numpy.concatenate([W.reshape(26*128), T.reshape(26*26)])

#res = func(on, X_y[:9], 1000)
#result = func_prime(on, X_y[:9], 1000)

#need to flatten for the optimizer
initial_guess = numpy.zeros((26*128+26*26))

#bounds = [(-10000000, 10000000)]*(28*128+26*26)


#ret = fmin_bfgs(func, initial_guess, fprime=func_prime, args=(X_y[:5], 1000),\
#	maxiter=2, retall=True, full_output=True)

def get_params(x_y):
    t0 = time.time()
    ret=fmin_bfgs(func, initial_guess, fprime=func_prime, args=(x_y,100),\
    	 maxiter=20,retall=True, full_output=True)
    t1 = time.time()
    with open("best_Weights_c100","+bw") as f :
        pickle.dump(ret,f)
    numpy.savetxt("best_Weights_c100",ret[0])
    #numpy.savetxt("best_func_c_10",ret[1])
    
    print(f"Time: {t1-t0}")
    
get_params(X_y)

         Current function value: 758.875146
         Iterations: 20
         Function evaluations: 22
         Gradient evaluations: 22
Time: 144.85742807388306


In [34]:
def read_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    return lines

def calculate_accuracy(prediction_file, test_file):
    predictions = read_file(prediction_file)
    test_data = read_test(test_file)
    
    total_samples = len(predictions)
    correct_predictions = 0
    
    for pred_label, (test_X, test_Y) in zip(predictions, test_data):
        test_label = test_Y[0]  # Assuming test_Y contains only one label per sequence
        if pred_label == test_label:
            correct_predictions += 1
    
    accuracy = (correct_predictions / total_samples) * 100
    return accuracy

# Example usage:
prediction_file = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/result/prediction.txt"
test_file = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/test.txt"
accuracy = calculate_accuracy(prediction_file, test_file)
print(f"Accuracy: {accuracy:.2f}%")


Number of test sequences: 3439
First 5 sequences' labels:
 [array([24, 11, 14, 15,  7, 14, 13,  4]), array([13, 22, 14, 17, 10,  0,  1, 11,  4]), array([ 2,  2, 14, 20, 13, 19,  0,  1,  8, 11,  8, 19, 24]), array([17,  8,  6,  7, 19,  5, 20, 11, 11, 24]), array([ 4,  2, 14, 12, 15, 17,  4, 18, 18])]
Accuracy: 0.00%


In [39]:
import numpy as np

def calculate_accuracy(predictions_path, ground_truth_path):
    """
    Calculates the accuracy of the predictions.
    
    :param predictions_path: Path to the predictions file.
    :param ground_truth_path: Path to the ground truth labels file.
    :return: Accuracy of the predictions.
    """
    # Use the read_test function and only extract the ground truth labels part
    test_data = read_test(ground_truth_path)
    ground_truth_labels = [label for _, labels in test_data for label in labels]

    # Read predictions
    with open(predictions_path, 'r') as f:
        predictions = [int(line.strip()) for line in f.readlines()]

    # Ensure the total number of labels matches the number of predictions
    if len(predictions) != len(ground_truth_labels):
        raise ValueError("The number of predictions does not match the number of ground truth labels.")

    # Calculate accuracy
    correct_predictions = sum(p == gt for p, gt in zip(predictions, ground_truth_labels))
    accuracy = correct_predictions / len(ground_truth_labels)
    
    return accuracy

# Example usage
predictions_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/result/prediction.txt"
ground_truth_path = "C:/Users/prana/Desktop/Fall 23/Adv ML/LAB1/CRF-master/data/test.txt"
accuracy = calculate_accuracy(predictions_path, ground_truth_path)
print(f"Accuracy: {accuracy}")


Number of test sequences: 3439
First 5 sequences' labels:
 [array([24, 11, 14, 15,  7, 14, 13,  4]), array([13, 22, 14, 17, 10,  0,  1, 11,  4]), array([ 2,  2, 14, 20, 13, 19,  0,  1,  8, 11,  8, 19, 24]), array([17,  8,  6,  7, 19,  5, 20, 11, 11, 24]), array([ 4,  2, 14, 12, 15, 17,  4, 18, 18])]
Accuracy: 0.00809222078021223
