In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
tf.random.set_seed(1234)
np.random.seed(1234)
# -------------------------------
# MLP Class Definition (look at slides)
# -------------------------------
class MLP(object):
    """Multi-layer perceptron with two ReLU hidden layers and a linear (logit) output layer."""

    def __init__(self, size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None):
        """
        Record the layer sizes and create the parameters.

        size_input: int, size of input layer
        size_hidden1: int, size of the 1st hidden layer
        size_hidden2: int, size of the 2nd hidden layer
        size_hidden3: int, size of the 3rd hidden layer (stored only; compute_output does not use it)
        size_output: int, size of output layer
        device: str or None. None skips explicit device placement; 'gpu' selects 'gpu:0',
                any other value selects 'cpu'.
        """
        self.size_input = size_input
        self.size_hidden1 = size_hidden1
        self.size_hidden2 = size_hidden2
        self.size_hidden3 = size_hidden3  # kept for interface compatibility, no layer is built from it
        self.size_output = size_output
        self.device = device

        def make_layer(n_in, n_out):
            # Weights are drawn from a normal distribution (stddev 0.1); biases start at zero.
            kernel = tf.Variable(tf.random.normal([n_in, n_out], stddev=0.1))
            offset = tf.Variable(tf.zeros([1, n_out]))
            return kernel, offset

        # Layers are created from input to output so the random draws happen in a fixed order.
        self.W1, self.b1 = make_layer(self.size_input, self.size_hidden1)
        self.W2, self.b2 = make_layer(self.size_hidden1, self.size_hidden2)
        self.W3, self.b3 = make_layer(self.size_hidden2, self.size_output)

        # Parameters that backward() differentiates with respect to.
        self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]

    def forward(self, X):
        """
        Run the forward pass, optionally under an explicit device scope.

        X: inputs accepted by compute_output.
        Returns the logits, which are also cached on self.y.
        """
        if self.device is None:
            self.y = self.compute_output(X)
            return self.y

        target = 'gpu:0' if self.device == 'gpu' else 'cpu'
        with tf.device(target):
            self.y = self.compute_output(X)
        return self.y

    def loss(self, y_pred, y_true):
        """
        Categorical cross-entropy between logits and one-hot targets.

        y_pred - logits of shape (batch_size, size_output)
        y_true - targets of shape (batch_size, size_output)
        """
        logits = tf.cast(y_pred, dtype=tf.float32)
        targets = tf.cast(y_true, dtype=tf.float32)
        # from_logits=True: the network output is not passed through softmax beforehand.
        criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        return criterion(targets, logits)

    def backward(self, X_train, y_train):
        """
        Return the gradients of the loss on (X_train, y_train) w.r.t. self.variables.

        The gradients are returned in the same order as self.variables.
        """
        with tf.GradientTape() as recorder:
            logits = self.forward(X_train)
            batch_loss = self.loss(logits, y_train)
        return recorder.gradient(batch_loss, self.variables)

    def compute_output(self, X):
        """
        Map inputs to logits: dense -> ReLU -> dense -> ReLU -> dense.
        """
        inputs = tf.cast(X, dtype=tf.float32)
        hidden1 = tf.nn.relu(tf.matmul(inputs, self.W1) + self.b1)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, self.W2) + self.b2)
        # No activation on the last layer: callers receive raw logits.
        return tf.matmul(hidden2, self.W3) + self.b3

# -------------------------------
# Character-Level Tokenizer and Preprocessing Functions
# -------------------------------
def char_level_tokenizer(texts, num_words=None):
    """
    Build a character-level Tokenizer and fit it on the given texts.

    Args:
        texts (list of str): Corpus used to fit the tokenizer.
        num_words (int or None): Forwarded to Tokenizer as its ``num_words`` limit.

    Returns:
        The fitted Tokenizer instance.
    """
    options = dict(num_words=num_words, char_level=True, lower=True)
    fitted = Tokenizer(**options)
    fitted.fit_on_texts(texts)
    return fitted

def texts_to_bow(tokenizer, texts):
    """
    Encode texts as a bag-of-characters matrix.

    Args:
        tokenizer: A fitted character-level Tokenizer.
        texts (list of str): Texts to encode.

    Returns:
        Whatever ``tokenizer.texts_to_matrix`` produces in 'binary' mode
        (one row per text marking which characters are present).
    """
    return tokenizer.texts_to_matrix(texts, mode='binary')

# -------------------------------
# Example Usage for IMDB Classification
# -------------------------------
if __name__ == "__main__":
    # Two toy reviews stand in for a real dataset (in practice, load your dataset here).
    texts = [
        "I loved this movie! It was fantastic.",
        "The film was terrible and boring."
    ]
    # One-hot encoded labels for 2 classes (e.g., positive: [0,1], negative: [1,0])
    labels = np.array([[0, 1], [1, 0]])

    # Create and fit a character-level tokenizer
    tokenizer = char_level_tokenizer(texts)

    # Convert texts to bag-of-characters representation
    X = texts_to_bow(tokenizer, texts)
    print("Input shape:", X.shape)

    # Set model hyperparameters.
    # The input size is equal to the dimension of the bag-of-characters vector.
    size_input = X.shape[1]
    size_hidden1 = 64
    size_hidden2 = 32
    size_hidden3 = 16  # Not used in compute_output (placeholder for a potential extra layer)
    size_output = 2

    # Instantiate the MLP model.
    model = MLP(size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None)

    # Define an optimizer.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    # Training loop (for demonstration purposes; adjust epochs and batch size as needed)
    epochs = 10
    for epoch in range(epochs):
        # One forward pass per epoch: the same tape yields both the loss that is
        # logged and the gradients, instead of recomputing the forward pass
        # a second time inside model.backward().
        with tf.GradientTape() as tape:
            predictions = model.forward(X)
            current_loss = model.loss(predictions, labels)
        grads = tape.gradient(current_loss, model.variables)
        # grads follow the order of model.variables, so they can be zipped directly.
        optimizer.apply_gradients(zip(grads, model.variables))
        # The reported loss is the value measured before this epoch's update.
        print(f"Epoch {epoch+1}, Loss: {current_loss.numpy()}")

    # Testing the model on a new review.
    new_text = ["An amazing film with a thrilling plot."]
    X_new = texts_to_bow(tokenizer, new_text)
    logits = model.forward(X_new)
    # The model returns logits; softmax turns them into class probabilities.
    probabilities = tf.nn.softmax(logits)
    print("Predicted probabilities:", probabilities.numpy())


Input shape: (2, 22)
Epoch 1, Loss: 0.687633752822876
Epoch 2, Loss: 0.6506392359733582
Epoch 3, Loss: 0.6089686155319214
Epoch 4, Loss: 0.5542305707931519
Epoch 5, Loss: 0.47900712490081787
Epoch 6, Loss: 0.39020609855651855
Epoch 7, Loss: 0.2936958968639374
Epoch 8, Loss: 0.20051637291908264
Epoch 9, Loss: 0.12191140651702881
Epoch 10, Loss: 0.06505545973777771
Predicted probabilities: [[0.8995173  0.10048267]]


**MLP on IMDB Dataset**

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

tf.random.set_seed(1234)
np.random.seed(1234)
# -------------------------------
# Original MLP Class Definition
# -------------------------------
class MLP(object):
    """Fully trainable MLP: two ReLU hidden layers followed by a linear output layer."""

    def __init__(self, size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None):
        """
        size_input: int, size of input layer
        size_hidden1: int, size of the 1st hidden layer
        size_hidden2: int, size of the 2nd hidden layer
        size_hidden3: int, size of the 3rd hidden layer (stored but not used by compute_output)
        size_output: int, size of output layer
        device: str or None. None means no explicit placement; 'gpu' maps to 'gpu:0',
                anything else maps to 'cpu'.
        """
        self.size_input = size_input
        self.size_hidden1 = size_hidden1
        self.size_hidden2 = size_hidden2
        self.size_hidden3 = size_hidden3  # not wired into the forward pass
        self.size_output = size_output
        self.device = device

        # (fan_in, fan_out) for each dense layer, listed from input to output.
        layer_shapes = [
            (self.size_input, self.size_hidden1),
            (self.size_hidden1, self.size_hidden2),
            (self.size_hidden2, self.size_output),
        ]
        weights, biases = [], []
        for fan_in, fan_out in layer_shapes:
            # Normal-initialised weights (stddev 0.1) and zero biases of shape (1, fan_out).
            weights.append(tf.Variable(tf.random.normal([fan_in, fan_out], stddev=0.1)))
            biases.append(tf.Variable(tf.zeros([1, fan_out])))

        self.W1, self.W2, self.W3 = weights
        self.b1, self.b2, self.b3 = biases

        # All weights first, then all biases: the order in which gradients are returned.
        self.variables = weights + biases

    def compute_output(self, X):
        """
        Compute the logits for a batch of inputs.
        """
        activation = tf.cast(X, dtype=tf.float32)
        for kernel, offset in ((self.W1, self.b1), (self.W2, self.b2)):
            activation = tf.nn.relu(tf.matmul(activation, kernel) + offset)
        # Final layer is linear; softmax is left to the loss / the caller.
        return tf.matmul(activation, self.W3) + self.b3

    def forward(self, X):
        """
        Forward pass.
        X: Tensor, inputs.
        Returns the logits and stores them on self.y.
        """
        if self.device is None:
            self.y = self.compute_output(X)
        else:
            placement = 'gpu:0' if self.device == 'gpu' else 'cpu'
            with tf.device(placement):
                self.y = self.compute_output(X)
        return self.y

    def loss(self, y_pred, y_true):
        """
        Categorical cross-entropy computed from logits.
        y_pred: Tensor of shape (batch_size, size_output)
        y_true: Tensor of shape (batch_size, size_output)
        """
        criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        targets = tf.cast(y_true, dtype=tf.float32)
        logits = tf.cast(y_pred, dtype=tf.float32)
        return criterion(targets, logits)

    def backward(self, X_train, y_train):
        """
        Differentiate the loss on (X_train, y_train) with respect to self.variables.
        """
        with tf.GradientTape() as recorder:
            batch_loss = self.loss(self.forward(X_train), y_train)
        return recorder.gradient(batch_loss, self.variables)

# -------------------------------
# Character-Level Tokenizer and Preprocessing Functions
# -------------------------------
def char_level_tokenizer(texts, num_words=None):
    """
    Fit a lower-casing, character-level Keras Tokenizer on ``texts``.

    Args:
        texts (list of str): Texts the tokenizer is fitted on.
        num_words (int or None): Passed through as the Tokenizer's ``num_words``.

    Returns:
        tokenizer: The fitted Tokenizer instance.
    """
    tokenizer_cls = tf.keras.preprocessing.text.Tokenizer
    char_tokenizer = tokenizer_cls(num_words=num_words, char_level=True, lower=True)
    char_tokenizer.fit_on_texts(texts)
    return char_tokenizer

def texts_to_bow(tokenizer, texts):
    """
    Turn texts into bag-of-characters rows via the tokenizer's 'binary' matrix mode.

    Args:
        tokenizer: A fitted character-level Tokenizer.
        texts (list of str): Texts to vectorize.

    Returns:
        The matrix returned by ``tokenizer.texts_to_matrix`` (one row per text).
    """
    bow_matrix = tokenizer.texts_to_matrix(texts, mode='binary')
    return bow_matrix

def one_hot_encode(labels, num_classes=2):
    """
    Return one-hot rows for integer class labels.

    Each label selects the matching row of a ``num_classes`` x ``num_classes``
    identity matrix.
    """
    identity = np.eye(num_classes)
    return identity[labels]

# -------------------------------
# Load and Prepare the IMDB Dataset
# -------------------------------
print("Loading IMDB dataset...")
# as_supervised=True yields (text, label) pairs; with_info=True also returns the dataset info.
(ds_train, ds_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
)


def _to_texts_and_labels(dataset):
    """Materialize a (text, label) dataset as a list of utf-8 strings plus a label array."""
    decoded, targets = [], []
    for raw_text, target in tfds.as_numpy(dataset):
        # Examples arrive as byte strings; decode them before tokenization.
        decoded.append(raw_text.decode('utf-8'))
        targets.append(target)
    return decoded, np.array(targets)


# Training split, then hold out 20% of it for validation.
train_texts, train_labels = _to_texts_and_labels(ds_train)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42)

# Test split.
test_texts, test_labels = _to_texts_and_labels(ds_test)

print(f"Train samples: {len(train_texts)}, Validation samples: {len(val_texts)}, Test samples: {len(test_texts)}")

# -------------------------------
# Preprocessing: Tokenization and Vectorization
# -------------------------------
# Fit the character-level tokenizer on the training texts only.
tokenizer = char_level_tokenizer(train_texts)
vocab_size = len(tokenizer.word_index) + 1
print("Tokenizer vocabulary size:", vocab_size)

# Bag-of-characters features for every split, all from the same fitted tokenizer.
X_train, X_val, X_test = (
    texts_to_bow(tokenizer, corpus) for corpus in (train_texts, val_texts, test_texts)
)

# One-hot targets for every split.
y_train, y_val, y_test = (
    one_hot_encode(targets) for targets in (train_labels, val_labels, test_labels)
)

# -------------------------------
# Model Setup
# -------------------------------
# The input width equals the length of a bag-of-characters vector.
size_input = X_train.shape[1]
# Hidden layer sizes; size_hidden3 is a placeholder that the forward pass does not use.
size_hidden1, size_hidden2, size_hidden3 = 128, 64, 32
size_output = 2

# Model and optimizer.
model = MLP(size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# -------------------------------
# Training Parameters and Loop
# -------------------------------
batch_size = 128
epochs = 10
num_batches = int(np.ceil(X_train.shape[0] / batch_size))

print("\nStarting training...\n")
for epoch in range(epochs):
    # Shuffle training data at the start of each epoch (features and labels together).
    indices = np.arange(X_train.shape[0])
    np.random.shuffle(indices)
    X_train = X_train[indices]
    y_train = y_train[indices]

    epoch_loss = 0
    for i in range(num_batches):
        start = i * batch_size
        # The last batch may be shorter than batch_size.
        end = min((i+1) * batch_size, X_train.shape[0])
        X_batch = X_train[start:end]
        y_batch = y_train[start:end]

        # Single forward pass per batch: the tape provides both the loss used for
        # reporting and the gradients, rather than running the forward pass once
        # for the loss and a second time inside model.backward().
        with tf.GradientTape() as tape:
            predictions = model.forward(X_batch)
            loss_value = model.loss(predictions, y_batch)
        grads = tape.gradient(loss_value, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))
        # Weight the batch-mean loss by batch size so the epoch value is a per-sample mean.
        epoch_loss += loss_value.numpy() * (end - start)

    epoch_loss /= X_train.shape[0]

    # Evaluate on validation set.
    val_logits = model.forward(X_val)
    val_loss = model.loss(val_logits, y_val).numpy()
    val_preds = np.argmax(val_logits.numpy(), axis=1)
    true_val = np.argmax(y_val, axis=1)
    accuracy = np.mean(val_preds == true_val)
    precision = precision_score(true_val, val_preds)
    recall = recall_score(true_val, val_preds)

    print(f"Epoch {epoch+1:02d} | Training Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | "
          f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

# -------------------------------
# Final Evaluation on Test Set
# -------------------------------
print("\nEvaluating on test set...")
# Logits and loss for the whole test set in one pass.
test_logits = model.forward(X_test)
test_loss = model.loss(test_logits, y_test).numpy()

# Class index = position of the largest logit / of the 1 in the one-hot target.
test_preds = test_logits.numpy().argmax(axis=1)
true_test = y_test.argmax(axis=1)

test_accuracy = (test_preds == true_test).mean()
test_precision = precision_score(true_test, test_preds)
test_recall = recall_score(true_test, test_preds)

summary = (f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | "
           f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")
print(summary)

Loading IMDB dataset...
Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.T37U9L_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.T37U9L_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.T37U9L_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
Train samples: 20000, Validation samples: 5000, Test samples: 25000
Tokenizer vocabulary size: 134

Starting training...

Epoch 01 | Training Loss: 0.6715 | Val Loss: 0.6636 | Accuracy: 0.6058 | Precision: 0.5822 | Recall: 0.6621
Epoch 02 | Training Loss: 0.6625 | Val Loss: 0.6634 | Accuracy: 0.6056 | Precision: 0.5792 | Recall: 0.6819
Epoch 03 | Training Loss: 0.6604 | Val Loss: 0.6619 | Accuracy: 0.6032 | Precision: 0.5742 | Recall: 0.7021
Epoch 04 | Training Loss: 0.6580 | Val Loss: 0.6625 | Accuracy: 0.6030 | Precision: 0.5717 | Recall: 0.7224
Epoch 05 | Training Loss: 0.6582 | Val Loss: 0.6606 | Accuracy: 0.6084 | Precision: 0.5822 | Recall: 0.6811
Epoch 06 | Training Loss: 0.6546 | Val Loss: 0.6683 | Accuracy: 0.5986 | Precision: 0.5630 | Recall: 0.7690
Epoch 07 | Training Loss: 0.6536 | Val Loss: 0.6748 | Accuracy: 0.5810 | Precision: 0.

**Random MLP on IMDB Dataset**

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
tf.random.set_seed(1234)
np.random.seed(1234)
# -------------------------------
# MLP_rnd Class Definition (hidden layers keep their random initial weights; only W3/b3 are trained)
# -------------------------------
class MLP_rnd(object):
    """MLP whose hidden layers keep their initial random values; only the output layer is exposed for training."""

    @staticmethod
    def _new_layer(fan_in, fan_out):
        """Create one dense layer: normal-initialised weights (stddev 0.1) and zero biases."""
        kernel = tf.Variable(tf.random.normal([fan_in, fan_out], stddev=0.1))
        offset = tf.Variable(tf.zeros([1, fan_out]))
        return kernel, offset

    def __init__(self, size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None):
        """
        size_input: int, size of input layer
        size_hidden1: int, size of the 1st hidden layer
        size_hidden2: int, size of the 2nd hidden layer
        size_hidden3: int, size of the 3rd hidden layer (stored but not used by compute_output)
        size_output: int, size of output layer
        device: str or None. None means no explicit placement; 'gpu' maps to 'gpu:0',
                anything else maps to 'cpu'.
        """
        self.size_input = size_input
        self.size_hidden1 = size_hidden1
        self.size_hidden2 = size_hidden2
        self.size_hidden3 = size_hidden3  # not wired into the forward pass
        self.size_output = size_output
        self.device = device

        # Layers are created from input to output so the random draws happen in a fixed order.
        self.W1, self.b1 = self._new_layer(self.size_input, self.size_hidden1)
        self.W2, self.b2 = self._new_layer(self.size_hidden1, self.size_hidden2)
        self.W3, self.b3 = self._new_layer(self.size_hidden2, self.size_output)

        # Only the output layer is listed here, so backward() returns gradients for
        # W3 and b3 alone and W1/b1/W2/b2 are never part of an update.
        self.variables = [self.W3, self.b3]

    def forward(self, X):
        """
        Forward pass.
        X: Tensor, inputs.
        Returns the logits and stores them on self.y.
        """
        if self.device is None:
            self.y = self.compute_output(X)
            return self.y

        placement = 'gpu:0' if self.device == 'gpu' else 'cpu'
        with tf.device(placement):
            self.y = self.compute_output(X)
        return self.y

    def loss(self, y_pred, y_true):
        """
        Categorical cross-entropy computed from logits.
        y_pred: Tensor of shape (batch_size, size_output)
        y_true: Tensor of shape (batch_size, size_output)
        """
        targets = tf.cast(y_true, dtype=tf.float32)
        logits = tf.cast(y_pred, dtype=tf.float32)
        criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        return criterion(targets, logits)

    def backward(self, X_train, y_train):
        """
        Differentiate the loss on (X_train, y_train) with respect to self.variables.
        """
        with tf.GradientTape() as recorder:
            logits = self.forward(X_train)
            batch_loss = self.loss(logits, y_train)
        return recorder.gradient(batch_loss, self.variables)

    def compute_output(self, X):
        """
        Compute the logits: dense -> ReLU -> dense -> ReLU -> dense.
        """
        inputs = tf.cast(X, dtype=tf.float32)
        hidden1 = tf.nn.relu(tf.matmul(inputs, self.W1) + self.b1)
        hidden2 = tf.nn.relu(tf.matmul(hidden1, self.W2) + self.b2)
        # Linear output layer: raw logits are returned.
        return tf.matmul(hidden2, self.W3) + self.b3

# -------------------------------
# Character-Level Tokenizer and Preprocessing Functions
# -------------------------------
def char_level_tokenizer(texts, num_words=None):
    """
    Create a character-level Keras Tokenizer (lower-casing enabled) and fit it.

    Args:
        texts (list of str): Texts used for fitting.
        num_words (int or None): Forwarded as the Tokenizer's ``num_words`` argument.

    Returns:
        tokenizer: The fitted Tokenizer instance.
    """
    settings = {"num_words": num_words, "char_level": True, "lower": True}
    fitted = tf.keras.preprocessing.text.Tokenizer(**settings)
    fitted.fit_on_texts(texts)
    return fitted

def texts_to_bow(tokenizer, texts):
    """
    Vectorize texts with the tokenizer's 'binary' matrix mode (bag of characters).

    Args:
        tokenizer: A fitted character-level Tokenizer.
        texts (list of str): Texts to vectorize.

    Returns:
        The result of ``tokenizer.texts_to_matrix(texts, mode='binary')``.
    """
    encode = tokenizer.texts_to_matrix
    return encode(texts, mode='binary')

def one_hot_encode(labels, num_classes=2):
    """
    One-hot encode integer labels by indexing into an identity matrix.

    Row ``k`` of the ``num_classes``-sized identity matrix is the encoding of label ``k``.
    """
    lookup = np.eye(num_classes)
    encoded = lookup[labels]
    return encoded

# -------------------------------
# Load and Prepare the IMDB Dataset
# -------------------------------
print("Loading IMDB dataset...")
# as_supervised=True yields (text, label) pairs; with_info=True also returns the dataset info.
(ds_train, ds_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
)

# Training split: decode the byte-string reviews and gather the labels into an array.
train_examples = list(tfds.as_numpy(ds_train))
train_texts = [raw.decode('utf-8') for raw, _ in train_examples]
train_labels = np.array([target for _, target in train_examples])

# Hold out 20% of the training data for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42)

# Test split, converted the same way.
test_examples = list(tfds.as_numpy(ds_test))
test_texts = [raw.decode('utf-8') for raw, _ in test_examples]
test_labels = np.array([target for _, target in test_examples])

print(f"Train samples: {len(train_texts)}, Validation samples: {len(val_texts)}, Test samples: {len(test_texts)}")

# -------------------------------
# Preprocessing: Tokenization and Vectorization
# -------------------------------
# Fit the character-level tokenizer on the training texts only.
tokenizer = char_level_tokenizer(train_texts)
print("Tokenizer vocabulary size:", len(tokenizer.word_index) + 1)

# Bag-of-characters features and one-hot targets, split by split.
X_train, y_train = texts_to_bow(tokenizer, train_texts), one_hot_encode(train_labels)
X_val, y_val = texts_to_bow(tokenizer, val_texts), one_hot_encode(val_labels)
X_test, y_test = texts_to_bow(tokenizer, test_texts), one_hot_encode(test_labels)

# -------------------------------
# Model Setup
# -------------------------------
# The input width equals the length of a bag-of-characters vector.
size_input = X_train.shape[1]
# Hidden layer sizes; size_hidden3 is a placeholder that the forward pass does not use.
size_hidden1, size_hidden2, size_hidden3 = 128, 64, 32
size_output = 2

# Model and optimizer.
model = MLP_rnd(size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# -------------------------------
# Training Parameters and Loop
# -------------------------------
batch_size = 128
epochs = 10
num_batches = int(np.ceil(X_train.shape[0] / batch_size))

print("\nStarting training...\n")
for epoch in range(epochs):
    # Shuffle training data at the start of each epoch (features and labels together).
    indices = np.arange(X_train.shape[0])
    np.random.shuffle(indices)
    X_train = X_train[indices]
    y_train = y_train[indices]

    epoch_loss = 0
    for i in range(num_batches):
        start = i * batch_size
        # The last batch may be shorter than batch_size.
        end = min((i+1) * batch_size, X_train.shape[0])
        X_batch = X_train[start:end]
        y_batch = y_train[start:end]

        # Single forward pass per batch: the tape provides both the loss used for
        # reporting and the gradients, rather than running the forward pass once
        # for the loss and a second time inside model.backward().
        with tf.GradientTape() as tape:
            predictions = model.forward(X_batch)
            loss_value = model.loss(predictions, y_batch)
        # Gradients are taken only for the variables the model lists in model.variables.
        grads = tape.gradient(loss_value, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))
        # Weight the batch-mean loss by batch size so the epoch value is a per-sample mean.
        epoch_loss += loss_value.numpy() * (end - start)

    epoch_loss /= X_train.shape[0]

    # Evaluate on validation set.
    val_logits = model.forward(X_val)
    val_loss = model.loss(val_logits, y_val).numpy()
    val_preds = np.argmax(val_logits.numpy(), axis=1)
    true_val = np.argmax(y_val, axis=1)
    accuracy = np.mean(val_preds == true_val)
    precision = precision_score(true_val, val_preds)
    recall = recall_score(true_val, val_preds)

    print(f"Epoch {epoch+1:02d} | Training Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | "
          f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

# -------------------------------
# Final Evaluation on Test Set
# -------------------------------
print("\nEvaluating on test set...")
# Logits and loss for the whole test set in one pass.
test_logits = model.forward(X_test)
test_loss = model.loss(test_logits, y_test).numpy()

# Predicted class = largest logit; true class = position of the 1 in the one-hot row.
test_preds = np.argmax(test_logits.numpy(), axis=1)
true_test = np.argmax(y_test, axis=1)

is_correct = test_preds == true_test
test_accuracy = np.mean(is_correct)
test_precision = precision_score(true_test, test_preds)
test_recall = recall_score(true_test, test_preds)

report = (f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | "
          f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")
print(report)

Loading IMDB dataset...
Train samples: 20000, Validation samples: 5000, Test samples: 25000
Tokenizer vocabulary size: 134

Starting training...

Epoch 01 | Training Loss: 0.6879 | Val Loss: 0.6852 | Accuracy: 0.5636 | Precision: 0.5433 | Recall: 0.6258
Epoch 02 | Training Loss: 0.6828 | Val Loss: 0.6820 | Accuracy: 0.5704 | Precision: 0.5454 | Recall: 0.6840
Epoch 03 | Training Loss: 0.6796 | Val Loss: 0.6796 | Accuracy: 0.5764 | Precision: 0.5512 | Recall: 0.6795
Epoch 04 | Training Loss: 0.6775 | Val Loss: 0.6771 | Accuracy: 0.5802 | Precision: 0.5641 | Recall: 0.5903
Epoch 05 | Training Loss: 0.6759 | Val Loss: 0.6767 | Accuracy: 0.5822 | Precision: 0.5592 | Recall: 0.6531
Epoch 06 | Training Loss: 0.6747 | Val Loss: 0.6756 | Accuracy: 0.5852 | Precision: 0.5651 | Recall: 0.6267
Epoch 07 | Training Loss: 0.6739 | Val Loss: 0.6749 | Accuracy: 0.5866 | Precision: 0.5676 | Recall: 0.6180
Epoch 08 | Training Loss: 0.6731 | Val Loss: 0.6746 | Accuracy: 0.5860 | Precision: 0.5660 | Recal

**MLP with feedback alignment on IMDB Dataset**

In [4]:
class MLP_FA(object):
    """MLP with two ReLU hidden layers trained by feedback alignment.

    The forward pass is an ordinary fully connected network
    (input -> hidden1 -> hidden2 -> logits). In the backward pass the error is
    sent to the hidden layers through the fixed random matrices ``B3`` and
    ``B2`` instead of the transposed forward weights ``W3^T`` and ``W2^T``.
    """

    def __init__(self, size_input, size_hidden1, size_hidden2, size_hidden3, size_output, device=None):
        """
        size_input: int, size of input layer
        size_hidden1: int, size of the 1st hidden layer
        size_hidden2: int, size of the 2nd hidden layer
        size_hidden3: int, size of the 3rd hidden layer (stored only; no layer is built from it)
        size_output: int, size of output layer
        device: str or None, either 'cpu' or 'gpu' or None.
        """
        self.size_input = size_input
        self.size_hidden1 = size_hidden1
        self.size_hidden2 = size_hidden2
        self.size_hidden3 = size_hidden3  # (Currently not used)
        self.size_output = size_output
        self.device = device

        # NOTE: the creation order of the random tensors below is kept fixed so
        # that a seeded run draws the same initial values.

        # Initialize weights and biases for first hidden layer
        self.W1 = tf.Variable(tf.random.normal([self.size_input, self.size_hidden1], stddev=0.1))
        self.b1 = tf.Variable(tf.zeros([1, self.size_hidden1]))

        # Initialize weights and biases for second hidden layer
        self.W2 = tf.Variable(tf.random.normal([self.size_hidden1, self.size_hidden2], stddev=0.1))
        self.b2 = tf.Variable(tf.zeros([1, self.size_hidden2]))

        # Initialize weights and biases for output layer
        self.W3 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_output], stddev=0.1))
        self.b3 = tf.Variable(tf.zeros([1, self.size_output]))

        # Create fixed random feedback matrices for feedback alignment.
        # No stddev is passed here, unlike the 0.1 used for the forward weights.
        # B3: used to propagate the error from the output layer to the second hidden layer.
        # It replaces the use of W3^T. Its shape is (size_output, size_hidden2).
        self.B3 = tf.Variable(tf.random.normal([self.size_output, self.size_hidden2]), trainable=False)

        # B2: used to propagate the error from the second hidden layer to the first hidden layer.
        # It replaces the use of W2^T. Its shape is (size_hidden2, size_hidden1).
        self.B2 = tf.Variable(tf.random.normal([self.size_hidden2, self.size_hidden1]), trainable=False)

        # Variables updated during training. The order must match the gradient
        # list returned by backward(): [W1, W2, W3, b1, b2, b3].
        self.variables = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]

    def forward(self, X):
        """
        Forward pass.
        X: Tensor, inputs.
        Returns the output logits and also stores them on ``self.y``.
        """
        if self.device is not None:
            with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
                self.y = self.compute_output(X)
        else:
            self.y = self.compute_output(X)
        return self.y

    def loss(self, y_pred, y_true):
        """
        Computes the categorical cross-entropy between predicted and true outputs.
        y_pred - Tensor of shape (batch_size, size_output); treated as logits
                 (the loss is built with from_logits=True)
        y_true - Tensor of shape (batch_size, size_output)
        """
        y_true_tf = tf.cast(y_true, dtype=tf.float32)
        y_pred_tf = tf.cast(y_pred, dtype=tf.float32)
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        loss_x = cce(y_true_tf, y_pred_tf)
        return loss_x

    def backward(self, X_train, y_train):
        """
        Backward pass using feedback alignment.
        Computes gradients manually using fixed random feedback matrices.
        Runs under the same device scope as forward() when ``self.device`` is set.
        X_train: Input data (numpy array)
        y_train: One-hot encoded labels (numpy array)
        Returns: List of gradients corresponding to [dW1, dW2, dW3, db1, db2, db3]
        """
        if self.device is not None:
            with tf.device('gpu:0' if self.device == 'gpu' else 'cpu'):
                return self._feedback_alignment_gradients(X_train, y_train)
        return self._feedback_alignment_gradients(X_train, y_train)

    def compute_output(self, X):
        """
        Custom method to obtain output tensor (logits) during the forward pass.
        """
        return self._forward_pass(X)[-1]

    def _forward_pass(self, X):
        """
        Run the network once and return every intermediate the backward pass needs.
        Returns: (X_tf, h1, a1, h2, a2, logits) where h* are pre-activations and
        a* are the ReLU activations.
        """
        X_tf = tf.cast(X, dtype=tf.float32)
        # First hidden layer
        h1 = tf.matmul(X_tf, self.W1) + self.b1
        a1 = tf.nn.relu(h1)
        # Second hidden layer
        h2 = tf.matmul(a1, self.W2) + self.b2
        a2 = tf.nn.relu(h2)
        # Output layer (logits)
        logits = tf.matmul(a2, self.W3) + self.b3
        return X_tf, h1, a1, h2, a2, logits

    def _feedback_alignment_gradients(self, X_train, y_train):
        """
        Compute the batch-averaged feedback-alignment gradients for one batch.
        Returns: [dW1, dW2, dW3, db1, db2, db3]
        """
        # --- Forward Pass ---
        X_tf, h1, a1, h2, a2, logits = self._forward_pass(X_train)
        # Softmax predictions
        y_pred = tf.nn.softmax(logits)

        # --- Compute Output Error ---
        # For cross-entropy with softmax, the derivative is (y_pred - y_true)
        delta3 = y_pred - tf.cast(y_train, tf.float32)  # shape: (batch, size_output)
        batch_size = tf.cast(tf.shape(X_tf)[0], tf.float32)

        # --- Gradients for Output Layer ---
        dW3 = tf.matmul(tf.transpose(a2), delta3) / batch_size
        db3 = tf.reduce_mean(delta3, axis=0, keepdims=True)

        # --- Feedback Alignment for Second Hidden Layer ---
        # Instead of delta2 = (delta3 dot W3^T) * ReLU'(h2), use a fixed random matrix B3.
        relu_grad_h2 = tf.cast(h2 > 0, tf.float32)
        # delta3 has shape (batch, size_output) and B3 has shape (size_output, size_hidden2)
        delta2 = tf.matmul(delta3, self.B3) * relu_grad_h2  # shape: (batch, size_hidden2)

        dW2 = tf.matmul(tf.transpose(a1), delta2) / batch_size
        db2 = tf.reduce_mean(delta2, axis=0, keepdims=True)

        # --- Feedback Alignment for First Hidden Layer ---
        # Instead of delta1 = (delta2 dot W2^T) * ReLU'(h1), use a fixed random matrix B2.
        relu_grad_h1 = tf.cast(h1 > 0, tf.float32)
        # delta2 has shape (batch, size_hidden2) and B2 has shape (size_hidden2, size_hidden1)
        delta1 = tf.matmul(delta2, self.B2) * relu_grad_h1  # shape: (batch, size_hidden1)

        dW1 = tf.matmul(tf.transpose(X_tf), delta1) / batch_size
        db1 = tf.reduce_mean(delta1, axis=0, keepdims=True)

        return [dW1, dW2, dW3, db1, db2, db3]


# -------------------------------
# Character-Level Tokenizer and Preprocessing Functions
# -------------------------------
def char_level_tokenizer(texts, num_words=None):
    """
    Build a Keras Tokenizer in character mode and fit it on the given texts.

    Args:
        texts (list of str): Texts the tokenizer is fitted on.
        num_words (int or None): Forwarded to the Tokenizer as ``num_words``.

    Returns:
        The fitted Tokenizer instance (created with ``char_level=True`` and
        ``lower=True``).
    """
    char_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        lower=True,
        char_level=True,
    )
    # fit_on_texts updates the tokenizer in place, so the instance itself is returned.
    char_tokenizer.fit_on_texts(texts)
    return char_tokenizer

def texts_to_bow(tokenizer, texts):
    """
    Vectorise texts as a binary bag-of-characters matrix.

    Args:
        tokenizer: A fitted character-level Tokenizer; only its
            ``texts_to_matrix`` method is used.
        texts (list of str): Texts to convert.

    Returns:
        Whatever ``tokenizer.texts_to_matrix(texts, mode='binary')`` returns;
        presumably one fixed-length binary row per text for a Keras Tokenizer.
    """
    # 'binary' mode records presence/absence of each token rather than counts.
    return tokenizer.texts_to_matrix(texts, mode='binary')

def one_hot_encode(labels, num_classes=2):
    """
    Convert numeric labels to one-hot encoded vectors.

    Args:
        labels: Integer class label(s): a scalar, list, tuple or NumPy array.
        num_classes (int): Number of classes, i.e. the length of each one-hot row.

    Returns:
        Float NumPy array with one row of length ``num_classes`` per label
        (shape ``labels.shape + (num_classes,)``).

    Raises:
        IndexError: If a label is outside the valid index range for
            ``num_classes`` or is not of integer type.
    """
    # Convert to an array first: a tuple such as (0, 1) would otherwise be read
    # by NumPy as a multi-dimensional index and return the scalar eye[0, 1].
    index = np.asarray(labels)
    if index.size == 0:
        # An empty sequence converts to float64, which NumPy rejects as an index.
        index = index.astype(np.intp)
    return np.eye(num_classes)[index]

# -------------------------------
# Load and Prepare the IMDB Dataset
# -------------------------------
print("Loading IMDB dataset...")
# as_supervised=True makes each example a (text, label) pair.
(ds_train, ds_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
)

# Materialise the training split once, then separate texts from labels.
# Texts arrive as byte strings and are decoded to utf-8 str.
train_pairs = list(tfds.as_numpy(ds_train))
train_texts = [raw_text.decode('utf-8') for raw_text, _ in train_pairs]
train_labels = np.array([raw_label for _, raw_label in train_pairs])

# Hold out 20% of the training data for validation (fixed split seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42)

# Same conversion for the test split.
test_pairs = list(tfds.as_numpy(ds_test))
test_texts = [raw_text.decode('utf-8') for raw_text, _ in test_pairs]
test_labels = np.array([raw_label for _, raw_label in test_pairs])

print(f"Train samples: {len(train_texts)}, Validation samples: {len(val_texts)}, Test samples: {len(test_texts)}")

# -------------------------------
# Preprocessing: Tokenization and Vectorization
# -------------------------------
# The tokenizer is fitted on the training texts only and reused for every split.
tokenizer = char_level_tokenizer(train_texts)
# The printed size is the number of fitted tokens plus one
# (presumably accounting for a reserved index 0 - confirm against the Tokenizer docs).
print("Tokenizer vocabulary size:", len(tokenizer.word_index) + 1)

# Bag-of-characters features for the train / validation / test splits.
X_train, X_val, X_test = (
    texts_to_bow(tokenizer, split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

# One-hot targets for the same three splits.
y_train, y_val, y_test = (
    one_hot_encode(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

# -------------------------------
# Model Setup
# -------------------------------
# Input width follows the bag-of-characters feature dimension.
size_input = X_train.shape[1]
# Hidden layer widths; size_hidden3 is a placeholder (not used in the forward pass).
size_hidden1, size_hidden2, size_hidden3 = 128, 64, 32
size_output = 2

# Feedback-alignment MLP; device=None leaves device placement unspecified.
model = MLP_FA(
    size_input,
    size_hidden1,
    size_hidden2,
    size_hidden3,
    size_output,
    device=None,
)

# Adam applies the manually computed gradients during training.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# -------------------------------
# Training Parameters and Loop
# -------------------------------
batch_size = 128
epochs = 10
# Ceil so that a final, smaller batch is still processed.
num_batches = int(np.ceil(X_train.shape[0] / batch_size))

print("\nStarting training...\n")
for epoch in range(epochs):
    # Reshuffle once per epoch; inputs and labels are reordered with the same permutation.
    indices = np.arange(X_train.shape[0])
    np.random.shuffle(indices)
    X_train, y_train = X_train[indices], y_train[indices]

    epoch_loss = 0
    for i in range(num_batches):
        start = i * batch_size
        end = min(start + batch_size, X_train.shape[0])
        X_batch, y_batch = X_train[start:end], y_train[start:end]

        # The loss is only tracked for reporting. The gradients come from
        # model.backward (feedback alignment), not from automatic differentiation
        # of the loss, so no GradientTape is used here.
        predictions = model.forward(X_batch)
        loss_value = model.loss(predictions, y_batch)
        grads = model.backward(X_batch, y_batch)
        optimizer.apply_gradients(zip(grads, model.variables))
        # Weight each batch loss by its size so the epoch figure is a per-sample mean.
        epoch_loss += loss_value.numpy() * (end - start)

    epoch_loss /= X_train.shape[0]

    # Validation metrics from one full-batch forward pass.
    val_logits = model.forward(X_val)
    val_loss = model.loss(val_logits, y_val).numpy()
    val_preds = val_logits.numpy().argmax(axis=1)
    true_val = np.argmax(y_val, axis=1)
    accuracy = (val_preds == true_val).mean()
    precision, recall = (
        metric(true_val, val_preds) for metric in (precision_score, recall_score)
    )

    print(f"Epoch {epoch+1:02d} | Training Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | "
          f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

# -------------------------------
# Final Evaluation on Test Set
# -------------------------------
print("\nEvaluating on test set...")

# Single full-batch forward pass over the held-out test split.
test_logits = model.forward(X_test)
test_loss = model.loss(test_logits, y_test).numpy()

# Collapse logits and one-hot targets to class indices before scoring.
test_preds = test_logits.numpy().argmax(axis=1)
true_test = np.argmax(y_test, axis=1)

test_accuracy = (test_preds == true_test).mean()
test_precision, test_recall = (
    metric(true_test, test_preds) for metric in (precision_score, recall_score)
)

print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.4f} | "
      f"Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}")

Loading IMDB dataset...
Train samples: 20000, Validation samples: 5000, Test samples: 25000
Tokenizer vocabulary size: 134

Starting training...

Epoch 01 | Training Loss: 0.6810 | Val Loss: 0.6650 | Accuracy: 0.6048 | Precision: 0.5828 | Recall: 0.6502
Epoch 02 | Training Loss: 0.6634 | Val Loss: 0.6642 | Accuracy: 0.6060 | Precision: 0.5821 | Recall: 0.6638
Epoch 03 | Training Loss: 0.6629 | Val Loss: 0.6626 | Accuracy: 0.6098 | Precision: 0.5844 | Recall: 0.6753
Epoch 04 | Training Loss: 0.6611 | Val Loss: 0.6635 | Accuracy: 0.6066 | Precision: 0.5799 | Recall: 0.6844
Epoch 05 | Training Loss: 0.6625 | Val Loss: 0.6633 | Accuracy: 0.6004 | Precision: 0.6024 | Recall: 0.5169
Epoch 06 | Training Loss: 0.6596 | Val Loss: 0.6615 | Accuracy: 0.6074 | Precision: 0.5905 | Recall: 0.6205
Epoch 07 | Training Loss: 0.6577 | Val Loss: 0.6608 | Accuracy: 0.6074 | Precision: 0.5866 | Recall: 0.6444
Epoch 08 | Training Loss: 0.6555 | Val Loss: 0.6624 | Accuracy: 0.6050 | Precision: 0.5765 | Recal