# 1. Data Preparation (20%)

### Load the Iris dataset using scikit-learn

In [1]:
import torch
import torch.nn as nn
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time

In [28]:
iris = torch.load("iris_subset_30_per_class.pt")
X = torch.stack([item[0] for item in iris])  # Shape: [1000, 1, 28, 28]
y = torch.tensor([item[1] for item in iris])

### Perform appropriate preprocessing (normalization, train/test split)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [31]:
print(type(Xtrain))
print(type(ytrain))
print(type(Xtest))
print(type(ytest))


<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [32]:
print(len(Xtrain))
print(len(Xtest))
print(len(ytrain))
print(len(ytest))

120
30
120
30


In [33]:
Xtrain[0:5,:]

tensor([[5.0000, 3.5000, 1.6000, 0.6000],
        [5.1000, 3.5000, 1.4000, 0.2000],
        [6.3000, 3.4000, 5.6000, 2.4000],
        [6.4000, 2.8000, 5.6000, 2.2000],
        [6.5000, 3.0000, 5.5000, 1.8000]])

In [34]:
Xtest[0:5, :]

tensor([[5.0000, 3.0000, 1.6000, 0.2000],
        [5.5000, 3.5000, 1.3000, 0.2000],
        [5.6000, 2.5000, 3.9000, 1.1000],
        [6.3000, 2.5000, 5.0000, 1.9000],
        [5.2000, 3.4000, 1.4000, 0.2000]])

In [35]:
scaler = MinMaxScaler()
scaler.fit(Xtrain)

In [36]:
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [37]:
Xtrain[0:5,:]

array([[0.1944444 , 0.62499998, 0.10169492, 0.20833334],
       [0.22222215, 0.62499998, 0.06779661, 0.04166667],
       [0.55555557, 0.58333335, 0.77966099, 0.95833337],
       [0.58333332, 0.3333333 , 0.77966099, 0.87500002],
       [0.61111107, 0.41666665, 0.76271185, 0.70833331]])

In [38]:
Xtest[0:5, :]

array([[0.1944444 , 0.41666665, 0.10169492, 0.04166667],
       [0.33333329, 0.62499998, 0.05084745, 0.04166667],
       [0.36111104, 0.20833333, 0.49152543, 0.41666668],
       [0.55555557, 0.20833333, 0.67796609, 0.74999999],
       [0.2499999 , 0.58333335, 0.06779661, 0.04166667]])

In [39]:
#taking the entire Xtrain for forward prop
inputs = Xtrain

# 2. Model Adaptation (30%)

### importing the libraries

In [40]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset
import time

In [41]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:

# Define the SKA model with 4 layers
class SKAModel(nn.Module):
    def __init__(self, input_size=4, layer_sizes=[8, 6, 3], K=50):
        super(SKAModel, self).__init__()
        self.input_size = input_size
        self.layer_sizes = layer_sizes
        self.K = K  # Number of forward steps

        # Initialize weights and biases as nn.ParameterList
        self.weights = nn.ParameterList()
        self.biases = nn.ParameterList()
        prev_size = input_size
        for size in layer_sizes:
            self.weights.append(nn.Parameter(torch.randn(prev_size, size) * 0.01))
            self.biases.append(nn.Parameter(torch.zeros(size)))
            prev_size = size

        # Tracking tensors for knowledge accumulation and entropy computation
        self.Z = [None] * len(layer_sizes)  # Knowledge tensors per layer
        self.D = [None] * len(layer_sizes)  # Decision probability tensors
        self.D_prev = [None] * len(layer_sizes)  # Previous decisions for computing shifts
        self.delta_D = [None] * len(layer_sizes)  # Decision shifts per step
        self.entropy = [None] * len(layer_sizes)  # Layer-wise entropy storage

        # Store entropy, cosine, and output distribution history for visualization
        self.entropy_history = [[] for _ in range(len(layer_sizes))]
        self.cosine_history = [[] for _ in range(len(layer_sizes))]
        self.output_history = []  # New: Store mean output distribution (10 classes) per step
    

    def forward(self, x):
        """Computes SKA forward pass, storing knowledge and decisions."""
        batch_size = x.shape[0]
        #x = x.view(batch_size, -1)  # Flatten images --> dont need to perform this, we already have a 1d tensor

        for l in range(len(self.layer_sizes)):
            # Compute knowledge tensor Z = Wx + b
            z = torch.mm(x, self.weights[l]) + self.biases[l]
            # Apply sigmoid activation to get decision probabilities
            d = torch.sigmoid(z)
            # Store values for entropy computation
            self.Z[l] = z
            self.D[l] = d
            x = d  # Output becomes input for the next layer

        return x
    

    def calculate_entropy(self):
        """Computes entropy reduction and cos(theta) per layer."""
        total_entropy = 0
        for l in range(len(self.layer_sizes)):
            if self.Z[l] is not None and self.D_prev[l] is not None and self.D[l] is not None:
                # Compute decision shifts
                self.delta_D[l] = self.D[l] - self.D_prev[l]
                # Entropy reduction using SKA formula
                dot_product = torch.sum(self.Z[l] * self.delta_D[l])
                layer_entropy = -1 / np.log(2) * dot_product
                self.entropy[l] = layer_entropy.item()
                self.entropy_history[l].append(layer_entropy.item())

                # Compute cos(theta) for alignment
                z_norm = torch.norm(self.Z[l])
                delta_d_norm = torch.norm(self.delta_D[l])
                if z_norm > 0 and delta_d_norm > 0:
                    cos_theta = dot_product / (z_norm * delta_d_norm)
                    self.cosine_history[l].append(cos_theta.item())
                else:
                    self.cosine_history[l].append(0.0)  # Default if norms are zero

                total_entropy += layer_entropy
        return total_entropy
    

    def ska_update(self, inputs, learning_rate=0.01):
        """Updates weights using entropy-based learning without backpropagation."""
        for l in range(len(self.layer_sizes)):
            if self.delta_D[l] is not None:
                # Previous layer's output
                prev_output = inputs.view(inputs.shape[0], -1) if l == 0 else self.D_prev[l-1]
                # Compute sigmoid derivative: D * (1 - D)
                d_prime = self.D[l] * (1 - self.D[l])
                # Compute entropy gradient
                gradient = -1 / np.log(2) * (self.Z[l] * d_prime + self.delta_D[l])
                # Compute weight updates via outer product
                dW = torch.matmul(prev_output.t(), gradient) / prev_output.shape[0]
                # Update weights and biases
                self.weights[l] = self.weights[l] - learning_rate * dW
                self.biases[l] = self.biases[l] - learning_rate * gradient.mean(dim=0)

    def initialize_tensors(self, batch_size):
        """Resets decision tensors at the start of each training iteration."""
        for l in range(len(self.layer_sizes)):
            self.Z[l] = None         # Reset knowledge tensors
            self.D[l] = None         # Reset current decision probabilities
            self.D_prev[l] = None    # Reset previous decision probabilities
            self.delta_D[l] = None   # Reset decision shifts
            self.entropy[l] = None   # Reset entropy storage
            self.entropy_history[l] = []  # Reset entropy history
            self.cosine_history[l] = []   # Reset cosine history
        self.output_history = []  # Reset output history
        

    def visualize_entropy_heatmap(self, step):
        """Dynamically scales the heatmap range and visualizes entropy reduction."""
        entropy_data = np.array(self.entropy_history)
        vmin = np.min(entropy_data)  # Dynamically set minimum entropy value
        vmax = 0.0  # Keep 0 as the upper limit for standardization
        plt.figure(figsize=(12, 8))
        sns.heatmap(entropy_data, cmap="Blues_r", vmin=vmin, vmax=vmax,  
                    xticklabels=range(1, entropy_data.shape[1] + 1),
                    yticklabels=[f"Layer {i+1}" for i in range(len(self.layer_sizes))])
        plt.title(f"Layer-wise Entropy Heatmap (Step {step})")
        plt.xlabel("Step Index K")
        plt.ylabel("Network Layers")
        plt.tight_layout()
        plt.savefig(f"output/entropy_heatmap_step_{step}.png")
        plt.show(block=False)  # Non-blocking
        plt.pause(2)  # Wait for 2 seconds
        plt.close()  # Close automatically

    def visualize_cosine_heatmap(self, step):
        """Visualizes cos(theta) alignment heatmap with a diverging scale."""
        cosine_data = np.array(self.cosine_history)
        plt.figure(figsize=(12, 8))
        sns.heatmap(cosine_data, cmap="coolwarm_r", vmin=-1.0, vmax=1.0,  
                    xticklabels=range(1, cosine_data.shape[1] + 1),
                    yticklabels=[f"Layer {i+1}" for i in range(len(self.layer_sizes))])
        plt.title(f"Layer-wise Cos(\u03B8) Alignment Heatmap (Step {step})")
        plt.xlabel("Step Index K")
        plt.ylabel("Network Layers")
        plt.tight_layout()
        plt.savefig(f"output/cosine_heatmap_step_{step}.png")
        plt.show(block=False)  # Non-blocking
        plt.pause(2)  # Wait for 2 seconds
        plt.close()  # Close automatically

    def visualize_output_distribution(self):
        """Plots the evolution of the 10-class output distribution over K steps."""
        output_data = np.array(self.output_history)  # Shape: [K, 10]
        plt.figure(figsize=(10, 6))
        plt.plot(output_data)  # Plot each class as a line
        plt.title('Output Decision Probability Evolution Across Steps (Single Pass)')
        plt.xlabel('Step Index K')
        plt.ylabel('Mean Sigmoid Output')
        plt.legend([f"Class {i}" for i in range(10)], loc='upper right', bbox_to_anchor=(1.15, 1))
        plt.grid(True)
        plt.tight_layout()
        plt.savefig("output/output_distribution_single_pass.png")
        plt.show(block=False)  # Non-blocking
        plt.pause(2)  # Wait for 2 seconds
        plt.close()  # Close automatically

In [None]:

# Define the SKA model with 4 layers
class SKAModel_iris(nn.Module):
    def __init__(self, input_size=4, layer_sizes=[8, 6, 3], K=50):
        super(SKAModel, self).__init__()
        self.input_size = input_size
        self.layer_sizes = layer_sizes
        self.K = K  # Number of forward steps

        # Initialize weights and biases as nn.ParameterList
        self.weights = nn.ParameterList()
        self.biases = nn.ParameterList()
        prev_size = input_size
        for size in layer_sizes:
            self.weights.append(nn.Parameter(torch.randn(prev_size, size) * 0.01))
            self.biases.append(nn.Parameter(torch.zeros(size)))
            prev_size = size

        # Tracking tensors for knowledge accumulation and entropy computation
        self.Z = [None] * len(layer_sizes)  # Knowledge tensors per layer
        self.D = [None] * len(layer_sizes)  # Decision probability tensors
        self.D_prev = [None] * len(layer_sizes)  # Previous decisions for computing shifts
        self.delta_D = [None] * len(layer_sizes)  # Decision shifts per step
        self.entropy = [None] * len(layer_sizes)  # Layer-wise entropy storage

        # Store entropy, cosine, and output distribution history for visualization
        self.entropy_history = [[] for _ in range(len(layer_sizes))]
        self.cosine_history = [[] for _ in range(len(layer_sizes))]
        self.output_history = []  # New: Store mean output distribution (10 classes) per step
    

    def forward(self, x):
        """Computes SKA forward pass, storing knowledge and decisions."""
        batch_size = x.shape[0]
        #x = x.view(batch_size, -1)  # Flatten images --> dont need to perform this, we already have a 1d tensor

        for l in range(len(self.layer_sizes)):
            # Compute knowledge tensor Z = Wx + b
            z = torch.mm(x, self.weights[l]) + self.biases[l]
            # Apply sigmoid activation to get decision probabilities
            d = torch.sigmoid(z)
            # Store values for entropy computation
            self.Z[l] = z
            self.D[l] = d
            x = d  # Output becomes input for the next layer

        return x
    

    def calculate_entropy(self):
        """Computes entropy reduction and cos(theta) per layer."""
        total_entropy = 0
        for l in range(len(self.layer_sizes)):
            if self.Z[l] is not None and self.D_prev[l] is not None and self.D[l] is not None:
                # Compute decision shifts
                self.delta_D[l] = self.D[l] - self.D_prev[l]
                # Entropy reduction using SKA formula
                dot_product = torch.sum(self.Z[l] * self.delta_D[l])
                layer_entropy = -1 / np.log(2) * dot_product
                self.entropy[l] = layer_entropy.item()
                self.entropy_history[l].append(layer_entropy.item())

                # Compute cos(theta) for alignment
                z_norm = torch.norm(self.Z[l])
                delta_d_norm = torch.norm(self.delta_D[l])
                if z_norm > 0 and delta_d_norm > 0:
                    cos_theta = dot_product / (z_norm * delta_d_norm)
                    self.cosine_history[l].append(cos_theta.item())
                else:
                    self.cosine_history[l].append(0.0)  # Default if norms are zero

                total_entropy += layer_entropy
        return total_entropy
    

    def ska_update(self, inputs, learning_rate=0.01):
        """Updates weights using entropy-based learning without backpropagation."""
        for l in range(len(self.layer_sizes)):
            if self.delta_D[l] is not None:
                # Previous layer's output
                prev_output = inputs.view(inputs.shape[0], -1) if l == 0 else self.D_prev[l-1]
                # Compute sigmoid derivative: D * (1 - D)
                d_prime = self.D[l] * (1 - self.D[l])
                # Compute entropy gradient
                gradient = -1 / np.log(2) * (self.Z[l] * d_prime + self.delta_D[l])
                # Compute weight updates via outer product
                dW = torch.matmul(prev_output.t(), gradient) / prev_output.shape[0]
                # Update weights and biases
                self.weights[l] = self.weights[l] - learning_rate * dW
                self.biases[l] = self.biases[l] - learning_rate * gradient.mean(dim=0)

    def initialize_tensors(self, batch_size):
        """Resets decision tensors at the start of each training iteration."""
        for l in range(len(self.layer_sizes)):
            self.Z[l] = None         # Reset knowledge tensors
            self.D[l] = None         # Reset current decision probabilities
            self.D_prev[l] = None    # Reset previous decision probabilities
            self.delta_D[l] = None   # Reset decision shifts
            self.entropy[l] = None   # Reset entropy storage
            self.entropy_history[l] = []  # Reset entropy history
            self.cosine_history[l] = []   # Reset cosine history
        self.output_history = []  # Reset output history
        

    def visualize_entropy_heatmap(self, step):
        """Dynamically scales the heatmap range and visualizes entropy reduction."""
        entropy_data = np.array(self.entropy_history)
        vmin = np.min(entropy_data)  # Dynamically set minimum entropy value
        vmax = 0.0  # Keep 0 as the upper limit for standardization
        plt.figure(figsize=(12, 8))
        sns.heatmap(entropy_data, cmap="Blues_r", vmin=vmin, vmax=vmax,  
                    xticklabels=range(1, entropy_data.shape[1] + 1),
                    yticklabels=[f"Layer {i+1}" for i in range(len(self.layer_sizes))])
        plt.title(f"Layer-wise Entropy Heatmap (Step {step})")
        plt.xlabel("Step Index K")
        plt.ylabel("Network Layers")
        plt.tight_layout()
        plt.savefig(f"output/entropy_heatmap_step_{step}.png")
        plt.show(block=False)  # Non-blocking
        plt.pause(2)  # Wait for 2 seconds
        plt.close()  # Close automatically

    def visualize_cosine_heatmap(self, step):
        """Visualizes cos(theta) alignment heatmap with a diverging scale."""
        cosine_data = np.array(self.cosine_history)
        plt.figure(figsize=(12, 8))
        sns.heatmap(cosine_data, cmap="coolwarm_r", vmin=-1.0, vmax=1.0,  
                    xticklabels=range(1, cosine_data.shape[1] + 1),
                    yticklabels=[f"Layer {i+1}" for i in range(len(self.layer_sizes))])
        plt.title(f"Layer-wise Cos(\u03B8) Alignment Heatmap (Step {step})")
        plt.xlabel("Step Index K")
        plt.ylabel("Network Layers")
        plt.tight_layout()
        plt.savefig(f"output/cosine_heatmap_step_{step}.png")
        plt.show(block=False)  # Non-blocking
        plt.pause(2)  # Wait for 2 seconds
        plt.close()  # Close automatically

    def visualize_output_distribution(self):
        """Plots the evolution of the 10-class output distribution over K steps."""
        output_data = np.array(self.output_history)  # Shape: [K, 10]
        plt.figure(figsize=(10, 6))
        plt.plot(output_data)  # Plot each class as a line
        plt.title('Output Decision Probability Evolution Across Steps (Single Pass)')
        plt.xlabel('Step Index K')
        plt.ylabel('Mean Sigmoid Output')
        plt.legend([f"Class {i}" for i in range(10)], loc='upper right', bbox_to_anchor=(1.15, 1))
        plt.grid(True)
        plt.tight_layout()
        plt.savefig("output/output_distribution_single_pass.png")
        plt.show(block=False)  # Non-blocking
        plt.pause(2)  # Wait for 2 seconds
        plt.close()  # Close automatically


    def evaluate(self, X, y_true):
        """Evaluate model using accuracy and confusion matrix."""
        outputs = self.forward(X).detach().cpu().numpy()
        y_pred = np.argmax(outputs, axis=1)

        accuracy = accuracy_score(y_true, y_pred)
        print(f"Accuracy: {accuracy:.4f}")

        cm = confusion_matrix(y_true, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap='Blues')
        plt.title("Confusion Matrix")
        plt.show()

    def visualize_knowledge_magnitude(self):
        """Visualizes the L2 norm of knowledge tensors over training steps."""
        plt.figure(figsize=(10, 6))
        for l in range(len(self.layer_sizes)):
            magnitudes = [torch.norm(z).item() for z in self.Z if z is not None]
            plt.plot(magnitudes, label=f"Layer {l + 1}")
        plt.title("Knowledge Tensor Magnitude Over Training")
        plt.xlabel("Training Steps")
        plt.ylabel("L2 Norm")
        plt.legend()
        plt.grid(True)
        plt.show()

    def visualize_weight_changes(self):
        """Plots the average weight changes over training steps."""
        plt.figure(figsize=(10, 6))
        for l in range(len(self.layer_sizes)):
            weight_norms = [torch.norm(w).item() for w in self.weights]
            plt.plot(weight_norms, label=f"Layer {l + 1}")
        plt.title("Weight Changes During Training")
        plt.xlabel("Training Steps")
        plt.ylabel("Weight Norm")
        plt.legend()
        plt.grid(True)
        plt.show()

    def visualize_decision_evolution(self):
        """Visualizes decision probability evolution across steps."""
        output_data = np.array(self.output_history)
        plt.figure(figsize=(10, 6))
        plt.plot(output_data)
        plt.title("Decision Probability Evolution")
        plt.xlabel("Step Index K")
        plt.ylabel("Mean Sigmoid Output")
        plt.grid(True)
        plt.show()




# 3) Model Tranining

### Train the adapted SKA model on the Iris dataset

In [43]:
#training parameter

model = SKAModel_iris()
learning_rate = 0.01

# SKA training over multiple forward steps
total_entropy = 0
step_count = 0
start_time = time.time()

In [44]:
# initilising the tensor for first step
model.initialize_tensors(inputs.size)

In [45]:
inputs

array([[0.1944444 , 0.62499998, 0.10169492, 0.20833334],
       [0.22222215, 0.62499998, 0.06779661, 0.04166667],
       [0.55555557, 0.58333335, 0.77966099, 0.95833337],
       [0.58333332, 0.3333333 , 0.77966099, 0.87500002],
       [0.61111107, 0.41666665, 0.76271185, 0.70833331],
       [0.08333326, 0.6666666 , 0.        , 0.04166667],
       [0.08333326, 0.45833328, 0.08474576, 0.04166667],
       [0.38888879, 1.        , 0.08474576, 0.125     ],
       [1.        , 0.74999995, 0.91525424, 0.79166667],
       [0.66666658, 0.20833333, 0.81355934, 0.70833331],
       [0.66666658, 0.45833328, 0.77966099, 0.95833337],
       [0.69444446, 0.5       , 0.83050848, 0.91666665],
       [0.33333329, 0.1666667 , 0.47457626, 0.41666668],
       [0.22222215, 0.74999995, 0.15254237, 0.125     ],
       [0.58333332, 0.3333333 , 0.77966099, 0.83333329],
       [0.11111101, 0.5       , 0.05084745, 0.04166667],
       [0.72222221, 0.45833328, 0.74576272, 0.83333329],
       [0.47222218, 0.08333335,

In [46]:
inputs.shape

(120, 4)

In [47]:
inputs = np.expand_dims(inputs, axis=-1)

In [48]:
inputs.shape

(120, 4, 1)

In [49]:
inputs = torch.tensor(inputs, dtype=torch.float32)

In [50]:
# Process K forward steps (without backpropagation)
for k in range(model.K):
    outputs = model.forward(inputs)
    # Store mean output distribution for the final layer
    model.output_history.append(outputs.mean(dim=0).detach().cpu().numpy())  # [10] vector
    if k > 0:  # Compute entropy after first step
        batch_entropy = model.calculate_entropy()
        model.ska_update(inputs, learning_rate)
        total_entropy += batch_entropy
        step_count += 1
        print(f'Step: {k}, Total Steps: {step_count}, Entropy: {batch_entropy:.4f}')
        model.visualize_entropy_heatmap(step_count)
        model.visualize_cosine_heatmap(step_count)  # Add cosine heatmap
    # Update previous decision tensors
    model.D_prev = [d.clone().detach() if d is not None else None for d in model.D]

RuntimeError: self must be a matrix

# 4. Analysis and Comparison (20%)


### implementing traditional model


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# # Load and preprocess the Iris dataset
# iris = load_iris()
# X, y = iris.data, iris.target
# scaler = StandardScaler()
# X = scaler.fit_transform(X)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(Xtrain, dtype=torch.float32)
y_train_tensor = torch.tensor(ytrain, dtype=torch.long)
X_test_tensor = torch.tensor(Xtest, dtype=torch.float32)
y_test_tensor = torch.tensor(ytest, dtype=torch.long)

# Define a traditional feedforward neural network
class TraditionalNN(nn.Module):
    def __init__(self, input_size=4, hidden_sizes=[16, 8, 6], output_size=3):
        super(TraditionalNN, self).__init__()
        self.layers = nn.ModuleList()
        prev_size = input_size
        for size in hidden_sizes:
            self.layers.append(nn.Linear(prev_size, size))
            prev_size = size
        self.output_layer = nn.Linear(prev_size, output_size)

    def forward(self, x):
        for layer in self.layers:
            x = torch.relu(layer(x))
        return self.output_layer(x)

# Initialize and train the traditional neural network
def train_nn(model, X_train, y_train, epochs=100, learning_rate=0.01):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            _, preds = torch.max(outputs, 1)
            acc = accuracy_score(y_train, preds.detach().numpy())
            print(f"Epoch {epoch}: Loss = {loss.item():.4f}, Accuracy = {acc:.4f}")

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        _, preds = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test, preds.numpy())
        print(f"Test Accuracy: {accuracy:.4f}")

        cm = confusion_matrix(y_test, preds.numpy())
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap='Blues')
        plt.title("Confusion Matrix")
        plt.show()

# Compare SKA with traditional neural network
traditional_nn = TraditionalNN()
print("Training Traditional Neural Network...")
train_nn(traditional_nn, X_train_tensor, y_train_tensor)
evaluate_model(traditional_nn, X_test_tensor, y_test_tensor)

# Insights on SKA vs. Traditional Neural Network
# 1. Forward Steps (K) and Convergence
# SKA models may require more iterations (higher K) for knowledge accumulation, while traditional NNs converge via gradient-based updates.

# 2. Entropy Reduction and Accuracy
# Higher entropy reduction correlates with better classification accuracy as the model gains more discriminative knowledge.

# 3. Advantages of SKA:
# - Gradient-free learning: Avoids issues with vanishing gradients.
# - Interpretability: Entropy and decision tracking offer better insights.
#
# Limitations of SKA:
# - Computational cost: Requires multiple forward steps.
# - Slower convergence compared to backpropagation.

print("Comparison completed. Analyze visual outputs and performance metrics.")

120