In [1]:

import numpy as np
import time
from sega_learn.neural_networks import *
from sega_learn.neural_networks.numba_utils import (
    relu as relu_nb, 
    relu_derivative as relu_derivative_nb,
    leaky_relu as leaky_relu_nb,
    leaky_relu_derivative as leaky_relu_derivative_nb,
    tanh as tanh_nb,
    tanh_derivative as tanh_derivative_nb,
    sigmoid as sigmoid_nb,
    sigmoid_derivative as sigmoid_derivative_nb,
    softmax as softmax_nb,
)

from sega_learn.neural_networks.numba_utils import sum_axis0, sum_reduce
from sega_learn.neural_networks.numba_utils import apply_dropout_jit
from sega_learn.neural_networks.numba_utils import compute_l2_reg

In [2]:
def compare_outputs(func1, func2, *args, tolerance=1e-7):
    """Call two implementations with the same arguments and check that their outputs agree.

    Parameters
    ----------
    func1, func2 : callable
        The implementations to compare. Each one is invoked as ``func(*args)``.
    *args
        Positional arguments forwarded unchanged to both callables.
    tolerance : float, keyword-only, default 1e-7
        Absolute tolerance handed to ``np.allclose`` as ``atol`` (``rtol`` keeps
        NumPy's default).

    Returns
    -------
    bool
        ``True`` when the outputs are within tolerance, ``False`` otherwise.
        A message is printed only when the outputs disagree.
    """
    output1 = func1(*args)
    output2 = func2(*args)

    matches = bool(np.allclose(output1, output2, atol=tolerance))
    if not matches:
        # Not every callable carries __name__ (e.g. functools.partial objects),
        # so fall back to repr() instead of raising while reporting a mismatch.
        name1 = getattr(func1, "__name__", repr(func1))
        name2 = getattr(func2, "__name__", repr(func2))
        print(f"\n{name1} and {name2} outputs are not within tolerance of {tolerance}.")
    return matches

#### Compare Activation and Activation Derivatives

In [3]:
# Shared random input used for every activation comparison below
z = np.random.randn(1000, 2)

# (reference implementation, Numba implementation) pairs, compared in this order
activation_pairs = [
    # ReLU and ReLU Derivative
    (Activation.relu, relu_nb),
    (Activation.relu_derivative, relu_derivative_nb),
    # Leaky ReLU and Leaky ReLU Derivative
    (Activation.leaky_relu, leaky_relu_nb),
    (Activation.leaky_relu_derivative, leaky_relu_derivative_nb),
    # Tanh and Tanh Derivative
    (Activation.tanh, tanh_nb),
    (Activation.tanh_derivative, tanh_derivative_nb),
    # Sigmoid and Sigmoid Derivative
    (Activation.sigmoid, sigmoid_nb),
    (Activation.sigmoid_derivative, sigmoid_derivative_nb),
    # Softmax
    (Activation.softmax, softmax_nb),
]

# compare_outputs prints a message only when a pair disagrees
for reference_fn, numba_fn in activation_pairs:
    compare_outputs(reference_fn, numba_fn, z)

#### Compare JIT utils

In [4]:
# Generate random data
X = np.random.randn(1000, 1000)

# Result for np.sum; keepdims=True keeps a leading axis, so the shape is (1, 1000)
np_sum_result = np.sum(X, axis=0, keepdims=True)

# Result for sum_axis0 (the helper being checked against np.sum)
sum_axis0_result = sum_axis0(X)

# Verify that results are the same
tolerance = 1e-6
if np.allclose(np_sum_result, sum_axis0_result, atol=tolerance):
    print("Results match!")
else:
    print("Results do not match!")
    # Work on flattened results so the reported index is the column position.
    # Indexing np.where(...)[0] on the (1, 1000) difference would return the row
    # index, which is always 0 and therefore points at the wrong element.
    np_sum_flat = np.ravel(np_sum_result)
    sum_axis0_flat = np.ravel(sum_axis0_result)
    # np.isclose uses the same closeness rule as np.allclose, so a failed allclose
    # is guaranteed to leave at least one mismatching position here.
    diff_index = np.where(~np.isclose(np_sum_flat, sum_axis0_flat, atol=tolerance))[0][0]
    print(f"Difference found at index {diff_index}: {np_sum_flat[diff_index]} vs {sum_axis0_flat[diff_index]}")

Results match!


#### Compare Loss Functions

In [5]:
from sega_learn.neural_networks.loss import CrossEntropyLoss, BCEWithLogitsLoss
from sega_learn.neural_networks.loss_jit import JITCrossEntropyLoss, JITBCEWithLogitsLoss

In [6]:
# Compare Cross Entropy Loss (base implementation vs. JIT implementation on identical inputs)

# Generate some dummy data for multi-class classification
n_samples, n_classes = 5, 3

# Random logits: one row per sample, one column per class
logits_ce = np.random.randn(n_samples, n_classes)

# Generate integer targets and convert to one-hot by selecting rows of the identity matrix
targets_int = np.random.randint(0, n_classes, size=n_samples)
targets_onehot = np.eye(n_classes)[targets_int]

# Instantiate loss function objects
base_ce_loss = CrossEntropyLoss()
jit_ce_loss = JITCrossEntropyLoss()

# Calculate losses (the base object is called directly; the JIT object is used via calculate_loss)
loss_base_ce = base_ce_loss(logits_ce, targets_onehot)
loss_jit_ce = jit_ce_loss.calculate_loss(logits_ce, targets_onehot)

tolerance = 1e-7
print("Cross Entropy Loss Comparison:")
print("-"*75)
if np.allclose(loss_base_ce, loss_jit_ce, atol=tolerance):
    print("Losses are equal to within tolerance of", tolerance)
else:
    # State a mismatch explicitly instead of signalling it only by a missing line
    print("Losses are NOT equal to within tolerance of", tolerance)
print("Base Loss     :", loss_base_ce)
print("JIT Loss      :", loss_jit_ce)
print("Difference    :", abs(loss_base_ce - loss_jit_ce))


Cross Entropy Loss Comparison:
---------------------------------------------------------------------------
Losses are equal to within tolerance of 1e-07
Base Loss     : 1.2011396407224744
JIT Loss      : 1.201139640722479
Difference    : 4.6629367034256575e-15


In [7]:
# Compare Binary Cross Entropy Loss (base implementation vs. JIT implementation on identical inputs)

# Generate some dummy data for binary classification
n_samples_bce = 10
logits_bce = np.random.randn(n_samples_bce)

# Generate binary targets (0 or 1)
targets_bce = np.random.randint(0, 2, size=n_samples_bce)

# Instantiate loss function objects
base_bce_loss = BCEWithLogitsLoss()
jit_bce_loss = JITBCEWithLogitsLoss()

# Calculate losses (the base object is called directly; the JIT object is used via calculate_loss)
loss_base_bce = base_bce_loss(logits_bce, targets_bce)
loss_jit_bce = jit_bce_loss.calculate_loss(logits_bce, targets_bce)


tolerance = 1e-7
print("\nBCE With Logits Loss Comparison:")
print("-"*50)
if np.allclose(loss_base_bce, loss_jit_bce, atol=tolerance):
    print("Losses are equal to within tolerance of", tolerance)
else:
    # State a mismatch explicitly instead of signalling it only by a missing line
    print("Losses are NOT equal to within tolerance of", tolerance)

print("Base Loss     :", loss_base_bce)
print("JIT Loss      :", loss_jit_bce)
print("Difference    :", abs(loss_base_bce - loss_jit_bce))



BCE With Logits Loss Comparison:
--------------------------------------------------
Losses are equal to within tolerance of 1e-07
Base Loss     : 0.6638989523377259
JIT Loss      : 0.6638989523377259
Difference    : 0.0


#### Compare Dropout

In [8]:
# Compare apply_dropout to apply_dropout_jit

# Network configuration; in this cell the network is only used for its apply_dropout method
dropout_rate = 0.5
reg_lambda = 0
layer_sizes = [10, 5, 3, 2]
activations = ['relu', 'relu', 'softmax']
nn = BaseBackendNeuralNetwork(layer_sizes, dropout_rate, reg_lambda, activations)

# Both implementations are random, so compare the number of surviving (non-zero)
# entries over many trials rather than comparing element by element
n_trials = 10_000
counts_base, counts_jit = [], []
for i in range(n_trials):
    # Fresh random input for this trial, fed to both implementations
    X = np.random.randn(1000, 5)
    X_dropout = nn.apply_dropout(X)
    X_dropout_jit = apply_dropout_jit(X, dropout_rate)

    # Tally the non-zero entries left by each implementation
    count_base = np.count_nonzero(X_dropout)
    count_jit = np.count_nonzero(X_dropout_jit)
    counts_base += [count_base]
    counts_jit += [count_jit]

# Mean and standard deviation of the per-trial counts
avg_count_base, std_dev_base = np.mean(counts_base), np.std(counts_base)
avg_count_jit, std_dev_jit = np.mean(counts_jit), np.std(counts_jit)

print(f"\nApply Dropout Comparison: {n_trials:,} trials")
print(50 * "-")
summary_rows = (
    ("Base Count :", avg_count_base, std_dev_base),
    ("JIT Count  :", avg_count_jit, std_dev_jit),
)
for label, avg_count, std_dev in summary_rows:
    print(f"{label} {avg_count:.2f} ± {std_dev:.2f}")



Apply Dropout Comparison: 10,000 trials
--------------------------------------------------
Base Count : 2499.86 ± 35.43
JIT Count  : 2500.28 ± 35.30


#### Compare L2 Regularization

In [9]:
# Ten random 5x5 weight matrices to regularize; the biases are generated as well
# but are not used by the comparison below
weights = [np.random.randn(5, 5) for _ in range(10)]
biases = [np.random.randn(5) for _ in range(10)]

# Network whose compute_l2_reg method serves as the base implementation
activations = ['relu'] * 10
layer_sizes = [5 for _ in range(10)]
dropout_rate = 0.5
reg_lambda = 0
nn = BaseBackendNeuralNetwork(layer_sizes, dropout_rate, reg_lambda, activations)

# Same weight list goes through the network method and the standalone JIT helper
l2_base = nn.compute_l2_reg(weights)
l2_jit = compute_l2_reg(weights)

print("\nL2 Regularization Comparison:")
print(35 * "-")
for label, l2_value in (("Base L2 :", l2_base), ("JIT L2  :", l2_jit)):
    print(f"{label} {l2_value:.2f}")



L2 Regularization Comparison:
-----------------------------------
Base L2 : 236.52
JIT L2  : 236.52


#### Compare Optimizers

In [11]:
from sega_learn.neural_networks.optimizers import AdamOptimizer, SGDOptimizer, AdadeltaOptimizer
from sega_learn.neural_networks.optimizers_jit import JITAdamOptimizer, JITSGDOptimizer, JITAdadeltaOptimizer

In [12]:
# Number of layers built for each optimizer comparison in the cells that follow
num_layers = 5
# Seed NumPy's global RNG so the np.random draws made after this point are repeatable
np.random.seed(42)

In [13]:
# Compare Adam Optimizer (base implementation vs. JIT implementation)
lr = 0.01
beta1 = 0.5
beta2 = 0.9
epsilon = 1e-5
reg_lambda = 0.01
activations = ['relu' for _ in range(num_layers)]

# Build one base layer and one JIT layer per position, with identical constructor arguments
base_layers = []
jit_layers = []
for i in range(num_layers):
    base_layers.append(Layer(3,3,activations[i]))
    jit_layers.append(JITLayer(3,3,activations[i]))

# Initialize optimizer objects
adam_base = AdamOptimizer(lr, beta1, beta2, epsilon, reg_lambda)
adam_jit = JITAdamOptimizer(lr, beta1, beta2, epsilon, reg_lambda)

adam_base.initialize(base_layers)
adam_jit.initialize(jit_layers)

# Assert that the optimizer states are the same (m, v, t).
# Each check covers the whole state at once, so no per-layer loop is needed.
assert np.allclose(adam_base.m, adam_jit.m)
assert np.allclose(adam_base.v, adam_jit.v)
assert adam_base.t == adam_jit.t

# Set layer weights to the same values (initialized randomly).
# Each JIT layer gets its own copy: assigning the array itself would make both layers
# share one buffer, and an in-place update by either optimizer would then leak into
# the other layer and let the comparison below pass vacuously.
# NOTE(review): biases are not synchronised here; presumably both layer types start
# with identical biases - confirm against the Layer/JITLayer constructors.
for i in range(num_layers):
    jit_layers[i].weights = base_layers[i].weights.copy()

# Update optimizer: the base optimizer is stepped layer by layer, while the JIT
# optimizer receives all layers and gradients in a single call
dW = [np.random.randn(3,3) for _ in range(num_layers)]
db = [np.random.randn(3) for _ in range(num_layers)]
for i in range(num_layers):
    adam_base.update(base_layers[i], dW[i], db[i], i)
adam_jit.update_layers(jit_layers, dW, db)

tolerance = 1e-7

# Optimizer states must still agree after the update step
assert np.allclose(adam_base.m, adam_jit.m)
assert np.allclose(adam_base.v, adam_jit.v)
assert adam_base.t == adam_jit.t

for i in range(num_layers):
    base_weights, jit_weights = base_layers[i].weights, jit_layers[i].weights
    weights_close = np.allclose(base_weights, jit_weights, atol=tolerance)
    if not weights_close:
        print(f"\nLayer {i} weights are not close.")
        # Full index of the first mismatching element, found with the same closeness
        # rule as np.allclose (taking only np.where(...)[0] would yield a row index)
        mismatch = np.where(~np.isclose(base_weights, jit_weights, atol=tolerance))
        diff_index = tuple(int(axis_hits[0]) for axis_hits in mismatch)
        print(f"Difference found at index {diff_index}: \n\t{base_weights[diff_index]} vs \n\t{jit_weights[diff_index]}")

    base_biases, jit_biases = base_layers[i].biases, jit_layers[i].biases
    biases_close = np.allclose(base_biases, jit_biases, atol=tolerance)
    if not biases_close:
        print(f"\nLayer {i} biases are not close.")
        # Full index of the first mismatching element
        mismatch = np.where(~np.isclose(base_biases, jit_biases, atol=tolerance))
        diff_index = tuple(int(axis_hits[0]) for axis_hits in mismatch)
        print(f"Difference found at index {diff_index}: \n\t{base_biases[diff_index]} vs \n\t{jit_biases[diff_index]}")

    assert weights_close
    assert biases_close

    print(f"Layer {i} weights and biases are within tolerance: {tolerance}")


Layer 0 weights and biases are within tolerance: 1e-07
Layer 1 weights and biases are within tolerance: 1e-07
Layer 2 weights and biases are within tolerance: 1e-07
Layer 3 weights and biases are within tolerance: 1e-07
Layer 4 weights and biases are within tolerance: 1e-07


In [14]:
# Compare SGD Optimizer (base implementation vs. JIT implementation)
lr = 0.01
momentum = 0.9
reg_lambda = 0.01
activations = ['relu' for _ in range(num_layers)]

# Build one base layer and one JIT layer per position, with identical constructor arguments
base_layers = []
jit_layers = []
for i in range(num_layers):
    base_layers.append(Layer(3, 3, activations[i]))
    jit_layers.append(JITLayer(3, 3, activations[i]))

# Initialize optimizer objects
sgd_base = SGDOptimizer(lr, momentum, reg_lambda)
sgd_jit = JITSGDOptimizer(lr, momentum, reg_lambda)

sgd_base.initialize(base_layers)
sgd_jit.initialize(jit_layers)

# Assert that the optimizer states are the same (velocity).
# The check covers the whole state at once, so no per-layer loop is needed.
assert np.allclose(sgd_base.velocity, sgd_jit.velocity)

# Set layer weights to the same values (initialized randomly).
# Each JIT layer gets its own copy: assigning the array itself would make both layers
# share one buffer, and an in-place update by either optimizer would then leak into
# the other layer and let the comparison below pass vacuously.
# NOTE(review): biases are not synchronised here; presumably both layer types start
# with identical biases - confirm against the Layer/JITLayer constructors.
for i in range(num_layers):
    jit_layers[i].weights = base_layers[i].weights.copy()

# Update optimizer: the base optimizer is stepped layer by layer, while the JIT
# optimizer receives all layers and gradients in a single call
dW = [np.random.randn(3, 3) for _ in range(num_layers)]
db = [np.random.randn(3) for _ in range(num_layers)]
for i in range(num_layers):
    sgd_base.update(base_layers[i], dW[i], db[i], i)
sgd_jit.update_layers(jit_layers, dW, db)

tolerance = 1e-7

# Optimizer state must still agree after the update step
assert np.allclose(sgd_base.velocity, sgd_jit.velocity)

for i in range(num_layers):
    base_weights, jit_weights = base_layers[i].weights, jit_layers[i].weights
    weights_close = np.allclose(base_weights, jit_weights, atol=tolerance)
    if not weights_close:
        print(f"\nLayer {i} weights are not close.")
        # Full index of the first mismatching element, found with the same closeness
        # rule as np.allclose (taking only np.where(...)[0] would yield a row index)
        mismatch = np.where(~np.isclose(base_weights, jit_weights, atol=tolerance))
        diff_index = tuple(int(axis_hits[0]) for axis_hits in mismatch)
        print(f"Difference found at index {diff_index}: \n\t{base_weights[diff_index]} vs \n\t{jit_weights[diff_index]}")

    base_biases, jit_biases = base_layers[i].biases, jit_layers[i].biases
    biases_close = np.allclose(base_biases, jit_biases, atol=tolerance)
    if not biases_close:
        print(f"\nLayer {i} biases are not close.")
        # Full index of the first mismatching element
        mismatch = np.where(~np.isclose(base_biases, jit_biases, atol=tolerance))
        diff_index = tuple(int(axis_hits[0]) for axis_hits in mismatch)
        print(f"Difference found at index {diff_index}: \n\t{base_biases[diff_index]} vs \n\t{jit_biases[diff_index]}")

    assert weights_close
    assert biases_close

    print(f"Layer {i} weights and biases are within tolerance: {tolerance}")

Layer 0 weights and biases are within tolerance: 1e-07
Layer 1 weights and biases are within tolerance: 1e-07
Layer 2 weights and biases are within tolerance: 1e-07
Layer 3 weights and biases are within tolerance: 1e-07
Layer 4 weights and biases are within tolerance: 1e-07


In [15]:
# Compare Adadelta Optimizer (base implementation vs. JIT implementation)
lr = 1.0
rho = 0.95
epsilon = 1e-6
reg_lambda = 0.01
activations = ['relu' for _ in range(num_layers)]

# Build one base layer and one JIT layer per position, with identical constructor arguments
base_layers = []
jit_layers = []
for i in range(num_layers):
    base_layers.append(Layer(3, 3, activations[i]))
    jit_layers.append(JITLayer(3, 3, activations[i]))

# Initialize optimizer objects
adadelta_base = AdadeltaOptimizer(lr, rho, epsilon, reg_lambda)
adadelta_jit = JITAdadeltaOptimizer(lr, rho, epsilon, reg_lambda)

adadelta_base.initialize(base_layers)
adadelta_jit.initialize(jit_layers)

# Assert that the optimizer states are the same (E_g2, E_delta_x2).
# Each check covers the whole state at once, so no per-layer loop is needed.
assert np.allclose(adadelta_base.E_g2, adadelta_jit.E_g2)
assert np.allclose(adadelta_base.E_delta_x2, adadelta_jit.E_delta_x2)

# Set layer weights to the same values (initialized randomly).
# Each JIT layer gets its own copy: assigning the array itself would make both layers
# share one buffer, and an in-place update by either optimizer would then leak into
# the other layer and let the comparison below pass vacuously.
# NOTE(review): biases are not synchronised here; presumably both layer types start
# with identical biases - confirm against the Layer/JITLayer constructors.
for i in range(num_layers):
    jit_layers[i].weights = base_layers[i].weights.copy()

# Update optimizer: the base optimizer is stepped layer by layer, while the JIT
# optimizer receives all layers and gradients in a single call
dW = [np.random.randn(3, 3) for _ in range(num_layers)]
db = [np.random.randn(3) for _ in range(num_layers)]
for i in range(num_layers):
    adadelta_base.update(base_layers[i], dW[i], db[i], i)
adadelta_jit.update_layers(jit_layers, dW, db)

tolerance = 1e-7

# Optimizer states must still agree after the update step
assert np.allclose(adadelta_base.E_g2, adadelta_jit.E_g2)
assert np.allclose(adadelta_base.E_delta_x2, adadelta_jit.E_delta_x2)

for i in range(num_layers):
    base_weights, jit_weights = base_layers[i].weights, jit_layers[i].weights
    weights_close = np.allclose(base_weights, jit_weights, atol=tolerance)
    if not weights_close:
        print(f"\nLayer {i} weights are not close.")
        # Full index of the first mismatching element, found with the same closeness
        # rule as np.allclose (taking only np.where(...)[0] would yield a row index)
        mismatch = np.where(~np.isclose(base_weights, jit_weights, atol=tolerance))
        diff_index = tuple(int(axis_hits[0]) for axis_hits in mismatch)
        print(f"Difference found at index {diff_index}: \n\t{base_weights[diff_index]} vs \n\t{jit_weights[diff_index]}")

    base_biases, jit_biases = base_layers[i].biases, jit_layers[i].biases
    biases_close = np.allclose(base_biases, jit_biases, atol=tolerance)
    if not biases_close:
        print(f"\nLayer {i} biases are not close.")
        # Full index of the first mismatching element
        mismatch = np.where(~np.isclose(base_biases, jit_biases, atol=tolerance))
        diff_index = tuple(int(axis_hits[0]) for axis_hits in mismatch)
        print(f"Difference found at index {diff_index}: \n\t{base_biases[diff_index]} vs \n\t{jit_biases[diff_index]}")

    assert weights_close
    assert biases_close

    print(f"Layer {i} weights and biases are within tolerance: {tolerance}")

Layer 0 weights and biases are within tolerance: 1e-07
Layer 1 weights and biases are within tolerance: 1e-07
Layer 2 weights and biases are within tolerance: 1e-07
Layer 3 weights and biases are within tolerance: 1e-07
Layer 4 weights and biases are within tolerance: 1e-07
