In [1]:
module("list")

Currently Loaded Modulefiles:
 1) rvs-cloud/1.0   2) anaconda/3/2023.03  


In [2]:
module("load","pytorch/gpu-cuda-11.6/2.1.0")

Loading pytorch/gpu-cuda-11.6/2.1.0
  Loading requirement: cuda/11.6 cudnn/8.9.2


In [3]:
module("load","tensorflow/gpu-cuda-11.6/2.13.0")

Loading tensorflow/gpu-cuda-11.6/2.13.0
  Loading requirement: protobuf/4.24.0 nccl/2.18.3 tensorrt/8.6.1


In [6]:
module("load","keras/2.13.1")

In [9]:
module("list")

Currently Loaded Modulefiles:
 1) rvs-cloud/1.0                 6) protobuf/4.24.0 <aL>             
 2) anaconda/3/2023.03            7) nccl/2.18.3 <aL>                 
 3) cuda/11.6 <aL>                8) tensorrt/8.6.1 <aL>              
 4) cudnn/8.9.2 <aL>              9) tensorflow/gpu-cuda-11.6/2.13.0  
 5) pytorch/gpu-cuda-11.6/2.1.0  10) keras/2.13.1                     

Key:
<module-tag>  <aL>=auto-loaded  


In [2]:
%load_ext autoreload  #Reloads updated modules

In [3]:
%autoreload 2

In [143]:
import tensorflow as tf
import torch
import numpy as np

# Import TensorFlow classes
#from layers import StochasticDepth as TFStochasticDepth, RandomDrop as TFRandomDrop

# Import PyTorch classes
#from layers_pytorch import StochasticDepth as PTStochasticDepth, RandomDrop as PTRandomDrop

from layers import StochasticDepth as TFStochasticDepth, RandomDrop as TFRandomDrop, SimpleHeadAttention as TFSimpleHeadAttention, TalkingHeadAttention as TFTalkingHeadAttention, LayerScale as TFLayerScale
from layers_pytorch import StochasticDepth as PTStochasticDepth, RandomDrop as PTRandomDrop, SimpleHeadAttention as PTSimpleHeadAttention, TalkingHeadAttention as PTTalkingHeadAttention, LayerScale as PTLayerScale


In [106]:
# Set random seeds for reproducibility
tf.random.set_seed(42)
torch.manual_seed(42)
np.random.seed(42)

# Create a random input tensor
batch_size, seq_len, num_features = 32, 10, 64
np_input = np.random.randn(batch_size, seq_len, num_features).astype(np.float32)

# Create TensorFlow tensor
tf_input = tf.convert_to_tensor(np_input)

# Create PyTorch tensor
pt_input = torch.from_numpy(np_input)

# Modify StochasticDepth classes to accept the random tensor
class TFStochasticDepthModified(TFStochasticDepth):
    def call(self, x, random_tensor, training=False):
        if training:
            keep_prob = 1 - self.drop_prob
            random_tensor = keep_prob + random_tensor
            random_tensor = tf.floor(random_tensor)
            x = x * random_tensor
            return x 
        return x

class PTStochasticDepthModified(PTStochasticDepth):
    def forward(self, x, random_tensor, training=False):
        if training:
            keep_prob = 1 - self.drop_prob
            random_tensor = keep_prob + random_tensor
            random_tensor = torch.floor(random_tensor)
            x = x * random_tensor
            return x
        return x


# Parameters
drop_prob = 0.1
num_skip = 5

# TensorFlow layers
tf_stochastic_depth = TFStochasticDepth(drop_prob)
tf_random_drop = TFRandomDrop(drop_prob, num_skip)

# PyTorch layers
pt_stochastic_depth = PTStochasticDepth(drop_prob)
pt_random_drop = PTRandomDrop(drop_prob, num_skip)

# Modified StochasticDepth
tf_stochastic_depth_m = TFStochasticDepthModified(drop_prob)
pt_stochastic_depth_m = PTStochasticDepthModified(drop_prob)

# TensorFlow forward pass
tf_stochastic_output = tf_stochastic_depth(tf_input, training=True)
tf_random_drop_output = tf_random_drop(tf_input, training=True)

# PyTorch forward pass
pt_stochastic_output = pt_stochastic_depth(pt_input, training=True)
pt_random_drop_output = pt_random_drop(pt_input, training=True)

# Forward pass of modified StochasticDepth
np_random = np.random.rand(batch_size, 1, 1).astype(np.float32)
tf_random = tf.convert_to_tensor(np_random)
pt_random = torch.from_numpy(np_random)

tf_output = tf_stochastic_depth_m(tf_input, tf_random, training=True)
pt_output = pt_stochastic_depth_m(pt_input, pt_random, training=True)


In [108]:
# Convert outputs to numpy for comparison
tf_output_np = tf_output.numpy()
pt_output_np = pt_output.detach().numpy()

# Compare outputs
print("Comparing StochasticDepth with common random number:")
print(f"  Shape: TF {tf_output_np.shape}, PT {pt_output_np.shape}")
print(f"  Mean: TF {tf_output_np.mean():.6f}, PT {pt_output_np.mean():.6f}")
print(f"  Std: TF {tf_output_np.std():.6f}, PT {pt_output_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_output_np - pt_output_np).max():.6f}")

Comparing StochasticDepth with common random number:
  Shape: TF (32, 10, 64), PT (32, 10, 64)
  Mean: TF -0.000701, PT -0.000701
  Std: TF 0.902680, PT 0.902680
  Max Abs Diff: 0.000000


In [99]:
# Convert outputs to numpy for comparison
tf_stochastic_np = tf_stochastic_output.numpy()
tf_random_drop_np = tf_random_drop_output.numpy()
pt_stochastic_np = pt_stochastic_output.detach().numpy()
pt_random_drop_np = pt_random_drop_output.detach().numpy()

# Compare outputs when rnadom number is not common for both functions

print(f"Comparing StochasticDepth:")
print(f"  Shape: TF {tf_stochastic_np.shape}, PT {pt_stochastic_np.shape}")
print(f"  Mean: TF {tf_stochastic_np.mean():.6f}, PT {pt_stochastic_np.mean():.6f}")
print(f"  Std: TF {tf_stochastic_np.std():.6f}, PT {pt_stochastic_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_stochastic_np - pt_stochastic_np).max():.6f}")
print()

Comparing StochasticDepth:
  Shape: TF (32, 10, 64), PT (32, 10, 64)
  Mean: TF 0.005732, PT 0.007001
  Std: TF 0.972660, PT 0.969924
  Max Abs Diff: 3.942331



In [134]:
batch_size, seq_len, num_features = 32, 10, 64
np_input = np.random.randn(batch_size, seq_len, num_features).astype(np.float32)

# Create TensorFlow and PyTorch input tensors
pt_input = torch.from_numpy(np_input)

np_random = np.random.rand(batch_size, 1,1).astype(np.float32)
pt_random = torch.from_numpy(np_random)

In [135]:
pt_random.unsqueeze(-1).shape

torch.Size([32, 1, 1, 1])

In [136]:
pt_input.shape

torch.Size([32, 10, 64])

In [141]:
shape = (pt_input.size(0), 1)
pt_random = torch.reshape(pt_random, shape)
print(pt_random.shape, pt_input.shape)
pt_input[:,:,5:] = pt_input[:,:,5:]*pt_random.unsqueeze(-1)
print(pt_random.unsqueeze(-1).shape)
print(final.shape)

torch.Size([32, 1]) torch.Size([32, 10, 64])
torch.Size([32, 1, 1])
torch.Size([32, 10, 64])


In [142]:
import tensorflow as tf
import torch
import numpy as np

from layers import RandomDrop as TFRandomDrop
from layers_pytorch import RandomDrop as PTRandomDrop

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)

# Create a random input tensor
batch_size, seq_len, num_features = 32, 10, 64
np_input = np.random.randn(batch_size, seq_len, num_features).astype(np.float32)

# Create TensorFlow and PyTorch input tensors
tf_input = tf.convert_to_tensor(np_input)
pt_input = torch.from_numpy(np_input)

# Parameters
drop_prob = 0.1
num_skip = 5

# Generate a common random tensor for both implementations
np_random = np.random.rand(batch_size, 1,1).astype(np.float32)
tf_random = tf.convert_to_tensor(np_random)
pt_random = torch.from_numpy(np_random)

# Modify RandomDrop classes to accept the random tensor
class TFRandomDropModified(TFRandomDrop):
    def call(self, x, random_tensor, training=False):
        if training:
            keep_prob = 1 - self.drop_prob
            shape = (tf.shape(x)[0], 1, 1)
            random_tensor = keep_prob + random_tensor
            random_tensor = tf.floor(random_tensor)
            
            # Create a mask tensor
            mask = tf.concat([
                tf.ones((tf.shape(x)[0], tf.shape(x)[1], self.num_skip)),
                tf.tile(random_tensor, 
                        [1, tf.shape(x)[1], tf.shape(x)[2] - self.num_skip])
            ], axis=2)
            
            # Apply the mask
        return x * mask

class PTRandomDropModified(PTRandomDrop):
    def forward(self, x, random_tensor, training=False):
        if training:
            keep_prob = 1 - self.drop_prob
            shape = (x.size(0), 1)
            random_tensor = torch.reshape(random_tensor, shape)
            random_tensor = keep_prob + random_tensor
            random_tensor = torch.floor(random_tensor)
            x[:, :, self.num_skip:] = x[:, :, self.num_skip:] * random_tensor.unsqueeze(-1)
        return x

# Create modified layers
tf_random_drop = TFRandomDropModified(drop_prob, num_skip)
pt_random_drop = PTRandomDropModified(drop_prob, num_skip)

# Forward pass
tf_output = tf_random_drop(tf_input, tf_random, training=True)
pt_output = pt_random_drop(pt_input, pt_random, training=True)

# Convert outputs to numpy for comparison
tf_output_np = tf_output.numpy()
pt_output_np = pt_output.detach().numpy()

# Compare outputs
print("Comparing RandomDrop:")
print(f"  Shape: TF {tf_output_np.shape}, PT {pt_output_np.shape}")
print(f"  Mean: TF {tf_output_np.mean():.6f}, PT {pt_output_np.mean():.6f}")
print(f"  Std: TF {tf_output_np.std():.6f}, PT {pt_output_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_output_np - pt_output_np).max():.6f}")

# Additional checks
print("\nChecking if first 'num_skip' features are unchanged:")
first_features_unchanged = np.allclose(tf_output_np[:,:,:num_skip], np_input[:,:,:num_skip]) and \
                           np.allclose(pt_output_np[:,:,:num_skip], np_input[:,:,:num_skip])
print(f"  First {num_skip} features unchanged: {first_features_unchanged}")

print("\nChecking if dropped features are consistent:")
dropped_features_consistent = np.allclose(tf_output_np[:,:,num_skip:], pt_output_np[:,:,num_skip:])
print(f"  Dropped features consistent between TF and PT: {dropped_features_consistent}")

Comparing RandomDrop:
  Shape: TF (32, 10, 64), PT (32, 10, 64)
  Mean: TF -0.001324, PT -0.001324
  Std: TF 0.911613, PT 0.911613
  Max Abs Diff: 0.000000

Checking if first 'num_skip' features are unchanged:
  First 5 features unchanged: True

Checking if dropped features are consistent:
  Dropped features consistent between TF and PT: True


In [147]:
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)

# Parameters
batch_size = 4
seq_len = 10
embedding_dim = 64
num_heads = 4
dropout_rate = 0.1

# Create a random input tensor
np_input = np.random.randn(batch_size, seq_len, embedding_dim).astype(np.float32)

# Create TensorFlow and PyTorch input tensors
tf_input = tf.convert_to_tensor(np_input)
pt_input = torch.from_numpy(np_input)

# Create layers
tf_attention = TFSimpleHeadAttention(projection_dim=embedding_dim, num_heads=num_heads, dropout_rate=dropout_rate)
pt_attention = PTSimpleHeadAttention(projection_dim=embedding_dim, num_heads=num_heads, dropout_rate=dropout_rate)

# Build the TensorFlow layer
_ = tf_attention(tf_input)  # This call builds the layer and initializes weights

# Get TensorFlow weights
tf_weights = tf_attention.get_weights()

# Print weight shapes for debugging
print("TensorFlow weight shapes:")
for i, w in enumerate(tf_weights):
    print(f"  Weight {i}: {w.shape}")

# Ensure the same initial weights for both implementations
# For PyTorch
pt_attention.qkv.weight.data = torch.from_numpy(tf_weights[0].T)
pt_attention.qkv.bias.data = torch.from_numpy(tf_weights[1])
pt_attention.proj.weight.data = torch.from_numpy(tf_weights[2].T)
pt_attention.proj.bias.data = torch.from_numpy(tf_weights[3])

# Forward pass
tf_output, tf_attention_weights = tf_attention(tf_input, training=True)
pt_output, pt_attention_weights = pt_attention(pt_input, training=True)

# Convert outputs to numpy for comparison
tf_output_np = tf_output.numpy()
pt_output_np = pt_output.detach().numpy()
tf_attention_weights_np = tf_attention_weights.numpy()
pt_attention_weights_np = pt_attention_weights.detach().numpy()

# Compare outputs
print("\nComparing SimpleHeadAttention outputs:")
print(f"  Shape: TF {tf_output_np.shape}, PT {pt_output_np.shape}")
print(f"  Mean: TF {tf_output_np.mean():.6f}, PT {pt_output_np.mean():.6f}")
print(f"  Std: TF {tf_output_np.std():.6f}, PT {pt_output_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_output_np - pt_output_np).max():.6f}")

# Compare attention weights
print("\nComparing attention weights:")
print(f"  Shape: TF {tf_attention_weights_np.shape}, PT {pt_attention_weights_np.shape}")
print(f"  Mean: TF {tf_attention_weights_np.mean():.6f}, PT {pt_attention_weights_np.mean():.6f}")
print(f"  Std: TF {tf_attention_weights_np.std():.6f}, PT {pt_attention_weights_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_attention_weights_np - pt_attention_weights_np).max():.6f}")

# Additional check for attention weight normalization
print("\nChecking if attention weights sum to 1 for each query:")
tf_sum_to_one = np.allclose(tf_attention_weights_np.sum(axis=-1), 1.0, atol=1e-5)
pt_sum_to_one = np.allclose(pt_attention_weights_np.sum(axis=-1), 1.0, atol=1e-5)
print(f"  TF attention weights sum to 1: {tf_sum_to_one}")
print(f"  PT attention weights sum to 1: {pt_sum_to_one}")

TensorFlow weight shapes:
  Weight 0: (64, 192)
  Weight 1: (192,)
  Weight 2: (64, 64)
  Weight 3: (64,)

Comparing SimpleHeadAttention outputs:
  Shape: TF (4, 10, 64), PT (4, 10, 64)
  Mean: TF 0.009182, PT 0.010976
  Std: TF 0.250162, PT 0.249307
  Max Abs Diff: 0.831630

Comparing attention weights:
  Shape: TF (4, 4, 10, 10), PT (4, 4, 10, 10)
  Mean: TF 0.100000, PT 0.100000
  Std: TF 0.046986, PT 0.046986
  Max Abs Diff: 0.000000

Checking if attention weights sum to 1 for each query:
  TF attention weights sum to 1: True
  PT attention weights sum to 1: True


# Change layer_pytorch TalkingHeadAttention

In [164]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class TalkingHeadAttention2(nn.Module):
    def __init__(self, projection_dim: int, num_heads: int, dropout_rate: float):
        super(TalkingHeadAttention2, self).__init__()
        self.num_heads = num_heads
        self.projection_dim = projection_dim
        self.dropout_rate = dropout_rate
        
        head_dim = self.projection_dim // self.num_heads
        self.scale = head_dim**-0.5
        self.qkv = nn.Linear(projection_dim, projection_dim * 3)
        self.attn_drop = nn.Dropout(dropout_rate)
        self.proj = nn.Linear(projection_dim, projection_dim)
        self.proj_l = nn.Linear(num_heads, num_heads)
        self.proj_w = nn.Linear(num_heads, num_heads)
        self.proj_drop = nn.Dropout(dropout_rate)

    def forward(self, x, int_matrix=None, mask=None, training=False):
        B, N, C = x.size()
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

        attn = torch.matmul(q, k.transpose(-2, -1))
        if int_matrix is not None:
            attn += int_matrix

        # Apply proj_l before softmax
        attn = self.proj_l(attn.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        
        if mask is not None:
            mask = mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
            attn += (1.0 - mask) * -1e9

        attn = F.softmax(attn, dim=-1)
        
        # Apply proj_w after softmax
        attn = self.proj_w(attn.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        attn = self.attn_drop(attn) if training else attn

        x = torch.matmul(attn, v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x) if training else x
        return x, attn

In [165]:
from layers import TalkingHeadAttention as TFTalkingHeadAttention


# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)

# Parameters
batch_size = 4
seq_len = 10
embedding_dim = 64
num_heads = 4
dropout_rate = 0.1

# Create a random input tensor
np_input = np.random.randn(batch_size, seq_len, embedding_dim).astype(np.float32)

# Create TensorFlow and PyTorch input tensors
tf_input = tf.convert_to_tensor(np_input)
pt_input = torch.from_numpy(np_input)

# Create layers
tf_attention = TFTalkingHeadAttention(projection_dim=embedding_dim, num_heads=num_heads, dropout_rate=dropout_rate)
pt_attention = TalkingHeadAttention2(projection_dim=embedding_dim, num_heads=num_heads, dropout_rate=dropout_rate)

# Build the TensorFlow layer
_ = tf_attention(tf_input)  # This call builds the layer and initializes weights

# Get TensorFlow weights
tf_weights = tf_attention.get_weights()

# Print weight shapes for debugging
print("TensorFlow weight shapes:")
for i, w in enumerate(tf_weights):
    print(f"  Weight {i}: {w.shape}")

# Ensure the same initial weights for both implementations
# For PyTorch
pt_attention.qkv.weight.data = torch.from_numpy(tf_weights[0].T)
pt_attention.qkv.bias.data = torch.from_numpy(tf_weights[1])
pt_attention.proj.weight.data = torch.from_numpy(tf_weights[2].T)
pt_attention.proj.bias.data = torch.from_numpy(tf_weights[3])
pt_attention.proj_l.weight.data = torch.from_numpy(tf_weights[4].T)
pt_attention.proj_l.bias.data = torch.from_numpy(tf_weights[5])
pt_attention.proj_w.weight.data = torch.from_numpy(tf_weights[6].T)
pt_attention.proj_w.bias.data = torch.from_numpy(tf_weights[7])

# Forward pass
tf_output, tf_attention_weights = tf_attention(tf_input, training=True)
pt_output, pt_attention_weights = pt_attention(pt_input, training=True)

# Convert outputs to numpy for comparison
tf_output_np = tf_output.numpy()
pt_output_np = pt_output.detach().numpy()
tf_attention_weights_np = tf_attention_weights.numpy()
pt_attention_weights_np = pt_attention_weights.detach().numpy()

# Compare outputs
print("\nComparing TalkingHeadAttention outputs:")
print(f"  Shape: TF {tf_output_np.shape}, PT {pt_output_np.shape}")
print(f"  Mean: TF {tf_output_np.mean():.6f}, PT {pt_output_np.mean():.6f}")
print(f"  Std: TF {tf_output_np.std():.6f}, PT {pt_output_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_output_np - pt_output_np).max():.6f}")

# Compare attention weights
print("\nComparing attention weights:")
print(f"  Shape: TF {tf_attention_weights_np.shape}, PT {pt_attention_weights_np.shape}")
print(f"  Mean: TF {tf_attention_weights_np.mean():.6f}, PT {pt_attention_weights_np.mean():.6f}")
print(f"  Std: TF {tf_attention_weights_np.std():.6f}, PT {pt_attention_weights_np.std():.6f}")
print(f"  Max Abs Diff: {np.abs(tf_attention_weights_np - pt_attention_weights_np).max():.6f}")

# Additional check for attention weight normalization
print("\nChecking if attention weights sum to 1 for each query:")
tf_sum_to_one = np.allclose(tf_attention_weights_np.sum(axis=-1), 1.0, atol=1e-5)
pt_sum_to_one = np.allclose(pt_attention_weights_np.sum(axis=-1), 1.0, atol=1e-5)
print(f"  TF attention weights sum to 1: {tf_sum_to_one}")
print(f"  PT attention weights sum to 1: {pt_sum_to_one}")

TensorFlow weight shapes:
  Weight 0: (64, 192)
  Weight 1: (192,)
  Weight 2: (64, 64)
  Weight 3: (64,)
  Weight 4: (4, 4)
  Weight 5: (4,)
  Weight 6: (4, 4)
  Weight 7: (4,)

Comparing TalkingHeadAttention outputs:
  Shape: TF (4, 10, 64), PT (4, 10, 64)
  Mean: TF 0.006148, PT 0.001263
  Std: TF 0.322211, PT 0.319002
  Max Abs Diff: 1.083274

Comparing attention weights:
  Shape: TF (4, 4, 10, 10), PT (4, 4, 10, 10)
  Mean: TF -0.064353, PT -0.062686
  Std: TF 0.116113, PT 0.114994
  Max Abs Diff: 0.605686

Checking if attention weights sum to 1 for each query:
  TF attention weights sum to 1: False
  PT attention weights sum to 1: False


In [116]:
#Set random seets
np.random.seed(42)
tf.random.set_seed(42)
torch.manual_seed(42)

# Helper function to compare TensorFlow and PyTorch outputs
def compare_outputs(tf_output, pt_output, name):
    tf_output = tf_output.numpy() if isinstance(tf_output, tf.Tensor) else tf_output
    pt_output = pt_output.detach().numpy() if isinstance(pt_output, torch.Tensor) else pt_output
    
    if isinstance(tf_output, tuple):
        for i, (tf_item, pt_item) in enumerate(zip(tf_output, pt_output)):
            compare_outputs(tf_item, pt_item, f"{name}_{i}")
    else:
        print(f"Comparing {name}:")
        print(f"  Shape: TF {tf_output.shape}, PT {pt_output.shape}")
        print(f"  Mean: TF {tf_output.mean():.6f}, PT {pt_output.mean():.6f}")
        print(f"  Std: TF {tf_output.std():.6f}, PT {pt_output.std():.6f}")
        print(f"  Max Abs Diff: {np.abs(tf_output - pt_output).max():.6f}")
        print()

# Test StochasticDepth
def test_stochastic_depth():
    drop_prob = 0.1
    tf_layer = TFStochasticDepth(drop_prob)
    pt_layer = PTStochasticDepth(drop_prob)
    
    x = np.random.randn(32, 10, 64, 1).astype(np.float32)
    tf_input = tf.constant(x)
    pt_input = torch.tensor(x)
    
    tf_output = tf_layer(tf_input, training=True)
    pt_output = pt_layer(pt_input, training=True)
    
    compare_outputs(tf_output, pt_output, "StochasticDepth")

# Test RandomDrop
def test_random_drop():
    drop_prob = 0.1
    num_skip = 2
    tf_layer = TFRandomDrop(drop_prob, num_skip)
    pt_layer = PTRandomDrop(drop_prob, num_skip)
    
    x = np.random.randn(32, 10, 64).astype(np.float32)
    
    # Use NumPy array for TensorFlow layer
    tf_output = tf_layer(x, training=True).numpy()
    
    # Use PyTorch tensor for PyTorch layer
    pt_input = torch.tensor(x)
    pt_output = pt_layer(pt_input, training=True)
    
    compare_outputs(tf_output, pt_output, "RandomDrop")

# Test SimpleHeadAttention
def test_simple_head_attention():
    projection_dim = 64
    num_heads = 4
    dropout_rate = 0.1
    tf_layer = TFSimpleHeadAttention(projection_dim, num_heads, dropout_rate)
    pt_layer = PTSimpleHeadAttention(projection_dim, num_heads, dropout_rate)
    
    x = np.random.rand(32, 10, projection_dim).astype(np.float32)
    tf_input = tf.constant(x)
    pt_input = torch.tensor(x)
    
    tf_output = tf_layer(tf_input, training=True)
    pt_output = pt_layer(pt_input, training=True)
    
    compare_outputs(tf_output, pt_output, "SimpleHeadAttention")

# Test TalkingHeadAttention
def test_talking_head_attention():
    projection_dim = 64
    num_heads = 4
    dropout_rate = 0.1
    tf_layer = TFTalkingHeadAttention(projection_dim, num_heads, dropout_rate)
    pt_layer = PTTalkingHeadAttention(projection_dim, num_heads, dropout_rate)
    
    x = np.random.rand(2, 10, projection_dim).astype(np.float32)
    tf_input = tf.constant(x)
    pt_input = torch.tensor(x)
    
    tf_output = tf_layer(tf_input, training=True)
    pt_output = pt_layer(pt_input, training=True)
    
    compare_outputs(tf_output, pt_output, "TalkingHeadAttention")

# Test LayerScale
def test_layer_scale():
    init_values = 0.1
    projection_dim = 64
    tf_layer = TFLayerScale(init_values, projection_dim)
    pt_layer = PTLayerScale(init_values, projection_dim)
    
    x = np.random.rand(2, 10, projection_dim).astype(np.float32)
    tf_input = tf.constant(x)
    pt_input = torch.tensor(x)
    
    tf_output = tf_layer(tf_input)
    pt_output = pt_layer(pt_input)
    
    compare_outputs(tf_output, pt_output, "LayerScale")

# Run all tests
test_stochastic_depth()
test_random_drop()
test_simple_head_attention()
test_talking_head_attention()
test_layer_scale()

Comparing StochasticDepth:
  Shape: TF (32, 10, 64, 1), PT (32, 10, 64, 1)
  Mean: TF 0.005732, PT 0.007001
  Std: TF 0.972660, PT 0.969924
  Max Abs Diff: 3.942331

Comparing RandomDrop:
  Shape: TF (32, 10, 64), PT (32, 10, 64)
  Mean: TF -0.009080, PT -0.004371
  Std: TF 0.981644, PT 0.979563
  Max Abs Diff: 4.465604

Comparing SimpleHeadAttention_0:
  Shape: TF (32, 10, 64), PT (32, 10, 64)
  Mean: TF -0.046840, PT -0.006215
  Std: TF 0.453468, PT 0.202695
  Max Abs Diff: 1.613864

Comparing SimpleHeadAttention_1:
  Shape: TF (32, 4, 10, 10), PT (32, 4, 10, 10)
  Mean: TF 0.100000, PT 0.100000
  Std: TF 0.007855, PT 0.005192
  Max Abs Diff: 0.038661



RuntimeError: mat1 and mat2 shapes cannot be multiplied (80x10 and 4x4)