## SPOTER

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import copy
from typing import Optional

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
video_directory = 'Latest-WLASL-100'

# Each entry of the dataset root is treated as one gesture class.
gesture_folder = np.array(os.listdir(video_directory))

# Count the per-video sub-directories stored under every gesture.
total = 0
for gestures in gesture_folder:
    gesture_dir = os.path.join(video_directory, gestures)
    gesture = [
        fname
        for fname in os.listdir(gesture_dir)
        if os.path.isdir(os.path.join(gesture_dir, fname))
    ]
    total += len(gesture)

print("Total gestures: ", len(gesture_folder), "; Total videos: ", total)

Total gestures:  100 ; Total videos:  4076


In [3]:
# Give every gesture name a class index that follows the folder listing order.
label_map = dict(zip(gesture_folder, range(len(gesture_folder))))
len(label_map)

100

In [4]:
def _get_clones(mod, n):
    """Return an ``nn.ModuleList`` holding ``n`` independent deep copies of ``mod``."""
    clones = nn.ModuleList()
    for _ in range(n):
        # deepcopy gives every clone its own parameters (initially equal to mod's).
        clones.append(copy.deepcopy(mod))
    return clones

class SPOTERTransformerDecoderLayer(nn.TransformerDecoderLayer):
    """
    Edited TransformerDecoderLayer implementation omitting the redundant self-attention operation.

    The layer only runs cross-attention over ``memory`` followed by the feed-forward block.

    Compatibility notes:
    - Newer PyTorch releases make ``nn.TransformerDecoder`` read
      ``layers[0].self_attn.batch_first``, so the self-attention module is replaced by a
      parameter-free placeholder instead of being deleted. The ``state_dict`` therefore
      still contains no ``self_attn`` weights.
    - Those releases also pass ``tgt_is_causal`` / ``memory_is_causal`` to every layer, so
      ``forward`` accepts (and ignores) them.
    """
    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation):
        super(SPOTERTransformerDecoderLayer, self).__init__(d_model, nhead, dim_feedforward, dropout, activation)
        # Drop the self-attention weights since they are not used, but keep an attribute
        # exposing ``batch_first`` for nn.TransformerDecoder (older MultiheadAttention
        # versions may not have the flag, hence the getattr default).
        batch_first = getattr(self.self_attn, "batch_first", False)
        self.self_attn = nn.Identity()
        self.self_attn.batch_first = batch_first

    def forward(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: Optional[torch.Tensor] = None,
                memory_mask: Optional[torch.Tensor] = None, tgt_key_padding_mask: Optional[torch.Tensor] = None,
                memory_key_padding_mask: Optional[torch.Tensor] = None, tgt_is_causal: bool = False,
                memory_is_causal: bool = False) -> torch.Tensor:
        """
        Apply cross-attention and the feed-forward block to ``tgt``.

        Args:
            tgt: decoder input (the class query).
            memory: encoder output attended to by ``tgt``.
            memory_mask: attention mask forwarded to the cross-attention.
            memory_key_padding_mask: key padding mask forwarded to the cross-attention.
            tgt_mask, tgt_key_padding_mask, tgt_is_causal, memory_is_causal: accepted for
                API compatibility with nn.TransformerDecoder; unused because there is no
                self-attention step.

        Returns:
            Tensor with the same shape as ``tgt``.
        """
        # Note: The self-attention is skipped; only its residual/dropout/norm path remains.
        tgt = tgt + self.dropout1(tgt)
        tgt = self.norm1(tgt)
        # Cross-attention: the query attends over the encoder memory.
        tgt2 = self.multihead_attn(tgt, memory, memory,
                                   attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        # Position-wise feed-forward block with residual connection.
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

class SPOTER(nn.Module):
    """
    SPOTER-style classifier for keypoint sequences of shape (batch, frames, input_dim).

    Design:
    - An input projection maps each frame from input_dim (258 by default) to hidden_dim.
    - A learned positional encoding of shape (1, seq_len, hidden_dim) is added per frame;
      inputs shorter than seq_len use the leading part of that table.
    - Tensors are transposed to the (seq_len, batch, d_model) layout that nn.Transformer
      expects by default.
    - A single learned class query is decoded into one aggregated representation, which a
      linear layer turns into class logits.

    Args:
        num_classes: number of output classes.
        input_dim: number of features per frame.
        hidden_dim: transformer model width.
        seq_len: maximum number of frames supported by the positional encoding.
        nhead: number of attention heads (hidden_dim must be divisible by it).
    """
    def __init__(self, num_classes, input_dim=258, hidden_dim=72, seq_len=512, nhead=9):
        super().__init__()
        # Project each frame's features to the hidden dimension.
        self.input_projection = nn.Linear(input_dim, hidden_dim)
        # Learned positional encoding for each frame in the sequence.
        self.pos = nn.Parameter(torch.randn(1, seq_len, hidden_dim))
        # A learnable query vector for the classification decoder.
        self.class_query = nn.Parameter(torch.randn(1, hidden_dim))
        # Create a Transformer with separate encoder and decoder.
        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=nhead,
            num_encoder_layers=6,
            num_decoder_layers=6,
            dropout=0.1
        )
        # Final classification layer.
        self.linear_class = nn.Linear(hidden_dim, num_classes)

        # Replace the default decoder layers with our custom version that skips self-attention.
        custom_decoder_layer = SPOTERTransformerDecoderLayer(hidden_dim, nhead, 2048, 0.1, "relu")
        self.transformer.decoder.layers = _get_clones(custom_decoder_layer, self.transformer.decoder.num_layers)

    def forward(self, inputs):
        """
        Classify a batch of sequences.

        Args:
            inputs: Tensor of shape (batch, frames, input_dim) with frames <= seq_len.

        Returns:
            Tensor of shape (batch, num_classes) holding the class logits.

        Raises:
            ValueError: if the input has more frames than the positional encoding covers.
        """
        n_frames = inputs.size(1)
        if n_frames > self.pos.size(1):
            raise ValueError(
                f"Input has {n_frames} frames but the positional encoding only covers {self.pos.size(1)}."
            )
        # Project input frames: (batch, frames, hidden_dim)
        x = self.input_projection(inputs)
        # Add the positional encoding for exactly the frames present; slicing avoids
        # silently broadcasting a short input up to the full table length.
        x = x + self.pos[:, :n_frames, :]
        # Transformer expects src of shape (frames, batch, d_model)
        x = x.transpose(0, 1)
        # Prepare class query for the decoder; shape becomes (tgt_len, batch, hidden_dim) with tgt_len=1.
        tgt = self.class_query.unsqueeze(1).repeat(1, x.size(1), 1)
        # Run the transformer: output shape will be (tgt_len, batch, hidden_dim)
        out = self.transformer(src=x, tgt=tgt)
        # Bring back to (batch, tgt_len, hidden_dim)
        out = out.transpose(0, 1)
        # Use the first (and only) token of the output for classification.
        out = self.linear_class(out[:, 0, :])
        return out

In [4]:
# Read the stored feature array and the matching label array from disk.
X = np.load('X_TRAIN_normalized_flipped.npy')
y = np.load('y_TRAIN_normalized_flipped.npy')

print(f"Data shapes - X: {X.shape}, y: {y.shape}")

# One-hot encode the labels; the integer form is kept for stratified splitting.
y = tf.keras.utils.to_categorical(y, num_classes=len(gesture_folder))
y_labels = np.argmax(y, axis=1)

# First split off 20% of the data, then halve that part into test and validation sets.
X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(
    X, y, test_size=0.2, stratify=y_labels, random_state=42
)
X_test_ori, X_val_ori, y_test_ori, y_val_ori = train_test_split(
    X_test_ori, y_test_ori, test_size=0.5, stratify=np.argmax(y_test_ori, axis=1), random_state=42
)

# Features become float32 tensors; one-hot labels are collapsed back to class indices.
X_train, X_val, X_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_ori, X_val_ori, X_test_ori)
)
y_train, y_val, y_test = (
    torch.tensor(np.argmax(one_hot, axis=1), dtype=torch.long)
    for one_hot in (y_train_ori, y_val_ori, y_test_ori)
)

print(f"Training set - X: {X_train.shape}, y: {y_train.shape}")
print(f"Validation set - X: {X_val.shape}, y: {y_val.shape}")
print(f"Test set - X: {X_test.shape}, y: {y_test.shape}")


Data shapes - X: (4076, 512, 258), y: (4076,)
Training set - X: torch.Size([3260, 512, 258]), y: torch.Size([3260])
Validation set - X: torch.Size([408, 512, 258]), y: torch.Size([408])
Test set - X: torch.Size([408, 512, 258]), y: torch.Size([408])


In [10]:
# Wrap each split in a dataset, then in a loader.
# The batch size was reduced to avoid memory issues.
batch_size = 16
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [6]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
# Model, loss, and optimizer
input_size = X_train.size(-1)  # number of features per frame
num_classes = len(label_map)
model = SPOTER(num_classes=num_classes, input_dim=input_size, hidden_dim=72, seq_len=512).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 400
loss_history = []
val_loss_history = []

# Training stops early once the mean training loss of an epoch drops below this value.
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for X_batch, y_batch in train_loader:
        # Move batch to the selected device
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        epoch_loss += loss.item()

    # Mean of the per-batch training losses.
    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)

    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # Forward pass
            loss = criterion(outputs, y_batch)  # Compute loss
            val_loss += loss.item()  # Accumulate validation loss

            # Compute accuracy
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate the model on the test set.
# The evaluation runs batch by batch through test_loader instead of pushing the whole
# test set through the model in one forward pass, and it no longer overwrites the
# X_test / y_test globals with device copies. Weighting each batch loss by its size
# reproduces the mean loss over all test samples.
model.eval()
test_loss_sum = 0.0
test_correct = 0
test_total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        test_outputs = model(X_batch)
        n_samples = y_batch.size(0)
        test_loss_sum += criterion(test_outputs, y_batch).item() * n_samples
        test_correct += (test_outputs.argmax(dim=1) == y_batch).sum().item()
        test_total += n_samples

test_loss = test_loss_sum / test_total
accuracy = test_correct / test_total
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')


Epoch [1/400], Loss: 4.7479, Val Loss: 4.6178, Val Accuracy: 0.0196
Epoch [2/400], Loss: 4.6617, Val Loss: 4.6050, Val Accuracy: 0.0196
Epoch [3/400], Loss: 4.6566, Val Loss: 4.5956, Val Accuracy: 0.0147
Epoch [4/400], Loss: 4.6411, Val Loss: 4.6112, Val Accuracy: 0.0147
Epoch [5/400], Loss: 4.6445, Val Loss: 4.5982, Val Accuracy: 0.0196
Epoch [6/400], Loss: 4.6398, Val Loss: 4.6034, Val Accuracy: 0.0196
Epoch [7/400], Loss: 4.6366, Val Loss: 4.5978, Val Accuracy: 0.0147
Epoch [8/400], Loss: 4.6386, Val Loss: 4.5996, Val Accuracy: 0.0196
Epoch [9/400], Loss: 4.6385, Val Loss: 4.6034, Val Accuracy: 0.0147
Epoch [10/400], Loss: 4.6304, Val Loss: 4.5904, Val Accuracy: 0.0196
Epoch [11/400], Loss: 4.6311, Val Loss: 4.5953, Val Accuracy: 0.0196


KeyboardInterrupt: 

In [7]:
import torch
import torch.nn as nn
import copy

def _get_clones(module, N):
    """Build an ``nn.ModuleList`` containing ``N`` deep copies of ``module``."""
    copies = []
    while len(copies) < N:
        # Each copy owns its parameters; nothing is shared with ``module``.
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

class SPOTERTransformerDecoderLayer(nn.TransformerDecoderLayer):
    """
    Decoder layer that skips self-attention and only cross-attends to the encoder memory.

    Newer PyTorch releases make ``nn.TransformerDecoder`` read
    ``layers[0].self_attn.batch_first`` and pass ``tgt_is_causal`` / ``memory_is_causal``
    to every layer. The self-attention module is therefore replaced by a parameter-free
    placeholder (no ``self_attn`` weights in the ``state_dict``), and ``forward`` accepts
    the two extra keyword arguments.
    """
    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation):
        super(SPOTERTransformerDecoderLayer, self).__init__(d_model, nhead, dim_feedforward, dropout, activation)
        # Discard the unused self-attention weights but keep an attribute exposing
        # ``batch_first`` for nn.TransformerDecoder.
        batch_first = getattr(self.self_attn, "batch_first", False)
        self.self_attn = nn.Identity()
        self.self_attn.batch_first = batch_first

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None,
                tgt_is_causal=False, memory_is_causal=False):
        """
        Apply cross-attention and the feed-forward block to ``tgt``.

        ``tgt_mask``, ``tgt_key_padding_mask``, ``tgt_is_causal`` and ``memory_is_causal``
        are accepted for API compatibility only; they are unused because there is no
        self-attention step. Returns a tensor with the same shape as ``tgt``.
        """
        # Residual/dropout/norm path of the skipped self-attention.
        tgt = tgt + self.dropout1(tgt)
        tgt = self.norm1(tgt)
        # Cross-attention over the encoder memory.
        tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        # Position-wise feed-forward block with residual connection.
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

class ModifiedSPOTER(nn.Module):
    """
    SPOTER variant that sub-samples frames with a fixed stride before the transformer.

    Args:
        num_classes: number of output classes.
        hidden_dim: transformer model width; inputs are projected only when it is not 258.
        max_frames: number of frames fed to the transformer after striding/padding.
    """
    def __init__(self, num_classes=100, hidden_dim=258, max_frames=50):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.max_frames = max_frames

        # Learned embedding added to every selected frame.
        self.frame_embed = nn.Parameter(torch.rand(max_frames, hidden_dim))

        # Extra learned position parameter of shape (1, 1, hidden_dim); forward() does not read it.
        self.pos = nn.Parameter(torch.rand(1, hidden_dim).unsqueeze(0))

        # Learned query decoded into the class representation.
        self.class_query = nn.Parameter(torch.rand(1, hidden_dim))

        # Maps the 258 input features to hidden_dim (applied only when they differ).
        self.input_projection = nn.Linear(258, hidden_dim)

        # Fixed sampling stride relative to a 512-frame input; never smaller than 1.
        if max_frames < 512:
            self.stride = 512 // max_frames
        else:
            self.stride = 1

        # Encoder/decoder stack: 6 heads, 6 encoder layers, 6 decoder layers.
        self.transformer = nn.Transformer(hidden_dim, 6, 6, 6)

        # Linear head producing the class logits.
        self.linear_class = nn.Linear(hidden_dim, num_classes)

        # Swap the stock decoder layers for the variant without self-attention.
        decoder_template = SPOTERTransformerDecoderLayer(
            self.transformer.d_model,
            self.transformer.nhead,
            2048, 0.1, "relu"
        )
        self.transformer.decoder.layers = _get_clones(
            decoder_template,
            self.transformer.decoder.num_layers
        )

    def forward(self, x):
        """Return logits of shape [batch_size, num_classes] for x of shape [batch_size, frames, 258]."""
        n_samples = x.shape[0]

        # Keep every ``stride``-th frame, at most max_frames of them.
        frames = x[:, :self.max_frames * self.stride:self.stride, :]

        # Zero-pad along the frame axis when striding produced too few frames.
        missing = self.max_frames - frames.shape[1]
        if missing > 0:
            filler = torch.zeros(n_samples, missing, 258, device=frames.device)
            frames = torch.cat([frames, filler], dim=1)

        # Guarantee exactly max_frames frames.
        frames = frames[:, :self.max_frames, :]

        # Project the features only when the model width differs from the input width.
        if self.hidden_dim != 258:
            frames = self.input_projection(frames)

        # Add the learned per-frame embedding (broadcast over the batch).
        frames = frames + self.frame_embed.unsqueeze(0)

        # nn.Transformer default layout: [max_frames, batch_size, hidden_dim].
        encoder_input = frames.transpose(0, 1)

        # One class query per sample: [1, batch_size, hidden_dim].
        query = self.class_query.unsqueeze(0).repeat(1, n_samples, 1)

        # Decoder output: [1, batch_size, hidden_dim].
        decoded = self.transformer(encoder_input, query)

        # Back to batch-first, classify, then drop the singleton query axis.
        logits = self.linear_class(decoded.transpose(0, 1))
        return logits.squeeze(1)

In [2]:
# Build an untrained ModifiedSPOTER for a quick shape check: 100 classes, model width
# equal to the 258 keypoint features, and all 512 frames kept.
model = ModifiedSPOTER(num_classes=100, hidden_dim=258, max_frames=512)

# Random stand-in batch of 16 samples with shape [batch_size, 512, 258].
batch_input = torch.randn(16, 512, 258)

# One row of 100 class scores per sample is expected: [16, 100].
output = model(batch_input)
print(output.shape)


torch.Size([16, 100])


In [9]:
# Model, loss, and optimizer
input_size = X_train.size(-1)  # number of features per frame
num_classes = len(label_map)
# Initialize the modified model; the class count follows label_map instead of a literal.
model = ModifiedSPOTER(
    num_classes=num_classes,
    hidden_dim=258,   # Match the keypoint dimension, or choose another suitable value
    max_frames=512    # Number of frames kept per sample (512 keeps every frame)
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 400
loss_history = []
val_loss_history = []

# Training stops early once the mean training loss of an epoch drops below this value.
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for X_batch, y_batch in train_loader:
        # Move batch to the selected device
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        epoch_loss += loss.item()

    # Mean of the per-batch training losses.
    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)

    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # Forward pass
            loss = criterion(outputs, y_batch)  # Compute loss
            val_loss += loss.item()  # Accumulate validation loss

            # Compute accuracy
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate the model on the test set.
# The evaluation runs batch by batch through test_loader instead of pushing the whole
# test set through the model in one forward pass, and it no longer overwrites the
# X_test / y_test globals with device copies. Weighting each batch loss by its size
# reproduces the mean loss over all test samples.
model.eval()
test_loss_sum = 0.0
test_correct = 0
test_total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        test_outputs = model(X_batch)
        n_samples = y_batch.size(0)
        test_loss_sum += criterion(test_outputs, y_batch).item() * n_samples
        test_correct += (test_outputs.argmax(dim=1) == y_batch).sum().item()
        test_total += n_samples

test_loss = test_loss_sum / test_total
accuracy = test_correct / test_total
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}')


KeyboardInterrupt: 

In [9]:
import torch
import torch.nn as nn
import copy
from typing import Optional

def _get_clones(mod, n):
    """Deep-copy ``mod`` ``n`` times and collect the copies in an ``nn.ModuleList``."""
    # [mod] * n repeats the reference; deepcopy then turns each entry into its own module.
    duplicates = list(map(copy.deepcopy, [mod] * n))
    return nn.ModuleList(duplicates)

class SPOTERTransformerDecoderLayer(nn.TransformerDecoderLayer):
    """
    Batch-first decoder layer that skips self-attention and only cross-attends to memory.

    Newer PyTorch releases make ``nn.TransformerDecoder`` read
    ``layers[0].self_attn.batch_first`` and pass ``tgt_is_causal`` / ``memory_is_causal``
    to every layer. The self-attention module is therefore replaced by a parameter-free
    placeholder (no ``self_attn`` weights in the ``state_dict``), and ``forward`` accepts
    the two extra keyword arguments.
    """
    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation):
        super().__init__(d_model, nhead, dim_feedforward, dropout, activation)
        # Discard the unused self-attention weights; the placeholder reports the
        # batch-first layout used by the cross-attention below.
        self.self_attn = nn.Identity()
        self.self_attn.batch_first = True
        # Override multihead_attn with batch_first=True
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)

    def forward(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: Optional[torch.Tensor] = None,
                memory_mask: Optional[torch.Tensor] = None, tgt_key_padding_mask: Optional[torch.Tensor] = None,
                memory_key_padding_mask: Optional[torch.Tensor] = None, tgt_is_causal: bool = False,
                memory_is_causal: bool = False) -> torch.Tensor:
        """
        Apply cross-attention and the feed-forward block to ``tgt``.

        ``tgt_mask``, ``tgt_key_padding_mask``, ``tgt_is_causal`` and ``memory_is_causal``
        are accepted for API compatibility only; they are unused because there is no
        self-attention step. Returns a tensor with the same shape as ``tgt``.
        """
        # Cross-attention: the query attends over the encoder memory.
        tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        # Position-wise feed-forward block with residual connection.
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

class SPOTER(nn.Module):
    """
    Batch-first SPOTER classifier.

    Args:
        num_classes: number of output classes.
        feature_dim: number of features per input step.
        hidden_dim: transformer model width; a projection is created only when it
            differs from feature_dim.
        max_seq_len: number of rows in the learned positional table.
    """
    def __init__(self, num_classes, feature_dim=258, hidden_dim=258, max_seq_len=512):
        super().__init__()
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len

        # No projection is needed when the input width already equals the model width.
        self.proj = nn.Linear(feature_dim, hidden_dim) if feature_dim != hidden_dim else None

        # Learned positional table and the learned class query.
        self.pos_embed = nn.Parameter(torch.rand(max_seq_len, hidden_dim))
        self.class_query = nn.Parameter(torch.rand(1, hidden_dim))
        # 9 heads, 6 encoder layers, 6 decoder layers, batch-first tensors.
        self.transformer = nn.Transformer(hidden_dim, 9, 6, 6, batch_first=True)
        self.linear_class = nn.Linear(hidden_dim, num_classes)

        # Swap the stock decoder layers for the variant without self-attention.
        decoder_template = SPOTERTransformerDecoderLayer(
            self.transformer.d_model, self.transformer.nhead, 2048, 0.1, "relu"
        )
        self.transformer.decoder.layers = _get_clones(decoder_template, self.transformer.decoder.num_layers)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for x of shape (batch, steps, feature_dim)."""
        tokens = x if self.proj is None else self.proj(x)
        n_samples, n_steps = tokens.size(0), tokens.size(1)
        # Add the leading n_steps rows of the positional table (broadcast over the batch).
        encoder_input = tokens + self.pos_embed[:n_steps, :].unsqueeze(0)
        # One class query per sample: (batch, 1, hidden_dim).
        query = self.class_query.unsqueeze(0).repeat(n_samples, 1, 1)
        decoded = self.transformer(encoder_input, query)
        # Classify from the single decoded query token.
        return self.linear_class(decoded[:, 0, :])

In [12]:
# Model, loss, and optimizer
input_size = X_train.size(-1)  # number of features per frame
num_classes = len(label_map)
# Initialize the modified model
model = SPOTER(num_classes=num_classes, feature_dim=input_size, hidden_dim=72, max_seq_len=512).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 400
loss_history = []
val_loss_history = []

# Training stops early once the mean training loss of an epoch drops below this value.
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for i, (X_batch, y_batch) in enumerate(train_loader):
        print(f"Epoch {epoch+1}, Processing batch {i+1}/{len(train_loader)}")

        # Move batch to the selected device
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        epoch_loss += loss.item()

    # Mean of the per-batch training losses.
    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)

    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            # The epoch summary is printed once after this loop: avg_val_loss and
            # val_accuracy do not exist until the loop has finished, so printing
            # them here raised NameError on the first epoch.
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # Forward pass
            loss = criterion(outputs, y_batch)  # Compute loss
            val_loss += loss.item()  # Accumulate validation loss

            # Compute accuracy
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate the model batch by batch on the test set.
model.eval()
test_loss = 0
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        test_loss += loss.item()
        predictions = outputs.argmax(dim=1)
        correct += (predictions == y_batch).sum().item()
        total += y_batch.size(0)
# Mean of the per-batch test losses and overall accuracy.
avg_test_loss = test_loss / len(test_loader)
test_accuracy = correct / total
print(f'Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')


Epoch 1, Processing batch 1/204
Epoch 1, Processing batch 2/204
Epoch 1, Processing batch 3/204
Epoch 1, Processing batch 4/204
Epoch 1, Processing batch 5/204
Epoch 1, Processing batch 6/204
Epoch 1, Processing batch 7/204
Epoch 1, Processing batch 8/204
Epoch 1, Processing batch 9/204
Epoch 1, Processing batch 10/204
Epoch 1, Processing batch 11/204
Epoch 1, Processing batch 12/204
Epoch 1, Processing batch 13/204
Epoch 1, Processing batch 14/204
Epoch 1, Processing batch 15/204
Epoch 1, Processing batch 16/204
Epoch 1, Processing batch 17/204
Epoch 1, Processing batch 18/204
Epoch 1, Processing batch 19/204
Epoch 1, Processing batch 20/204
Epoch 1, Processing batch 21/204
Epoch 1, Processing batch 22/204
Epoch 1, Processing batch 23/204
Epoch 1, Processing batch 24/204
Epoch 1, Processing batch 25/204
Epoch 1, Processing batch 26/204
Epoch 1, Processing batch 27/204
Epoch 1, Processing batch 28/204
Epoch 1, Processing batch 29/204
Epoch 1, Processing batch 30/204
Epoch 1, Processing

KeyboardInterrupt: 

In [1]:
import torch
import torch.nn as nn

# Define a single encoder layer
d_model = 64
dim_feedforward = 128
nhead = 8

encoder_layer = nn.TransformerEncoderLayer(
    d_model=d_model,  # Input and output have 64 features
    nhead=nhead,  # Multi-head attention with 8 heads
    dim_feedforward=dim_feedforward,  # Hidden layer dimension
    dropout=0.1,
    # The demo tensor below is laid out as (batch, seq, feature). Without this flag the
    # layer would read it as (seq, batch, feature) and attend across the two sequences
    # instead of across the 5 tokens of each sequence.
    batch_first=True
)

# Simulate a batch of 2 sequences (each with 5 tokens)
batch_size = 2
seq_len = 5
input_tensor = torch.rand(batch_size, seq_len, d_model)  # Shape: (2, 5, 64)

# Pass data through the encoder layer
output = encoder_layer(input_tensor)

print("Input shape:", input_tensor.shape)  # (2, 5, 64)
print("Output shape:", output.shape)  # (2, 5, 64) - same as input d_model


  from .autonotebook import tqdm as notebook_tqdm


Input shape: torch.Size([2, 5, 64])
Output shape: torch.Size([2, 5, 64])
