### Modified SPOTER 1.0

In [1]:
import numpy as np

# Load the data
X = np.load('train/X_TRAIN_normalized_flipped.npy')  # Shape: (4086, 512, 258)
y = np.load('train/y_TRAIN_normalized_flipped.npy')  # Shape: (4086,)
print("X shape:", X.shape)
print("y shape:", y.shape)
# Define indices for x and y coordinates
pose_indices = []
for i in range(33):
    pose_indices.extend([i*4, i*4+1])  # x and y for each of the 33 pose landmarks

left_hand_indices = []
for i in range(21):
    left_hand_indices.extend([132 + i*3, 132 + i*3 + 1])  # x and y for each of the 21 left hand landmarks

right_hand_indices = []
for i in range(21):
    right_hand_indices.extend([132 + 63 + i*3, 132 + 63 + i*3 + 1])  # x and y for each of the 21 right hand landmarks

# Combine all indices (150 total)
all_indices = pose_indices + left_hand_indices + right_hand_indices

print("Extract x and y coordinates...")
# Extract x and y coordinates
X_xy = X[:, :, all_indices]  # Shape: (4086, 512, 150)

X shape: (4076, 512, 258)
y shape: (4076,)
Extract x and y coordinates...


In [3]:
from torch.utils.data import Dataset, DataLoader
import torch

class SignLanguageDataset(Dataset):
    def __init__(self, X, y):
        self.X = X  # Shape: (num_samples, 512, 150)
        self.y = y  # Shape: (num_samples,)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]).float(), torch.tensor(self.y[idx]).long()

    def __len__(self):
        return len(self.y)

# Create dataset and dataloader
dataset = SignLanguageDataset(X_xy, y)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)  # Adjust batch_size as needed

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch.nn as nn

class ModifiedSPOTER(nn.Module):
    def __init__(self, num_classes, seq_len=512, feature_dim=150, hidden_dim=256):
        super(ModifiedSPOTER, self).__init__()
        self.seq_len = seq_len
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        
        # Linear projection to hidden_dim
        if feature_dim != hidden_dim:
            self.projection = nn.Linear(feature_dim, hidden_dim)
        else:
            self.projection = nn.Identity()
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=8, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        
        # Classification head
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        # Input shape: (batch_size, 512, 150)
        x = self.projection(x)  # Shape: (batch_size, 512, hidden_dim)
        x = self.transformer(x)  # Shape: (batch_size, 512, hidden_dim)
        x = x.mean(dim=1)        # Shape: (batch_size, hidden_dim) - average pooling over sequence
        return self.fc(x)        # Shape: (batch_size, num_classes)

In [None]:
import torch.optim as optim

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model, loss, and optimizer
model = ModifiedSPOTER(num_classes=100, seq_len=512, feature_dim=150, hidden_dim=256).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Shape: (batch_size, 512, 150)

        optimizer.zero_grad()
        outputs = model(inputs)  # Shape: (batch_size, 100)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}')

Epoch 1, Loss: 4.772365921618892
Epoch 2, Loss: 4.679596788742963
Epoch 3, Loss: 4.653740978240966


### Modified SPOTER 1.1 + Training Loop

In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the data
X = np.load('train/X_TRAIN_normalized_flipped.npy')  # Shape: (num_samples, 512, 258)
y = np.load('train/y_TRAIN_normalized_flipped.npy')  # Shape: (num_samples,)

# Define indices for x and y coordinates
pose_indices = []
for i in range(33):
    pose_indices.extend([i*4, i*4+1])  # x and y for 33 pose landmarks

left_hand_indices = []
for i in range(21):
    left_hand_indices.extend([132 + i*3, 132 + i*3+1])  # x and y for 21 left-hand landmarks

right_hand_indices = []
for i in range(21):
    right_hand_indices.extend([132 + 63 + i*3, 132 + 63 + i*3+1])  # x and y for 21 right-hand landmarks

# Combine indices (total 150 features)
all_indices = pose_indices + left_hand_indices + right_hand_indices

# Extract x and y coordinates
X_xy = X[:, :, all_indices]  # Shape: (num_samples, 512, 150)

# One-hot encode labels and split data (as in your code)
y = tf.keras.utils.to_categorical(y, num_classes=100)  # Adjust num_classes if gesture_folder is available
y_labels = np.argmax(y, axis=1)

X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(
    X_xy, y, test_size=0.2, stratify=y_labels, random_state=42
)
X_test_ori, X_val_ori, y_test_ori, y_val_ori = train_test_split(
    X_test_ori, y_test_ori, test_size=0.5, stratify=y_test_ori.argmax(axis=1), random_state=42
)

# Convert to tensors
X_train = torch.tensor(X_train_ori, dtype=torch.float32)
X_val = torch.tensor(X_val_ori, dtype=torch.float32)
X_test = torch.tensor(X_test_ori, dtype=torch.float32)
y_train = torch.tensor(y_train_ori.argmax(axis=1), dtype=torch.long)
y_val = torch.tensor(y_val_ori.argmax(axis=1), dtype=torch.long)
y_test = torch.tensor(y_test_ori.argmax(axis=1), dtype=torch.long)

# Create DataLoaders
batch_size = 16
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=16, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

  from .autonotebook import tqdm as notebook_tqdm


Modified SPOTER BEFORE

In [2]:
import torch.nn as nn

class ModifiedSPOTER(nn.Module):
    def __init__(self, num_classes, seq_len=512, feature_dim=150, hidden_dim=256):
        super(ModifiedSPOTER, self).__init__()
        self.seq_len = seq_len
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        
        # Project input features to hidden dimension
        if feature_dim != hidden_dim:
            self.projection = nn.Linear(feature_dim, hidden_dim)
        else:
            self.projection = nn.Identity()
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=8, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        
        # Classification head
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        # Input shape: (batch_size, 512, 150)
        x = self.projection(x)  # Shape: (batch_size, 512, hidden_dim)
        x = self.transformer(x)  # Shape: (batch_size, 512, hidden_dim)
        x = x.mean(dim=1)  # Shape: (batch_size, hidden_dim) - average pooling over sequence
        return self.fc(x)  # Shape: (batch_size, num_classes)

Modified SPOTER AFTER

In [2]:
import torch
import torch.nn as nn

class ModifiedSPOTER(nn.Module):
    def __init__(self, num_classes, seq_len=512, feature_dim=150, hidden_dim=64, nhead=8, num_encoder_layers=3, dim_feedforward=128, dropout=0.1):
        super(ModifiedSPOTER, self).__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim
        
        # Input projection to reduce dimensionality to hidden_dim (64)
        self.input_projection = nn.Linear(feature_dim, hidden_dim)
        
        # Learnable positional encoding to capture temporal order
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, hidden_dim))
        
        # Transformer encoder with specified layers and parameters
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,           # Matches hidden_dim
            nhead=nhead,                  # Number of attention heads
            dim_feedforward=dim_feedforward,  # Feedforward network size
            dropout=dropout,              # Dropout for regularization
            batch_first=True              # Input shape: (batch_size, seq_len, hidden_dim)
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        # Classification head to map to the number of classes
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        # Input shape: (batch_size, seq_len, feature_dim)
        x = self.input_projection(x)      # Project to (batch_size, seq_len, hidden_dim)
        x = x + self.positional_encoding  # Add positional encoding
        x = self.transformer(x)           # Process through transformer encoder
        x = x[:, -1, :]                   # Use the last token for classification
        return self.fc(x)                 # Output: (batch_size, num_classes)

In [3]:
import torch.optim as optim

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, loss, and optimizer
num_classes = 100 # Replace with len(gesture_folder) or your specific num_classes
# model = ModifiedSPOTER(num_classes=num_classes, seq_len=512, feature_dim=150, hidden_dim=256).to(device)
model = ModifiedSPOTER(num_classes=100, seq_len=512, feature_dim=150).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 250
loss_history = []
val_loss_history = []
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for X_batch, y_batch in train_loader:
        # print(f"Mean: {X_batch.mean().item():.4f}, Std: {X_batch.std().item():.4f}")
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)
    
    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)
    
    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    
    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate on test set
model.eval()
with torch.no_grad():
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)
    accuracy = (test_outputs.argmax(dim=1) == y_test).float().mean()
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}')

Epoch [1/250], Loss: 4.4620, Val Loss: 4.0709, Val Accuracy: 0.0588
Epoch [2/250], Loss: 3.8810, Val Loss: 3.6522, Val Accuracy: 0.0931
Epoch [3/250], Loss: 3.5467, Val Loss: 3.4162, Val Accuracy: 0.1225
Epoch [4/250], Loss: 3.3218, Val Loss: 3.3501, Val Accuracy: 0.1348
Epoch [5/250], Loss: 3.0984, Val Loss: 3.0354, Val Accuracy: 0.2206
Epoch [6/250], Loss: 2.8615, Val Loss: 2.8406, Val Accuracy: 0.2647
Epoch [7/250], Loss: 2.6591, Val Loss: 2.6423, Val Accuracy: 0.2696
Epoch [8/250], Loss: 2.3881, Val Loss: 2.4722, Val Accuracy: 0.3456
Epoch [9/250], Loss: 2.2179, Val Loss: 2.3685, Val Accuracy: 0.3554
Epoch [10/250], Loss: 1.9733, Val Loss: 2.2293, Val Accuracy: 0.3922
Epoch [11/250], Loss: 1.8006, Val Loss: 2.1117, Val Accuracy: 0.4216
Epoch [12/250], Loss: 1.6269, Val Loss: 1.8747, Val Accuracy: 0.4926
Epoch [13/250], Loss: 1.4587, Val Loss: 1.8066, Val Accuracy: 0.4657
Epoch [14/250], Loss: 1.3941, Val Loss: 1.6392, Val Accuracy: 0.5392
Epoch [15/250], Loss: 1.2212, Val Loss: 1.7

Modified SPOTER + Decoder

In [2]:
import torch
import torch.nn as nn

class ModifiedSPOTERWithDecoder(nn.Module):
    def __init__(self, num_classes, seq_len=512, feature_dim=150, hidden_dim=64, nhead=8, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=128, dropout=0.1):
        super(ModifiedSPOTERWithDecoder, self).__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim
        
        # Input projection to reduce dimensionality
        self.input_projection = nn.Linear(feature_dim, hidden_dim)
        
        # Positional encoding for the encoder
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, hidden_dim))
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        # Decoder layer (single layer for simplicity, can increase if needed)
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        
        # Learnable class query for the decoder
        self.class_query = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        
        # Classification head
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        # Input shape: (batch_size, seq_len, feature_dim)
        x = self.input_projection(x)      # Shape: (batch_size, seq_len, hidden_dim)
        x = x + self.positional_encoding  # Add positional encoding
        
        # Encoder processes the sequence
        memory = self.encoder(x)          # Shape: (batch_size, seq_len, hidden_dim)
        
        # Decoder processes the class query with encoder memory
        batch_size = x.size(0)
        tgt = self.class_query.expand(batch_size, 1, self.hidden_dim)  # Shape: (batch_size, 1, hidden_dim)
        output = self.decoder(tgt, memory)  # Shape: (batch_size, 1, hidden_dim)
        
        # Classification from the decoder output
        output = output[:, 0, :]          # Take the single token output: (batch_size, hidden_dim)
        return self.fc(output)            # Shape: (batch_size, num_classes)

In [3]:
import torch.optim as optim

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, loss, and optimizer
num_classes = 100 # Replace with len(gesture_folder) or your specific num_classes
# model = ModifiedSPOTER(num_classes=num_classes, seq_len=512, feature_dim=150, hidden_dim=256).to(device)
model = ModifiedSPOTERWithDecoder(num_classes=100, seq_len=512, feature_dim=150).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 250
loss_history = []
val_loss_history = []
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for X_batch, y_batch in train_loader:
        # print(f"Mean: {X_batch.mean().item():.4f}, Std: {X_batch.std().item():.4f}")
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)
    
    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)
    
    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    
    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate on test set
model.eval()
with torch.no_grad():
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)
    accuracy = (test_outputs.argmax(dim=1) == y_test).float().mean()
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}')

Epoch [1/250], Loss: 4.4207, Val Loss: 4.0525, Val Accuracy: 0.0441
Epoch [2/250], Loss: 3.9496, Val Loss: 3.7201, Val Accuracy: 0.1127
Epoch [3/250], Loss: 3.6271, Val Loss: 3.4813, Val Accuracy: 0.1152
Epoch [4/250], Loss: 3.4278, Val Loss: 3.3735, Val Accuracy: 0.1176
Epoch [5/250], Loss: 3.2567, Val Loss: 3.1855, Val Accuracy: 0.1495
Epoch [6/250], Loss: 3.0641, Val Loss: 2.9859, Val Accuracy: 0.1936
Epoch [7/250], Loss: 2.8419, Val Loss: 2.8291, Val Accuracy: 0.2255
Epoch [8/250], Loss: 2.6468, Val Loss: 2.6726, Val Accuracy: 0.2721
Epoch [9/250], Loss: 2.3776, Val Loss: 2.3464, Val Accuracy: 0.4020
Epoch [10/250], Loss: 2.2124, Val Loss: 2.2310, Val Accuracy: 0.3922
Epoch [11/250], Loss: 1.9727, Val Loss: 2.0450, Val Accuracy: 0.4069
Epoch [12/250], Loss: 1.7670, Val Loss: 1.9474, Val Accuracy: 0.4583
Epoch [13/250], Loss: 1.6192, Val Loss: 1.7661, Val Accuracy: 0.4755
Epoch [14/250], Loss: 1.4798, Val Loss: 1.7092, Val Accuracy: 0.4755
Epoch [15/250], Loss: 1.3415, Val Loss: 1.5

FlattenedSpoter + SPOTER Decoder

In [1]:
import torch
import torch.nn as nn

class CustomDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super(CustomDecoderLayer, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU() if activation == "relu" else nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        # Cross-attention with memory (encoder output)
        tgt2, _ = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        
        # Feed-forward network
        tgt2 = self.feed_forward(tgt)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        return tgt

class CustomDecoder(nn.Module):
    def __init__(self, decoder_layer, num_layers):
        super(CustomDecoder, self).__init__()
        self.layers = nn.ModuleList([decoder_layer for _ in range(num_layers)])
        
    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        for layer in self.layers:
            tgt = layer(tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask)
        return tgt

class ModifiedSPOTERWithDecoder(nn.Module):
    def __init__(self, num_classes, seq_len=512, feature_dim=150, hidden_dim=64, nhead=8, num_encoder_layers=3, num_decoder_layers=3, dim_feedforward=128, dropout=0.1):
        super(ModifiedSPOTERWithDecoder, self).__init__()
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim
        
        # Input projection to reduce dimensionality
        self.input_projection = nn.Linear(feature_dim, hidden_dim)
        
        # Positional encoding for encoder
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, hidden_dim))
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        # Custom decoder layer without self-attention
        decoder_layer = CustomDecoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation="relu"
        )
        self.decoder = CustomDecoder(decoder_layer, num_layers=num_decoder_layers)
        
        # Learnable class query
        self.class_query = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        
        # Classification head
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        # Input shape: (batch_size, seq_len, feature_dim)
        x = self.input_projection(x)      # Shape: (batch_size, seq_len, hidden_dim)
        x = x + self.positional_encoding  # Add positional encoding
        
        # Encoder processes the sequence
        memory = self.encoder(x)          # Shape: (batch_size, seq_len, hidden_dim)
        
        # Decoder processes the class query with encoder memory
        batch_size = x.size(0)
        tgt = self.class_query.expand(batch_size, 1, self.hidden_dim)  # Shape: (batch_size, 1, hidden_dim)
        output = self.decoder(tgt, memory)  # Shape: (batch_size, 1, hidden_dim)
        
        # Classification from the decoder output
        output = output[:, 0, :]          # Shape: (batch_size, hidden_dim)
        return self.fc(output)            # Shape: (batch_size, num_classes)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Assuming your data loading and preprocessing code remains the same
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load and preprocess data
print('Loading keypoints...')
X = np.load('train/X_TRAIN_landmarks.npy')
y = np.load('train/y_TRAIN_landmarks.npy')
print('Done Loading!')
y = tf.keras.utils.to_categorical(y, num_classes=100)
y_labels = np.argmax(y, axis=1)

pose_indices = [i*4 for i in range(33)] + [i*4+1 for i in range(33)]
left_hand_indices = [132 + i*3 for i in range(21)] + [132 + i*3+1 for i in range(21)]
right_hand_indices = [132 + 63 + i*3 for i in range(21)] + [132 + 63 + i*3+1 for i in range(21)]
all_indices = pose_indices + left_hand_indices + right_hand_indices
X_xy = X[:, :, all_indices]

X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(
    X_xy, y, test_size=0.2, stratify=y_labels, random_state=42
)
X_test_ori, X_val_ori, y_test_ori, y_val_ori = train_test_split(
    X_test_ori, y_test_ori, test_size=0.5, stratify=y_test_ori.argmax(axis=1), random_state=42
)

X_train = torch.tensor(X_train_ori, dtype=torch.float32)
X_val = torch.tensor(X_val_ori, dtype=torch.float32)
X_test = torch.tensor(X_test_ori, dtype=torch.float32)
y_train = torch.tensor(y_train_ori.argmax(axis=1), dtype=torch.long)
y_val = torch.tensor(y_val_ori.argmax(axis=1), dtype=torch.long)
y_test = torch.tensor(y_test_ori.argmax(axis=1), dtype=torch.long)

batch_size = 16
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=16, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

# Define the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ModifiedSPOTERWithDecoder(num_classes=100, seq_len=512, feature_dim=150).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 250
loss_history = []
val_loss_history = []
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    loss_history.append(avg_loss)
    
    # Validation phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += y_batch.size(0)
    
    avg_val_loss = val_loss / len(val_loader)
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    
    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate on test set
model.eval()
with torch.no_grad():
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)
    accuracy = (test_outputs.argmax(dim=1) == y_test).float().mean()
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}')

Loading keypoints...
Done Loading!
Epoch [1/250], Loss: 4.6882, Val Loss: 4.5755, Val Accuracy: 0.0245
Epoch [2/250], Loss: 4.5213, Val Loss: 4.4309, Val Accuracy: 0.0245
Epoch [3/250], Loss: 4.4015, Val Loss: 4.3562, Val Accuracy: 0.0245
Epoch [4/250], Loss: 4.3303, Val Loss: 4.3018, Val Accuracy: 0.0294
Epoch [5/250], Loss: 4.2805, Val Loss: 4.2623, Val Accuracy: 0.0343
Epoch [6/250], Loss: 4.2296, Val Loss: 4.2070, Val Accuracy: 0.0539
Epoch [7/250], Loss: 4.1643, Val Loss: 4.1458, Val Accuracy: 0.0637
Epoch [8/250], Loss: 4.1106, Val Loss: 4.0835, Val Accuracy: 0.0637
Epoch [9/250], Loss: 4.0526, Val Loss: 4.0095, Val Accuracy: 0.0833
Epoch [10/250], Loss: 3.9818, Val Loss: 3.9525, Val Accuracy: 0.0882
Epoch [11/250], Loss: 3.9375, Val Loss: 3.9084, Val Accuracy: 0.1078
Epoch [12/250], Loss: 3.8749, Val Loss: 3.8544, Val Accuracy: 0.0882
Epoch [13/250], Loss: 3.8458, Val Loss: 3.8052, Val Accuracy: 0.0980
Epoch [14/250], Loss: 3.7886, Val Loss: 3.7636, Val Accuracy: 0.0882
Epoch [1