In [1]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# -------------------------------
# Load Pre-Extracted Data
# -------------------------------
# X is expected to have shape (2038, 512, 258) and y shape (2038,)
X = np.load('X_TRAIN_normalized.npy')
y = np.load('y_TRAIN_normalized.npy')
print("X shape:", X.shape, "y shape:", y.shape)

# Size the classifier head from the largest label id instead of the number of
# distinct labels: CrossEntropyLoss needs every target in [0, num_classes), and
# counting unique values undercounts when ids have gaps or start at 1.
num_classes = int(y.max()) + 1
print("Number of classes:", num_classes)

# -------------------------------
# PyTorch Dataset for Sign Language Videos
# -------------------------------
class SignLanguageDataset(Dataset):
    """In-memory dataset pairing per-video feature sequences with class labels.

    Both arrays are copied into tensors once at construction time, so indexing
    is a plain tensor lookup.
    """

    def __init__(self, X, y):
        """
        X: NumPy array of shape (num_samples, seq_len, 258)
        y: NumPy array of shape (num_samples,)

        Raises:
            ValueError: if X and y do not contain the same number of samples.
        """
        # Fail early: a length mismatch would otherwise only surface as an
        # index error somewhere in the middle of an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position idx."""
        return self.X[idx], self.y[idx]

# Small batches, similar to the paper's setting (e.g., 4 for WLASL)
batch_size = 4
dataset = SignLanguageDataset(X, y)
# Reshuffle the samples at the start of every epoch
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# -------------------------------
# Positional Encoding Module
# -------------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    The table is precomputed for `max_len` positions and stored as a
    non-trainable buffer of shape (max_len, 1, d_model).
    """

    def __init__(self, d_model, dropout=0.1, max_len=600):
        """
        d_model: Feature dimension of the sequence elements (even or odd)
        dropout: Dropout probability applied after adding the encoding
        max_len: Number of positions to precompute
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to avoid a shape mismatch on assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(1)  # Shape: (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return x plus the encoding of its first x.size(0) positions, after dropout.

        x shape: (seq_len, batch_size, d_model)
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# -------------------------------
# Transformer Encoder–Only Classifier
# -------------------------------
class TransformerEncoderClassifier(nn.Module):
    """Encoder-only transformer that classifies a sequence through a learnable [CLS] token."""

    def __init__(self, input_dim=258, seq_len=512, d_model=256, num_layers=4, nhead=8, num_classes=100, dropout=0.1):
        """
        input_dim: Dimensionality of each frame's features (258)
        seq_len: Number of frames per video (512)
        d_model: Model dimension after projection
        num_layers: Number of transformer encoder layers
        nhead: Number of attention heads
        num_classes: Number of output classes
        dropout: Dropout probability for the positional encoding and encoder layers
        """
        super().__init__()
        # Frame features -> model dimension
        self.input_proj = nn.Linear(input_dim, d_model)
        # One extra position is reserved for the prepended [CLS] token
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout, max_len=seq_len + 1)

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout),
            num_layers=num_layers,
        )

        # Learnable [CLS] token whose encoded state feeds the classifier head
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Map x of shape (batch_size, seq_len, input_dim) to logits of shape (batch_size, num_classes)."""
        n_samples = x.size(0)
        embedded = self.input_proj(x)  # (batch_size, seq_len, d_model)

        # Put one copy of the [CLS] token in front of every sequence
        cls = self.cls_token.expand(n_samples, -1, -1)  # (batch_size, 1, d_model)
        tokens = torch.cat([cls, embedded], dim=1)  # (batch_size, seq_len+1, d_model)

        # The encoder is fed sequence-first input: (seq_len+1, batch_size, d_model)
        tokens = tokens.transpose(0, 1)
        encoded = self.transformer_encoder(self.positional_encoding(tokens))

        # Position 0 holds the encoded [CLS] token: (batch_size, d_model)
        return self.fc(encoded[0])

# -------------------------------
# Model, Loss, and Optimizer Setup
# -------------------------------
# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Take the feature size and sequence length from the loaded array instead of
# hard-coding 258 / 512, so the model always matches the data on disk.
model = TransformerEncoderClassifier(
    input_dim=X.shape[-1],
    seq_len=X.shape[1],
    d_model=256,
    num_layers=4,
    nhead=8,
    num_classes=num_classes,
    dropout=0.1
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# -------------------------------
# Training Loop
# -------------------------------
num_epochs = 200  # Adjust based on your needs
for epoch in range(num_epochs):
    model.train()
    # Per-epoch accumulators: summed per-sample loss, hits, and samples seen
    running_loss, correct, total = 0.0, 0, 0
    for batch_X, batch_y in dataloader:
        # (batch_size, 512, 258) features and (batch_size,) labels
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)  # (batch_size, num_classes)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average.
        running_loss += batch_X.size(0) * loss.item()
        predicted = outputs.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

    epoch_loss = running_loss / total
    epoch_acc = correct / total * 100
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")
    

  from .autonotebook import tqdm as notebook_tqdm


X shape: (2038, 512, 258) y shape: (2038,)
Number of classes: 100


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import os
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# # -------------------------------
# # Positional Encoding Module
# # -------------------------------
# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, dropout=0.1, max_len=600):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=dropout)
        
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(1)  # Shape: (max_len, 1, d_model)
#         self.register_buffer('pe', pe)
        
#     def forward(self, x):
#         # x shape: (seq_len, batch_size, d_model)
#         x = x + self.pe[:x.size(0)]
#         return self.dropout(x)

# # -------------------------------
# # Transformer Encoder–Only Classifier
# # -------------------------------
# class TransformerEncoderClassifier(nn.Module):
#     def __init__(self, input_dim=258, seq_len=512, d_model=256, num_layers=4, nhead=8, num_classes=100, dropout=0.1):
#         """
#         input_dim: Dimensionality of each frame's features (258)
#         seq_len: Number of frames per video (512)
#         d_model: Model dimension after projection
#         num_layers: Number of transformer encoder layers
#         nhead: Number of attention heads
#         num_classes: Number of output classes
#         """
#         super(TransformerEncoderClassifier, self).__init__()
#         # Project the input to the model dimension
#         self.input_proj = nn.Linear(input_dim, d_model)
#         self.positional_encoding = PositionalEncoding(d_model, dropout=dropout, max_len=seq_len+1)
        
#         encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
#         # Define a learnable [CLS] token for classification
#         self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
#         self.fc = nn.Linear(d_model, num_classes)
        
#     def forward(self, x):
#         # x shape: (batch_size, seq_len, input_dim)
#         batch_size = x.size(0)
#         # Project input to d_model
#         x = self.input_proj(x)  # (batch_size, seq_len, d_model)
        
#         # Prepend the CLS token to each sequence
#         cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # (batch_size, 1, d_model)
#         x = torch.cat([cls_tokens, x], dim=1)  # (batch_size, seq_len+1, d_model)
        
#         # Transformer encoder expects input shape (seq_len+1, batch_size, d_model)
#         x = x.transpose(0, 1)  # (seq_len+1, batch_size, d_model)
        
#         # Add positional encoding
#         x = self.positional_encoding(x)
        
#         # Pass through transformer encoder
#         x = self.transformer_encoder(x)  # (seq_len+1, batch_size, d_model)
        
#         # Use the output of the CLS token for classification
#         cls_output = x[0]  # (batch_size, d_model)
#         logits = self.fc(cls_output)  # (batch_size, num_classes)
#         return logits

In [2]:
video_directory = 'Latest-WLASL-100'

# One entry per gesture under the dataset root.
# NOTE(review): os.listdir returns entries in arbitrary order, so anything
# derived from this ordering (e.g. label ids) may differ between machines —
# confirm it matches the ordering used when the saved labels were produced.
gesture_folder = np.array(os.listdir(video_directory))

total = 0
for gestures in gesture_folder:
    gesture_dir = os.path.join(video_directory, gestures)
    # Only sub-directories are counted as videos
    gesture = [
        fname
        for fname in os.listdir(gesture_dir)
        if os.path.isdir(os.path.join(gesture_dir, fname))
    ]
    total += len(gesture)

print("Total gestures: ", len(gesture_folder), "; Total videos: ", total)

Total gestures:  100 ; Total videos:  4076


In [3]:
# Gesture name -> integer id, numbered in gesture_folder order
label_map = dict(zip(gesture_folder, range(len(gesture_folder))))
len(label_map)

100

In [5]:
# Load the saved feature and label arrays
X, y = np.load('X_TRAIN_landmarks_flipped.npy'), np.load('y_TRAIN_landmarks_flipped.npy')

print(X.shape, y.shape)

(4076, 512, 258) (4076,)


In [6]:
# One-hot encode the labels, with one column per gesture
y = tf.keras.utils.to_categorical(y, num_classes=gesture_folder.size)
y.shape

(4076, 100)

In [7]:
# Collapse the one-hot rows back to class indices
y_labels = y.argmax(axis=1)

In [8]:
# Hold out 20% of the samples, stratified by class, with a fixed seed.
# NOTE(review): the file name suggests flipped copies of clips are included; if
# a clip and its mirror can land on different sides of this random split, the
# held-out scores would be optimistic — TODO confirm how the array is laid out.
X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y_labels,
)

In [9]:
# Halve the held-out portion into test and validation sets, again stratified by class
X_test_ori, X_val_ori, y_test_ori, y_val_ori = train_test_split(
    X_test_ori,
    y_test_ori,
    test_size=0.5,
    random_state=42,
    stratify=np.argmax(y_test_ori, axis=1),
)

In [10]:
# Show the label shapes of the train / test / validation splits
tuple(split.shape for split in (y_train_ori, y_test_ori, y_val_ori))

((3260, 100), (408, 100), (408, 100))

In [11]:
import torch
import torch.nn as nn
import math
from torch.utils.data import DataLoader, TensorDataset

# -------------------------------
# Positional Encoding Module
# -------------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    The table is precomputed for `max_len` positions and stored as a
    non-trainable buffer of shape (max_len, 1, d_model).
    """

    def __init__(self, d_model, dropout=0.1, max_len=600):
        """
        d_model: Feature dimension of the sequence elements (even or odd)
        dropout: Dropout probability applied after adding the encoding
        max_len: Number of positions to precompute
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to avoid a shape mismatch on assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(1)  # Shape: (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return x plus the encoding of its first x.size(0) positions, after dropout.

        x shape: (seq_len, batch_size, d_model)
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# -------------------------------
# Transformer Encoder–Only Classifier
# -------------------------------
class TransformerEncoderClassifier(nn.Module):
    """Encoder-only transformer that classifies a sequence through a learnable [CLS] token."""

    def __init__(self, input_dim=258, seq_len=512, d_model=256, num_layers=4, nhead=8, num_classes=100, dropout=0.1):
        """
        input_dim: Dimensionality of each frame's features (258)
        seq_len: Number of frames per video (512)
        d_model: Model dimension after projection
        num_layers: Number of transformer encoder layers
        nhead: Number of attention heads
        num_classes: Number of output classes
        dropout: Dropout probability for the positional encoding and encoder layers
        """
        super().__init__()
        # Frame features -> model dimension
        self.input_proj = nn.Linear(input_dim, d_model)
        # One extra position is reserved for the prepended [CLS] token
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout, max_len=seq_len + 1)

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout),
            num_layers=num_layers,
        )

        # Learnable [CLS] token whose encoded state feeds the classifier head
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Map x of shape (batch_size, seq_len, input_dim) to logits of shape (batch_size, num_classes)."""
        n_samples = x.size(0)
        embedded = self.input_proj(x)  # (batch_size, seq_len, d_model)

        # Put one copy of the [CLS] token in front of every sequence
        cls = self.cls_token.expand(n_samples, -1, -1)  # (batch_size, 1, d_model)
        tokens = torch.cat([cls, embedded], dim=1)  # (batch_size, seq_len+1, d_model)

        # The encoder is fed sequence-first input: (seq_len+1, batch_size, d_model)
        tokens = tokens.transpose(0, 1)
        encoded = self.transformer_encoder(self.positional_encoding(tokens))

        # Position 0 holds the encoded [CLS] token: (batch_size, d_model)
        return self.fc(encoded[0])

# Convert data to tensors; one-hot targets are collapsed to class indices,
# which is the target format nn.CrossEntropyLoss is used with below.
X_train = torch.tensor(X_train_ori, dtype=torch.float32)
X_test = torch.tensor(X_test_ori, dtype=torch.float32)
y_train = torch.tensor(y_train_ori.argmax(axis=1), dtype=torch.long)
y_test = torch.tensor(y_test_ori.argmax(axis=1), dtype=torch.long)
X_val = torch.tensor(X_val_ori, dtype=torch.float32)
y_val = torch.tensor(y_val_ori.argmax(axis=1), dtype=torch.long)

# DataLoader: all three loaders share one batch size so that changing it in a
# single place cannot leave the validation loader on a stale value.
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

# Prefer the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model dimensions are read from the training tensor
input_size = X_train.shape[-1]
seq_len = X_train.size(1) if X_train.dim() > 2 else 1  # Sequence length if the tensor has one
num_classes = len(label_map)

# Build the transformer classifier, then move it to the chosen device
model = TransformerEncoderClassifier(
    input_dim=input_size,
    seq_len=seq_len,
    d_model=256,
    num_layers=4,
    nhead=8,
    num_classes=num_classes,
    dropout=0.1,
)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# NOTE(review): the recorded run of this notebook stays near a loss of 4.6
# (about ln(100), i.e. chance level for 100 classes); lr=0.001 may be too high
# for this model — consider a lower rate or warmup, TODO confirm by experiment.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Training loop
num_epochs = 400
loss_history = []
val_loss_history = []

# Training stops early once the average training loss drops below this value
loss_threshold = 0.1

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    train_seen = 0

    for X_batch, y_batch in train_loader:

        # Move batch to the selected device
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss (mean over the batch)
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        # Weight each batch mean by its size: the last batch can be smaller, so
        # a plain mean of batch means would not be the true per-sample average.
        n_batch = y_batch.size(0)
        epoch_loss += loss.item() * n_batch
        train_seen += n_batch

    avg_loss = epoch_loss / train_seen
    loss_history.append(avg_loss)

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)  # Forward pass
            loss = criterion(outputs, y_batch)  # Compute loss
            n_batch = y_batch.size(0)
            val_loss += loss.item() * n_batch  # Accumulate per-sample validation loss

            # Compute accuracy
            predictions = outputs.argmax(dim=1)
            correct += (predictions == y_batch).sum().item()
            total += n_batch

    avg_val_loss = val_loss / total
    val_loss_history.append(avg_val_loss)
    val_accuracy = correct / total

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    if avg_loss < loss_threshold:
        print(f'Loss threshold of {loss_threshold} reached. Stopping training.')
        break

# Evaluate the model on the held-out test split.
# The test set is fed through test_loader in batches instead of one forward
# pass over the whole split, which keeps peak memory bounded by the batch size.
# Logits are gathered on the CPU, so test_outputs, test_loss and accuracy are
# CPU tensors and X_test / y_test are left where they were.
model.eval()
test_logits = []
test_targets = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        test_logits.append(model(X_batch.to(device)).cpu())
        test_targets.append(y_batch)

    # test_loader does not shuffle, so rows line up with the original test order
    test_outputs = torch.cat(test_logits)
    test_labels = torch.cat(test_targets)

    test_loss = criterion(test_outputs, test_labels)
    accuracy = (test_outputs.argmax(dim=1) == test_labels).float().mean()
    print(f'Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}')

Epoch [1/400], Loss: 4.7677, Val Loss: 4.6690, Val Accuracy: 0.0098
Epoch [2/400], Loss: 4.7067, Val Loss: 4.6518, Val Accuracy: 0.0172
Epoch [3/400], Loss: 4.6837, Val Loss: 4.6406, Val Accuracy: 0.0196
Epoch [4/400], Loss: 4.6761, Val Loss: 4.6293, Val Accuracy: 0.0196
Epoch [5/400], Loss: 4.6593, Val Loss: 4.6205, Val Accuracy: 0.0098
Epoch [6/400], Loss: 4.6527, Val Loss: 4.5398, Val Accuracy: 0.0221
Epoch [7/400], Loss: 4.6514, Val Loss: 4.6131, Val Accuracy: 0.0196
Epoch [8/400], Loss: 4.6454, Val Loss: 4.6161, Val Accuracy: 0.0196
Epoch [9/400], Loss: 4.6445, Val Loss: 4.6069, Val Accuracy: 0.0196
Epoch [10/400], Loss: 4.6396, Val Loss: 4.6044, Val Accuracy: 0.0147
Epoch [11/400], Loss: 4.6391, Val Loss: 4.6023, Val Accuracy: 0.0172
Epoch [12/400], Loss: 4.6338, Val Loss: 4.6041, Val Accuracy: 0.0196
Epoch [13/400], Loss: 4.6271, Val Loss: 4.6060, Val Accuracy: 0.0172
Epoch [14/400], Loss: 4.6256, Val Loss: 4.6017, Val Accuracy: 0.0196
Epoch [15/400], Loss: 4.6244, Val Loss: 4.6

KeyboardInterrupt: 