<a href="https://colab.research.google.com/github/ValentinaEmili/Sign_language/blob/main/ASL_recognition_100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The glossary is made of 100 different words but the instances for each word are not the same as the ones in WLASL_v0.3 file. Indeed, some links were broken and the correspective instances have been removed. Every word has at least one instance.

In [1]:
# mount google drive on colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
import pandas as pd
import cv2
from google.colab.patches import cv2_imshow
from tqdm import tqdm
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence
from torch.nn import LSTM
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
import shutil

In [3]:
js_100 = pd.read_json("/content/drive/MyDrive/NLP/WLASL100.json")
folder = "/content/drive/MyDrive/NLP/dataset/subset_100/"
original_folder = "/content/drive/MyDrive/NLP/dataset/"

training_folder = folder + "train/"
validation_folder = folder + "val/"
test_folder = folder + "test/"

training_video = training_folder + "video/"
validation_video = validation_folder + "video/"
test_video = test_folder + "video/"

training_images = training_folder + "images/"
validation_images = validation_folder + "images/"
test_images = test_folder + "images/"

os.makedirs(training_video, exist_ok=True)
os.makedirs(validation_video, exist_ok=True)
os.makedirs(test_video, exist_ok=True)

os.makedirs(training_images, exist_ok=True)
os.makedirs(validation_images, exist_ok=True)
os.makedirs(test_images, exist_ok=True)


Preprocess the data

In [4]:
train_gloss, val_gloss, test_gloss = set(), set(), set()
for image in os.listdir(training_images):
  word, _ = image.split("_")
  train_gloss.add(word)

for image in os.listdir(validation_images):
  word, _ = image.split("_")
  val_gloss.add(word)

for image in os.listdir(test_images):
  word, _ = image.split("_")
  test_gloss.add(word)
gloss = sorted(list(train_gloss | val_gloss | test_gloss))


label_map = {label: num for num, label in enumerate(gloss)}

Build and train LSTM Neural Network

In [5]:
class SignLanguageDataset(Dataset):
  def __init__(self, image_dir, label_map):
     self.image_dir = image_dir
     self.label_map = label_map
     self.files = sorted(os.listdir(image_dir))

  def __len__(self):
    return len(self.files)

  def __getitem__(self, idx):
    file_name = self.files[idx]
    np_array = np.load(os.path.join(self.image_dir, file_name))
    if np_array.size == 0 or len(np_array.shape) != 2 or np_array.shape[1] != 258:
      print(f"Warning: Empty or invalid shape for file: {file_name}")
      np_array = np.zeros((1, 258), dtype=np.float32)

    label, _ = file_name.split("_")
    label = self.label_map[label]

    return torch.tensor(np_array, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# Add zero-padding to get sequences of the same length for each batch
def collate_fn(batch):
  sequences, labels = zip(*batch)
  lengths = [len(seq) for seq in sequences]
  padded_sequences = pad_sequence(sequences, batch_first=True)

  # pack the padded sequence
  packed_sequences = pack_padded_sequence(padded_sequences, lengths, batch_first=True, enforce_sorted=False)
  return packed_sequences, torch.tensor(labels)

train_dataset = SignLanguageDataset(training_images, label_map)
val_dataset = SignLanguageDataset(validation_images, label_map)
test_dataset = SignLanguageDataset(test_images, label_map)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [70]:
class SignLanguageLSTM(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers=4, num_classes=0, dropout_rate=0.5):
    super(SignLanguageLSTM, self).__init__()

    # input regularization
    self.input_bn = nn.BatchNorm1d(input_size)
    self.input_dropout = nn.Dropout(0.3)

    # single bidirectional LSTM layer
    self.lstm = nn.LSTM(
        input_size=input_size,
        hidden_size=hidden_size,
        batch_first=True,
        num_layers=num_layers,
        dropout=dropout_rate,
        bidirectional=True)

    # fully connected layers
    self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
    self.fc2 = nn.Linear(hidden_size, num_classes)

    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, packed_input):
    # unpack input for batch normalization
    padded_input, lengths = pad_packed_sequence(packed_input, batch_first=True)

    # apply input normalization and dropout
    padded_input = padded_input.transpose(1, 2)
    padded_input = self.input_bn(padded_input)
    padded_input = padded_input.transpose(1, 2)
    padded_input = self.input_dropout(padded_input)

    # re-pack input
    packed_input = pack_padded_sequence(padded_input, lengths, batch_first=True, enforce_sorted=False)

    # LSTM
    packed_output, (hn, cn) = self.lstm(packed_input)

    output_forward = hn[0, :, :] # last hidden state for forward direction
    output_backward = hn[1, :, :] # last hidden state for backward direction
    output = torch.cat((output_forward, output_backward), dim=1)

    output = F.relu(self.fc1(output))
    output = self.fc2(self.dropout(output))

    return output

In [59]:
all_labels = [image.split("_")[0] for image in os.listdir(training_images)]
label_counts = {label: all_labels.count(label) for label in label_map}
weight = sum(label_counts.values()) / len(label_counts)
weights = torch.tensor([weight / count for _, count in label_counts.items()], dtype=torch.float32)

In [None]:
# training configuration tailored for small datasets
def get_training_config():
  return {
    'hidden_size': 256, #{64, 128, 256}
    'learning_rate': 1e-3, # 'learning_rate': {1e-3, 1e-4, 1e-5}
    'num_epochs': 100,
    'weight_decay': 1e-4,
    'dropout_rate': 0.3,
    'scheduler_params': {
      'factor': 0.5,
      'min_lr': 1e-6
    },
  }

best_accuracy = 0.0
training_history = []
config = get_training_config()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = config['num_epochs']

model = SignLanguageLSTM(
    input_size=258,
    hidden_size = config['hidden_size'],
    num_classes = len(label_map),
    dropout_rate=config['dropout_rate']).to(device)

criterion = nn.CrossEntropyLoss() # for multi-class classification
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'])

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=config['scheduler_params']['factor'],
    min_lr=config['scheduler_params']['min_lr'],
    patience=5)

for epoch in range(num_epochs):
  model.train()
  running_loss = 0.0

  for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs} [Train]'):
    inputs, labels = inputs.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()

    running_loss += loss.item()

  avg_train_loss = running_loss / len(train_loader)
  print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}')

  # evaluation phase
  model.eval()
  val_loss, correct, total = 0, 0, 0

  with torch.no_grad():
    for inputs, labels in tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs} [Valid]'):
      inputs, labels = inputs.to(device), labels.to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      val_loss += loss.item()

      _, predicted = torch.max(outputs, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
  accuracy = correct / total
  avg_val_loss = val_loss / len(val_loader)

  # update learning rate based on validation accuracy
  scheduler.step(accuracy)

  print(f'Validation Accuracy: {accuracy * 100:.2f}%')
  print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_val_loss:.4f}')

  # store training history
  training_history.append({
    'epoch': epoch + 1,
    'train_loss': avg_train_loss,
    'val_loss': avg_val_loss,
    'acc': round(accuracy * 100, 2),  # store as percentage
    'lr': optimizer.param_groups[0]['lr']
    })

  # save best model
  if accuracy > best_accuracy:
    best_accuracy = accuracy
    torch.save({
      'epoch': epoch,
      'model_state_dict': model.state_dict(),
      'optimizer_state_dict': optimizer.state_dict(),
      'accuracy': accuracy, # saved as decimal
      'val_loss': avg_val_loss,
    }, '/content/drive/MyDrive/NLP/saved_models/best_model_100.pth')
    print(f'Saved new best model with accuracy: {best_accuracy * 100:.2f}%\n-----')

Epoch 1/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.21it/s]


Epoch [1/100], Loss: 4.6332


Epoch 1/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.41it/s]


Validation Accuracy: 1.42%
Epoch [1/100], Validation Loss: 4.5999
Saved new best model with accuracy: 1.42%
-----


Epoch 2/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.11it/s]


Epoch [2/100], Loss: 4.5024


Epoch 2/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 14.97it/s]


Validation Accuracy: 1.90%
Epoch [2/100], Validation Loss: 4.4280
Saved new best model with accuracy: 1.90%
-----


Epoch 3/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.99it/s]


Epoch [3/100], Loss: 4.2860


Epoch 3/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.71it/s]


Validation Accuracy: 2.37%
Epoch [3/100], Validation Loss: 4.3490
Saved new best model with accuracy: 2.37%
-----


Epoch 4/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.60it/s]


Epoch [4/100], Loss: 4.1694


Epoch 4/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 14.78it/s]


Validation Accuracy: 0.95%
Epoch [4/100], Validation Loss: 4.3077


Epoch 5/100 [Train]: 100%|██████████| 58/58 [00:06<00:00,  8.45it/s]


Epoch [5/100], Loss: 4.1042


Epoch 5/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.53it/s]


Validation Accuracy: 1.42%
Epoch [5/100], Validation Loss: 4.2865


Epoch 6/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.46it/s]


Epoch [6/100], Loss: 4.0225


Epoch 6/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.13it/s]


Validation Accuracy: 2.84%
Epoch [6/100], Validation Loss: 4.2856
Saved new best model with accuracy: 2.84%
-----


Epoch 7/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.04it/s]


Epoch [7/100], Loss: 3.9866


Epoch 7/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.13it/s]


Validation Accuracy: 2.37%
Epoch [7/100], Validation Loss: 4.2334


Epoch 8/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.51it/s]


Epoch [8/100], Loss: 3.9108


Epoch 8/100 [Valid]: 100%|██████████| 14/14 [00:02<00:00,  6.26it/s]


Validation Accuracy: 0.95%
Epoch [8/100], Validation Loss: 4.2754


Epoch 9/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.76it/s]


Epoch [9/100], Loss: 3.8176


Epoch 9/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.01it/s]


Validation Accuracy: 3.32%
Epoch [9/100], Validation Loss: 4.1093
Saved new best model with accuracy: 3.32%
-----


Epoch 10/100 [Train]: 100%|██████████| 58/58 [00:05<00:00,  9.78it/s]


Epoch [10/100], Loss: 3.7443


Epoch 10/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.84it/s]


Validation Accuracy: 3.32%
Epoch [10/100], Validation Loss: 4.1407


Epoch 11/100 [Train]: 100%|██████████| 58/58 [00:06<00:00,  8.99it/s]


Epoch [11/100], Loss: 3.6358


Epoch 11/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.28it/s]


Validation Accuracy: 2.84%
Epoch [11/100], Validation Loss: 4.0802


Epoch 12/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.63it/s]


Epoch [12/100], Loss: 3.5524


Epoch 12/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.18it/s]


Validation Accuracy: 4.27%
Epoch [12/100], Validation Loss: 4.1151
Saved new best model with accuracy: 4.27%
-----


Epoch 13/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.92it/s]


Epoch [13/100], Loss: 3.4405


Epoch 13/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.38it/s]


Validation Accuracy: 5.69%
Epoch [13/100], Validation Loss: 4.0917
Saved new best model with accuracy: 5.69%
-----


Epoch 14/100 [Train]: 100%|██████████| 58/58 [00:06<00:00,  8.41it/s]


Epoch [14/100], Loss: 3.3103


Epoch 14/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.36it/s]


Validation Accuracy: 7.11%
Epoch [14/100], Validation Loss: 4.0071
Saved new best model with accuracy: 7.11%
-----


Epoch 15/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.03it/s]


Epoch [15/100], Loss: 3.2772


Epoch 15/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.05it/s]


Validation Accuracy: 6.64%
Epoch [15/100], Validation Loss: 3.9792


Epoch 16/100 [Train]: 100%|██████████| 58/58 [00:07<00:00,  7.75it/s]


Epoch [16/100], Loss: 3.1644


Epoch 16/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.35it/s]


Validation Accuracy: 6.16%
Epoch [16/100], Validation Loss: 4.0446


Epoch 17/100 [Train]: 100%|██████████| 58/58 [00:04<00:00, 11.83it/s]


Epoch [17/100], Loss: 2.9817


Epoch 17/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.45it/s]


Validation Accuracy: 7.58%
Epoch [17/100], Validation Loss: 4.0626
Saved new best model with accuracy: 7.58%
-----


Epoch 18/100 [Train]: 100%|██████████| 58/58 [00:05<00:00,  9.82it/s]


Epoch [18/100], Loss: 2.9836


Epoch 18/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.54it/s]


Validation Accuracy: 9.95%
Epoch [18/100], Validation Loss: 4.0103
Saved new best model with accuracy: 9.95%
-----


Epoch 19/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.17it/s]


Epoch [19/100], Loss: 2.8803


Epoch 19/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.62it/s]


Validation Accuracy: 7.11%
Epoch [19/100], Validation Loss: 4.0044


Epoch 20/100 [Train]: 100%|██████████| 58/58 [00:07<00:00,  7.77it/s]


Epoch [20/100], Loss: 2.7529


Epoch 20/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.91it/s]


Validation Accuracy: 6.64%
Epoch [20/100], Validation Loss: 3.9647


Epoch 21/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.42it/s]


Epoch [21/100], Loss: 2.6841


Epoch 21/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.18it/s]


Validation Accuracy: 7.11%
Epoch [21/100], Validation Loss: 4.0255


Epoch 22/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.83it/s]


Epoch [22/100], Loss: 2.5390


Epoch 22/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.83it/s]


Validation Accuracy: 7.11%
Epoch [22/100], Validation Loss: 4.1390


Epoch 23/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.47it/s]


Epoch [23/100], Loss: 2.4222


Epoch 23/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.91it/s]


Validation Accuracy: 11.37%
Epoch [23/100], Validation Loss: 4.1534
Saved new best model with accuracy: 11.37%
-----


Epoch 24/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.20it/s]


Epoch [24/100], Loss: 2.3725


Epoch 24/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.70it/s]


Validation Accuracy: 9.95%
Epoch [24/100], Validation Loss: 4.1511


Epoch 25/100 [Train]: 100%|██████████| 58/58 [00:04<00:00, 11.61it/s]


Epoch [25/100], Loss: 2.2331


Epoch 25/100 [Valid]: 100%|██████████| 14/14 [00:02<00:00,  5.25it/s]


Validation Accuracy: 12.80%
Epoch [25/100], Validation Loss: 4.1167
Saved new best model with accuracy: 12.80%
-----


Epoch 26/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.79it/s]


Epoch [26/100], Loss: 2.1257


Epoch 26/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.23it/s]


Validation Accuracy: 9.95%
Epoch [26/100], Validation Loss: 4.2895


Epoch 27/100 [Train]: 100%|██████████| 58/58 [00:07<00:00,  8.12it/s]


Epoch [27/100], Loss: 2.0891


Epoch 27/100 [Valid]: 100%|██████████| 14/14 [00:01<00:00, 12.55it/s]


Validation Accuracy: 11.37%
Epoch [27/100], Validation Loss: 4.1696


Epoch 28/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.20it/s]


Epoch [28/100], Loss: 1.9374


Epoch 28/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.00it/s]


Validation Accuracy: 11.37%
Epoch [28/100], Validation Loss: 4.2594


Epoch 29/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.96it/s]


Epoch [29/100], Loss: 1.8666


Epoch 29/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 14.53it/s]


Validation Accuracy: 9.95%
Epoch [29/100], Validation Loss: 4.3255


Epoch 30/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.15it/s]


Epoch [30/100], Loss: 1.8086


Epoch 30/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.76it/s]


Validation Accuracy: 9.48%
Epoch [30/100], Validation Loss: 4.3733


Epoch 31/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.10it/s]


Epoch [31/100], Loss: 1.7037


Epoch 31/100 [Valid]: 100%|██████████| 14/14 [00:01<00:00, 12.34it/s]


Validation Accuracy: 11.85%
Epoch [31/100], Validation Loss: 4.3687


Epoch 32/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.93it/s]


Epoch [32/100], Loss: 1.4208


Epoch 32/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.71it/s]


Validation Accuracy: 13.74%
Epoch [32/100], Validation Loss: 4.2954
Saved new best model with accuracy: 13.74%
-----


Epoch 33/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 11.04it/s]


Epoch [33/100], Loss: 1.2878


Epoch 33/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 15.77it/s]


Validation Accuracy: 11.85%
Epoch [33/100], Validation Loss: 4.5334


Epoch 34/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.95it/s]


Epoch [34/100], Loss: 1.2347


Epoch 34/100 [Valid]: 100%|██████████| 14/14 [00:02<00:00,  6.62it/s]


Validation Accuracy: 13.74%
Epoch [34/100], Validation Loss: 4.5175


Epoch 35/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.28it/s]


Epoch [35/100], Loss: 1.1588


Epoch 35/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 14.94it/s]


Validation Accuracy: 15.64%
Epoch [35/100], Validation Loss: 4.4404
Saved new best model with accuracy: 15.64%
-----


Epoch 36/100 [Train]: 100%|██████████| 58/58 [00:05<00:00, 10.50it/s]


Epoch [36/100], Loss: 1.1270


Epoch 36/100 [Valid]: 100%|██████████| 14/14 [00:00<00:00, 16.06it/s]


Validation Accuracy: 14.69%
Epoch [36/100], Validation Loss: 4.5225


Epoch 37/100 [Train]:   0%|          | 0/58 [00:00<?, ?it/s]

In [69]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
  for inputs, labels in test_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    outputs = model(inputs)
    _, predicted = torch.max(outputs, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
test_accuracy = correct / total  # Test accuracy
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 14.29%
