<a href="https://colab.research.google.com/github/ValentinaEmili/Sign_language/blob/main/ASL_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code inspired by:

https://github.com/AvishakeAdhikary/Realtime-Sign-Language-Detection-Using-LSTM-Model/blob/main/RealTimeSignLanguageDetection.ipynb

In [None]:
# Make Google Drive available to this Colab runtime under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

In [2]:
import pandas as pd
import cv2
from google.colab.patches import cv2_imshow
from tqdm import tqdm
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import LSTM
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

Load the data

In [5]:
# WLASL_v0.3.json is read into a DataFrame; `folder` is the dataset root on Drive.
js_file = pd.read_json("/content/drive/MyDrive/NLP/WLASL_v0.3.json")
folder = "/content/drive/MyDrive/NLP/dataset/"

# One sub-folder per split (train / val / test).
training_folder, validation_folder, test_folder = (
    folder + split_name for split_name in ("train/", "val/", "test/")
)

# "video/" sub-folder of every split.
training_video, validation_video, test_video = (
    split_folder + "video/"
    for split_folder in (training_folder, validation_folder, test_folder)
)

# "images/" sub-folder of every split.
training_images, validation_images, test_images = (
    split_folder + "images/"
    for split_folder in (training_folder, validation_folder, test_folder)
)

Preprocess the data

In [6]:
def _gloss_from_filename(filename):
  """Return the gloss encoded in a ``<gloss>_<suffix>`` file name.

  The name is split on its LAST underscore, so a gloss that itself contains
  underscores is kept intact instead of breaking the unpacking.

  Raises:
    ValueError: if ``filename`` contains no underscore at all.
  """
  word, separator, _ = filename.rpartition("_")
  if not separator:
    raise ValueError(f"cannot extract a gloss from file name {filename!r}")
  return word


def _glosses_in(directory):
  """Return the set of glosses found among the file names in ``directory``."""
  return {_gloss_from_filename(name) for name in os.listdir(directory)}


# Distinct glosses present in each split.
train_gloss = _glosses_in(training_images)
val_gloss = _glosses_in(validation_images)
test_gloss = _glosses_in(test_images)

# Sorted union of the three splits, so class indices are stable between runs.
gloss = sorted(train_gloss | val_gloss | test_gloss)

# the gloss 'wash face' is missing because all of its URLs are broken

# Map every gloss to its integer class index (its position in the sorted list).
label_map = {label: num for num, label in enumerate(gloss)}

In [7]:
# Sorted file names found in each split's images folder.
train_files, val_files, test_files = (
    sorted(os.listdir(split_dir))
    for split_dir in (training_images, validation_images, test_images)
)

def load_data(images, label_map, max_len=None):
  """Load every array stored in ``images`` into one zero-padded tensor.

  Each file is read with ``np.load``. Its label is the part of the file name
  before the LAST underscore (so labels containing underscores work) and is
  converted to a class index through ``label_map``. Files whose array is empty
  or has fewer than two dimensions are skipped and reported on stdout.

  Args:
    images: path of the directory holding the array files, named
      ``<label>_<suffix>``.
    label_map: dict mapping a label string to its integer class index.
    max_len: optional number of rows every sample is brought to. Shorter
      arrays are zero-padded at the end, longer ones are truncated. When
      ``None`` (default) the longest array found in ``images`` is used, which
      means different directories may yield different lengths; pass a shared
      value to give several splits the same padded length.

  Returns:
    ``(X, y)`` where ``X`` is a float32 tensor of shape
    ``(num_samples, padded_len, num_features)`` and ``y`` is a long tensor
    with one class index per sample, both in sorted file-name order.

  Raises:
    ValueError: if a file name has no underscore or ``max_len`` is negative.
    KeyError: if a file's label is not a key of ``label_map``.
  """
  if max_len is not None and max_len < 0:
    raise ValueError(f"max_len must be non-negative, got {max_len}")

  X, y = [], []
  corrupted_files = []
  longest = 0
  # os.listdir returns entries in arbitrary order; sorting makes the sample
  # order (and therefore the returned tensors) reproducible.
  for image in sorted(os.listdir(images)):
    np_array = np.load(os.path.join(images, image))
    label, separator, _ = image.rpartition("_")
    if not separator:
      raise ValueError(f"cannot extract a label from file name {image!r}")

    if np_array.size == 0 or np_array.ndim < 2:
      corrupted_files.append((image, np_array.shape))
      continue

    longest = max(longest, np_array.shape[0])

    X.append(np_array)
    y.append(label_map[label])

  # Surface skipped files instead of dropping them silently.
  if corrupted_files:
    print(f"load_data: skipped {len(corrupted_files)} empty or malformed file(s) in {images}: {corrupted_files}")

  target_len = longest if max_len is None else max_len

  padded_X = []
  for np_array in X:
    np_array = np_array[:target_len]  # no-op unless max_len is shorter than the sample
    pad_length = target_len - np_array.shape[0]
    padded_np_array = np.pad(np_array, ((0, pad_length), (0, 0)), mode='constant', constant_values=0) # add zero padding at the end
    padded_X.append(padded_np_array)

  X = torch.tensor(np.array(padded_X), dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.long)
  return X, y

# One output class per entry of `gloss`.
num_classes = len(gloss)

# Load the three splits, in train / val / test order, with the shared label map.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_data(split_dir, label_map)
    for split_dir in (training_images, validation_images, test_images)
)

Build and train LSTM Neural Network

In [8]:
class SignLanguageLSTM(nn.Module):
  """Sequence classifier: three stacked single-layer LSTMs plus a three-layer dense head.

  The recurrent widths are ``hidden_size`` -> ``2 * hidden_size`` -> ``hidden_size``
  and the dense widths are ``2 * hidden_size`` -> ``4 * hidden_size`` -> ``num_classes``.
  All LSTMs use ``batch_first=True``, so inputs are ``(batch, seq_len, input_size)``.

  Args:
    input_size: feature size of every time step fed to the first LSTM.
    hidden_size: base width used to size the LSTM and linear layers.
    num_layers: stored on the instance only; it is NOT passed to any ``nn.LSTM``.
    num_classes: number of output logits.
  """

  def __init__(self, input_size, hidden_size, num_layers, num_classes):
    super().__init__()

    # Kept as plain attributes for later inspection.
    self.hidden_size, self.num_layers = hidden_size, num_layers

    double_width = hidden_size * 2
    quad_width = hidden_size * 4

    # Recurrent stack (each nn.LSTM keeps its default of a single layer).
    self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True)
    self.lstm2 = nn.LSTM(hidden_size, double_width, batch_first=True)
    self.lstm3 = nn.LSTM(double_width, hidden_size, batch_first=True)

    # Dense classification head.
    self.fc1 = nn.Linear(hidden_size, double_width)
    self.fc2 = nn.Linear(double_width, quad_width)
    self.fc3 = nn.Linear(quad_width, num_classes)

    self.relu = nn.ReLU()

  def forward(self, x):
    """Return logits of shape ``(batch, num_classes)`` for a batch of sequences."""
    # Every LSTM's full output sequence goes through ReLU before the next one.
    for recurrent in (self.lstm1, self.lstm2, self.lstm3):
      x, _ = recurrent(x)
      x = self.relu(x)

    # Only the final time step of the last LSTM feeds the dense head.
    last_step = x[:, -1, :]

    hidden = self.relu(self.fc1(last_step))
    hidden = self.relu(self.fc2(hidden))
    return self.fc3(hidden)

# Hyperparameters.
input_size = 258        # feature size of each time step fed to the first LSTM
hidden_size = 128       # base width of the LSTM / linear layers
num_layers = 2          # forwarded to the model constructor
learning_rate = 0.001
num_epochs = 100
batch_size = 32
seed = 42

# Seed the random number generators BEFORE the model is built so that weight
# initialisation (and any later shuffling that draws from torch's global
# generator) is repeatable across "Restart & Run All".
torch.manual_seed(seed)
np.random.seed(seed)

model = SignLanguageLSTM(input_size, hidden_size, num_layers, num_classes)

In [9]:
# Pair each split's inputs with its labels.
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Adam over all model parameters, with cross-entropy as the training loss.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for `num_epochs`, validating after every epoch and checkpointing the
# weights with the best validation accuracy; then evaluate on the test split.
best_val_acc = 0.0
save_model = "/content/drive/MyDrive/NLP/saved_models/"
os.makedirs(save_model, exist_ok=True)
best_model_path = os.path.join(save_model, "best_model.pt")
saved_this_run = False  # True once this run has written a checkpoint

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('-' * 30)

    for phase in ['train', 'val']:
        if phase == 'train':
            model.train() # switch the module to training mode
            loader = train_loader
        else:
            model.eval() # switch the module to evaluation mode
            loader = validation_loader

        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if phase == 'train':
                optimizer.zero_grad() # clears accumulated gradients before each batch during training

            with torch.set_grad_enabled(phase == 'train'): # enables gradient computation only during training, conserving memory during validation
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    loss.backward() # calculates gradients through automatic differentiation
                    optimizer.step() # updates model weights based on calculated gradients

            batch_count = labels.size(0)
            # The criterion returns the batch MEAN; multiply by the batch size so
            # that a smaller final batch is not over-weighted in the epoch average.
            running_loss += loss.item() * batch_count
            _, predicted = torch.max(outputs, 1) # extract the predicted value taking the one with the higher score
            correct += (predicted == labels).sum().item()
            total += batch_count

        epoch_loss = running_loss / total # mean loss per sample
        epoch_acc = correct / total

        print(f"{phase.upper()} Loss: {epoch_loss:.4f} | Accuracy: {epoch_acc:.4f}")

        # Save best model based on validation accuracy
        if phase == 'val' and epoch_acc > best_val_acc:
            best_val_acc = epoch_acc
            torch.save(model.state_dict(), best_model_path)
            saved_this_run = True
            print("Saved new best model")

# Final Test Evaluation
# Evaluate the checkpoint with the best validation accuracy rather than the
# weights left over from the last epoch. If no checkpoint was written in this
# run, the current weights are evaluated as they are.
if saved_this_run:
    model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == labels).sum().item()
        test_total += labels.size(0)

test_acc = test_correct / test_total
print(f"\nTEST Accuracy: {test_acc:.4f}")
