In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Cargar el dataset
from utils.lsa64.dataset import LSA64Dataset
from torch.utils.data import DataLoader, random_split
import torch

dataset = LSA64Dataset("../../data/LSA64/landmarks")
torch.manual_seed(42)

train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_ds, test_ds = random_split(dataset, [train_size, test_size])

In [3]:
from torch.nn.utils.rnn import pad_sequence
# Cargarlo al GPU, random_split ya lo shufflea, padearlo con 0s asi tienen el mismo tamaño(al entrenar se ignoran esos 0s)
def collate_pad(batch):
    xs, ys = zip(*batch)
    lengths = torch.tensor([x.size(0) for x in xs])
    x_padded = pad_sequence(xs, batch_first=True, padding_value=0.0)
    y_tensor = torch.tensor(ys)
    return x_padded, lengths, y_tensor

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=6, collate_fn=collate_pad) # Shufflearlo por cada epoch
test_loader = DataLoader(test_ds, batch_size=32, num_workers=6, collate_fn=collate_pad)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [4]:
print(print(len(train_ds), len(test_ds)))

2552 639
None


In [5]:
from models import SimpleRNN

In [7]:
import torch.nn as nn

EPOCH = 200

model = SimpleRNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)
criterion = nn.CrossEntropyLoss()

# ==========================================
# TRACK BEST PERFORMANCES
# ==========================================
best_train_acc = 0
best_train_acc_test_acc = 0

best_test_acc = 0
best_test_acc_train_acc = 0
# ==========================================

for epoch in range(EPOCH):
    # ======================
    # TRAIN
    # ======================
    model.train()
    for x, lengths, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        y_pred = model(x, lengths)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ======================
    # TRAIN ERROR (subset)
    # ======================
    model.eval()
    train_loss = 0
    train_correct = 0
    train_total = 0
    train_batches = 5

    with torch.no_grad():
        for i, (x, lengths, y) in enumerate(train_loader):
            if i >= train_batches:
                break

            x = x.to(device)
            y = y.to(device)

            y_pred = model(x, lengths)
            loss = criterion(y_pred, y)
            train_loss += loss.item()

            _, predicted = y_pred.max(1)
            train_correct += (predicted == y).sum().item()
            train_total += y.size(0)

    avg_train_loss = train_loss / train_batches
    train_acc = 100 * train_correct / train_total

    # ======================
    # TEST ERROR
    # ======================
    test_loss = 0
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for x, lengths, y in test_loader:
            x = x.to(device)
            y = y.to(device)

            y_pred = model(x, lengths)
            loss = criterion(y_pred, y)
            test_loss += loss.item()

            _, predicted = y_pred.max(1)
            test_correct += (predicted == y).sum().item()
            test_total += y.size(0)

    avg_test_loss = test_loss / len(test_loader)
    test_acc = 100 * test_correct / test_total

    # ==========================================
    # UPDATE BEST TRAIN & TEST RESULTS
    # ==========================================
    if train_acc > best_train_acc:
        best_train_acc = train_acc
        best_train_acc_test_acc = test_acc

    if test_acc > best_test_acc:
        best_test_acc = test_acc
        best_test_acc_train_acc = train_acc
        if epoch > 70: # Asi no guardo en todas las iteraciones iniciales
            torch.save(model.state_dict(), "best_params.pth")
    # ==========================================

    # ======================
    # PRINT SUMMARY
    # ======================
    print(f"Epoch {epoch+1}/{EPOCH} | Train Acc: {train_acc:.2f}% | Test Acc: {test_acc:.2f}%")

# ==========================================
# FINAL RESULTS
# ==========================================
print("\n================ FINAL RESULTS ================")
print(f"Highest Train Acc: {best_train_acc:.2f}% (Test Acc at that time: {best_train_acc_test_acc:.2f}%)")
print(f"Highest Test Acc:  {best_test_acc:.2f}% (Train Acc at that time: {best_test_acc_train_acc:.2f}%)")
print("===============================================\n")

Epoch 1/200 | Train Acc: 4.38% | Test Acc: 4.85%
Epoch 2/200 | Train Acc: 4.38% | Test Acc: 7.20%
Epoch 3/200 | Train Acc: 10.00% | Test Acc: 10.64%
Epoch 4/200 | Train Acc: 14.38% | Test Acc: 10.33%
Epoch 5/200 | Train Acc: 13.12% | Test Acc: 11.89%
Epoch 6/200 | Train Acc: 14.38% | Test Acc: 12.83%
Epoch 7/200 | Train Acc: 18.75% | Test Acc: 14.71%
Epoch 8/200 | Train Acc: 20.00% | Test Acc: 17.21%
Epoch 9/200 | Train Acc: 20.00% | Test Acc: 18.15%
Epoch 10/200 | Train Acc: 16.88% | Test Acc: 18.31%
Epoch 11/200 | Train Acc: 15.62% | Test Acc: 17.53%
Epoch 12/200 | Train Acc: 24.38% | Test Acc: 16.74%
Epoch 13/200 | Train Acc: 18.12% | Test Acc: 20.19%
Epoch 14/200 | Train Acc: 21.25% | Test Acc: 23.32%
Epoch 15/200 | Train Acc: 30.00% | Test Acc: 23.63%
Epoch 16/200 | Train Acc: 41.25% | Test Acc: 27.54%
Epoch 17/200 | Train Acc: 30.62% | Test Acc: 27.54%
Epoch 18/200 | Train Acc: 33.12% | Test Acc: 27.54%
Epoch 19/200 | Train Acc: 35.00% | Test Acc: 30.83%
Epoch 20/200 | Train Acc:

Highest Train Acc: 100.00% (Test Acc at that time: 95.45%)

Highest Test Acc:  98.27% (Train Acc at that time: 100.00%)

# Resultados
- RNN simple, tanh, 5 capas ocultas y 1 de salida, adam, lr=1e-4, lambda=1e-3: 99.38%, 89.94%
  -  Alrededor del doble de datos: 95%, 94.19%

Generaliza para el culo en el mundo real, no es capaz de predecir ni una seña. Sospecho porque en los videos estan todos sentados de la misma forma. Lo que quiero hacer es
- Sacar los landmarks de el torso. Solo tener los landmarks de los brazos y la mano
- Hacer una primera RNN que se dedique a determinar si es una seña de ambas manos o si es de solo la habil(derecha o izquierda, dataset augmentation para zurdos)
- Hacer una segunda etapa dedicada a determinar la seña
  - Una unica RNN con un filtro en su input y reemplazando la mano no-usada con ceros, transfer learning pero quizas le cueste generalizar
  - 2 RNNs diferentes. Una para ambas manos, otra para la mano habil, quizas generalize mejor pero no hay transfer learning

# Ideas para probar
- **REVISAR QUE MODELOS Y ARQUITECTURAS QUE HACEN ESTO YA EXISTEN**
- La idea de PCA para dataset augmentation
  - Agregar vectores de direccion aleatorios a la muñeca/nudillo y mover el resto del brazo con kinesis
- Transfer learning usando otro lenguaje de señas similar al argentino pero mas completo
-   Freezar las conexiones recurrentes? Freezar las primeras capas?
- Un par de capas ocultas mas
- Un par de capas ocultas no recurrentes con ReLU
- Usar una GRU o echo state network para mayor memoria a largo plazo(me parece al pedo una LSTM porque si ya con este nivel de memoria se maneja bien, entonces no creo necesitar tanto control)
- Encontrar una representacion menos ruidosa para los datos(el z lo infiere mal mediapipe en teoria, y la escala de las manos interpoladas tienen mucho ruido)
- Quizas reducir los fps a 6 o menos
- Aumentar el tamaño de los batches
- Reducir el learning rate a $10^{-4}$ una vez que llegue ~80% porque oscila mucho
  - Fijarme en un grafico el punto donde empieza a oscilar
  - No graficar simplemente el train error, graficar la diferencia porcentual entre el train error anterior y este

In [8]:
# Recargar el mejor modelo
model.load_state_dict(torch.load("best_params.pth", map_location=device))

  model.load_state_dict(torch.load("best_params.pth", map_location=device))


<All keys matched successfully>

In [10]:
# Ahora guardar la softmax de la seña correcta para cada seña en un csv, asi hago analisis despues
labels = []
softmax_score = []

model.eval()
with torch.no_grad():
    for loader in (train_loader, test_loader):
        print("a")
        for x, lengths, y in loader:
            x = x.to(device)
            y = y.to(device)

            y_pred = model(x, lengths)
            _, predicted = y_pred.max(1)
            probs = torch.softmax(y_pred, dim=1)

            for i in range(len(predicted)):
                if int(y[i]) == int(predicted[i]):
                    labels.append(y[i].item())
                    softmax_score.append(probs[i][y[i]].item())

a
a


In [11]:
import pandas as pd
df = pd.DataFrame({"label": labels, "softmax_score": softmax_score})
df.to_csv("rnn_scores.csv", index=False)