In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Cargar el dataset
from utils.lsa64.dataset import LSA64Dataset
from torch.utils.data import DataLoader, random_split
import torch

# Load the LSA64 landmark dataset and seed torch's global RNG so that the
# random split below is reproducible.
dataset = LSA64Dataset("../../data/LSA64/landmarks")
torch.manual_seed(42)

# 80% of the samples go to training; whatever remains is the test set.
n_samples = len(dataset)
train_size = int(n_samples * 0.8)
test_size = n_samples - train_size
train_ds, test_ds = random_split(dataset, [train_size, test_size])

In [12]:
from torch.nn.utils.rnn import pad_sequence
# Batching for the GPU: random_split has already shuffled the samples, so here
# we only zero-pad the sequences of a batch to a common length (the padded
# steps are ignored during training).
def collate_pad(batch):
    """Collate (sequence, label) samples into a zero-padded batch.

    Args:
        batch: Iterable of ``(x, y)`` pairs, where ``x`` is a tensor whose
            first dimension is the sequence length and ``y`` is its label.

    Returns:
        A tuple ``(x_padded, lengths, y_tensor)``: the sequences stacked
        batch-first and padded with ``0.0`` up to the longest one, the
        original length of every sequence, and the labels as a tensor.
    """
    sequences, labels = zip(*batch)
    seq_lengths = [seq.size(0) for seq in sequences]
    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
    return padded, torch.tensor(seq_lengths), torch.tensor(labels)

# Both loaders share the same batching settings; only the training loader
# reshuffles its samples on every epoch.
loader_kwargs = dict(batch_size=32, num_workers=6, collate_fn=collate_pad)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_ds, **loader_kwargs)

# Prefer the GPU whenever CUDA is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [13]:
# Sanity check: print the shape of the first element of the first sample
# (the recorded output below shows torch.Size([30, 177])).
print(dataset[0][0].shape)

torch.Size([30, 177])


In [22]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class SimpleRNN(nn.Module):
    """Stacked vanilla RNN classifier for padded, variable-length sequences.

    The sequences are packed so that padded timesteps are skipped; the hidden
    state of the last recurrent layer at each sequence's final valid timestep
    is then projected to class logits by a linear layer.

    Args:
        input_size: Number of features per timestep (default 177, the value
            originally hard-coded).
        hidden_size: Width of every recurrent layer (default 177).
        num_layers: Number of stacked recurrent layers (default 5).
        num_classes: Number of output logits (default 64).
    """

    def __init__(self, input_size=177, hidden_size=177, num_layers=5, num_classes=64):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: Padded batch of shape ``(batch, max_len, input_size)``
                (batch-first, matching how the RNN is configured).
            lengths: Valid length of every sequence in ``x``; a 1-D tensor
                or any sequence of ints.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        # pack_padded_sequence requires the lengths to live on the CPU.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()

        packed = pack_padded_sequence(
            x, lengths,
            batch_first=True,
            enforce_sorted=False
        )

        # Only the final hidden states are needed; the per-step outputs are
        # discarded. h has shape (num_layers, batch, hidden_size) and is
        # returned in the original (unsorted) batch order.
        _, h = self.rnn(packed)
        last = h[-1]  # final layer's hidden state

        return self.linear(last)

In [23]:
EPOCH = 100
train_batches = 5  # number of training batches sampled to estimate the train metrics


def _evaluate(model, loader, criterion, device, max_batches=None):
    """Return ``(mean per-batch loss, accuracy in percent)`` of ``model`` on ``loader``.

    Args:
        model: Module called as ``model(x, lengths)`` that returns class logits.
        loader: Iterable yielding ``(x, lengths, y)`` batches.
        criterion: Loss called as ``criterion(logits, y)``.
        device: Device the inputs and labels are moved to.
        max_batches: If given, only the first ``max_batches`` batches are used.

    Returns:
        ``(avg_loss, accuracy)``. The loss is averaged over the batches that
        were actually evaluated; both values are NaN when no batch was seen.
    """
    model.eval()
    loss_sum = 0.0
    n_batches = 0
    n_correct = 0
    n_samples = 0

    with torch.no_grad():
        for i, (x, lengths, y) in enumerate(loader):
            if max_batches is not None and i >= max_batches:
                break

            x = x.to(device)
            y = y.to(device)

            y_pred = model(x, lengths)
            loss_sum += criterion(y_pred, y).item()
            n_batches += 1

            _, predicted = y_pred.max(1)
            n_correct += (predicted == y).sum().item()
            n_samples += y.size(0)

    # Divide by what was really iterated so a short loader cannot skew the
    # average, and an empty one yields NaN instead of raising.
    avg_loss = loss_sum / n_batches if n_batches else float("nan")
    accuracy = 100 * n_correct / n_samples if n_samples else float("nan")
    return avg_loss, accuracy


model = SimpleRNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCH):
    # ======================
    # TRAIN
    # ======================
    model.train()
    for x, lengths, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        y_pred = model(x, lengths)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ======================
    # TRAIN ERROR (small subset)
    # ======================
    avg_train_loss, train_acc = _evaluate(
        model, train_loader, criterion, device, max_batches=train_batches
    )

    # ======================
    # TEST ERROR
    # ======================
    avg_test_loss, test_acc = _evaluate(model, test_loader, criterion, device)

    # ======================
    # PRINT SUMMARY
    # ======================
    print(f"Epoch {epoch+1}/{EPOCH} | Train Acc: {train_acc:.2f}% | Test Acc: {test_acc:.2f}%")

Epoch 1/100 | Train Acc: 3.75% | Test Acc: 2.79%
Epoch 2/100 | Train Acc: 6.25% | Test Acc: 6.70%
Epoch 3/100 | Train Acc: 12.50% | Test Acc: 8.10%
Epoch 4/100 | Train Acc: 13.75% | Test Acc: 12.01%
Epoch 5/100 | Train Acc: 15.62% | Test Acc: 13.13%
Epoch 6/100 | Train Acc: 21.88% | Test Acc: 15.92%
Epoch 7/100 | Train Acc: 23.12% | Test Acc: 19.83%
Epoch 8/100 | Train Acc: 21.25% | Test Acc: 18.72%
Epoch 9/100 | Train Acc: 26.25% | Test Acc: 21.79%
Epoch 10/100 | Train Acc: 29.38% | Test Acc: 27.09%
Epoch 11/100 | Train Acc: 35.62% | Test Acc: 31.84%
Epoch 12/100 | Train Acc: 35.62% | Test Acc: 32.40%
Epoch 13/100 | Train Acc: 34.38% | Test Acc: 28.77%
Epoch 14/100 | Train Acc: 40.62% | Test Acc: 31.01%
Epoch 15/100 | Train Acc: 38.75% | Test Acc: 34.92%
Epoch 16/100 | Train Acc: 38.12% | Test Acc: 39.11%
Epoch 17/100 | Train Acc: 45.62% | Test Acc: 38.55%
Epoch 18/100 | Train Acc: 46.25% | Test Acc: 37.71%
Epoch 19/100 | Train Acc: 40.00% | Test Acc: 41.62%
Epoch 20/100 | Train Acc: 

Por ahora me gusta, pero quiero encontrar formas de mejorar la generalización, porque cuando tenga datasets peores ahí sí voy a tener mucho más overfitting.

En un paper leí que obtuvieron un 94% de accuracy; voy a intentar llegar a eso.
# Resultados
- RNN simple, tanh, 5 capas ocultas y 1 de salida, Adam, lr=1e-4, lambda (weight decay)=1e-3: 99.38% (train acc), 89.94% (test acc)

# Ideas para probar
- **REVISAR QUE MODELOS Y ARQUITECTURAS QUE HACEN ESTO YA EXISTEN**
- La idea de PCA para dataset augmentation
- Transfer learning usando otro lenguaje de señas similar al argentino pero más completo
  - ¿Freezar las conexiones recurrentes? ¿Freezar las primeras capas?
- Un par de capas ocultas mas
- Un par de capas ocultas no recurrentes con ReLU
- Usar una GRU o una echo state network para mayor memoria a largo plazo (me parece al pedo una LSTM porque, si ya con este nivel de memoria se maneja bien, no creo necesitar tanto control)
- Encontrar una representación menos ruidosa para los datos (en teoría MediaPipe infiere mal el z, y la escala de las manos interpoladas tiene mucho ruido)
- Quizas reducir los fps a 6 o menos
- Aumentar el tamaño de los batches
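- Reducir el learning rate a $10^{-4}$ una vez que llegue a ~80% porque oscila mucho (ojo: el entrenamiento de arriba ya usa lr=1e-4, así que habría que bajarlo aún más, p. ej. a $10^{-5}$, o revisar este valor)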
  - Fijarme en un grafico el punto donde empieza a oscilar
  - No graficar simplemente el train error, graficar la diferencia porcentual entre el train error anterior y este