In [1]:
# Mount Google Drive under /content/drive; the dataset path used by the
# following cells lives inside that mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

# Location of the skeleton pickle inside the mounted Drive.
SKELETON_PKL_PATH = "/content/drive/MyDrive/Colab Notebooks/ucf101_2d.pkl"

# Report whether the path exists and its size in bytes. The size lookup
# raises if the path is missing, so a wrong path fails here rather than later.
print("Existe:", os.path.exists(SKELETON_PKL_PATH))
print("Tamaño:", os.stat(SKELETON_PKL_PATH).st_size)


Existe: True
Tamaño: 1070780736


In [3]:
import pickle
import numpy as np

# Deserialize the whole pickle into memory.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open files that come from a trusted source.
with open(SKELETON_PKL_PATH, "rb") as pkl_file:
    data = pickle.load(pkl_file)

print("Keys en el dict:", data.keys())

# Both entries are looked up with .get, so a missing key yields None here.
split = data.get("split")
annotations = data.get("annotations")

print("Tipo de split:", type(split))
print("Tipo de annotations:", type(annotations))
print("Numero de anotaciones:", len(annotations))

# Show the fields of the first annotation; arrays are summarised by shape
# instead of being printed in full.
print("\nEjemplo de anotacion:")
example = annotations[0]
for field, value in example.items():
    shown = f"ndarray, shape = {value.shape}" if isinstance(value, np.ndarray) else value
    print(f"{field}: {shown}")


Keys en el dict: dict_keys(['split', 'annotations'])
Tipo de split: <class 'dict'>
Tipo de annotations: <class 'list'>
Numero de anotaciones: 13320

Ejemplo de anotacion:
keypoint: ndarray, shape = (1, 119, 17, 2)
keypoint_score: ndarray, shape = (1, 119, 17)
frame_dir: v_ApplyEyeMakeup_g08_c01
total_frames: 119
original_shape: (256, 340)
img_shape: (256, 340)
label: 0


In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Seed the global NumPy and PyTorch generators so that everything drawing from
# them (data splitting, weight initialisation, batch shuffling) is repeatable
# under "Restart & Run All". GPU kernels may still introduce small
# non-deterministic differences.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Number of frames every clip is truncated/padded to, and the mini-batch size.
MAX_FRAMES = 60
BATCH_SIZE = 32



Device: cuda


In [5]:
# Hold out 20% of the annotations for validation.
total = len(annotations)
train_size = int(0.8 * total)
val_size = total - train_size

# Draw the split from a dedicated, seeded generator so the same clips end up
# in train/val on every run, independently of whatever else has consumed the
# global RNG before this cell.
# NOTE(review): the pickle also carries a "split" entry that is not used here;
# a random clip-level split may place closely related clips in both subsets —
# confirm whether the provided split should be used instead.
split_generator = torch.Generator().manual_seed(42)
train_subset, val_subset = random_split(
    annotations,
    [train_size, val_size],
    generator=split_generator,
)

# Materialise the Subset views into plain lists of annotation dicts.
train_ann = list(train_subset)
val_ann = list(val_subset)

print("Total anotaciones:", total)
print("Train:", len(train_ann))
print("Val:", len(val_ann))


Total anotaciones: 13320
Train: 10656
Val: 2664


In [6]:
# Every distinct label found in either subset, in ascending order.
all_labels = sorted({ann["label"] for subset in (train_ann, val_ann) for ann in subset})
print("Total de clases:", len(all_labels))

# No class filtering is applied: `used_labels` is the very same list.
used_labels = all_labels

# Two-way lookup between raw labels and contiguous class indices.
label_to_idx = {lbl: position for position, lbl in enumerate(used_labels)}
idx_to_label = dict(enumerate(used_labels))

print("Ejemplo mapeo:", list(label_to_idx.items())[:10])



Total de clases: 101
Ejemplo mapeo: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9)]


In [7]:
class SkeletonDataset(Dataset):
    """Map-style dataset that yields fixed-length keypoint sequences and class indices.

    Every annotation is a dict providing at least a ``"keypoint"`` array and a
    ``"label"``. An annotation is kept only when its label belongs to
    ``used_labels`` (if that filter is given) and is a key of ``label_to_idx``.

    Args:
        annotations_list: Iterable of annotation dicts.
        label_to_idx: Mapping from raw label to the class index returned by
            ``__getitem__``.
        max_frames: Length every sequence is truncated or zero-padded to.
        use_first_person: Stored on the instance. The current implementation
            never reads it and always takes index 0 of the keypoint array's
            first axis.
        used_labels: Optional iterable of labels to keep; ``None`` disables
            this filter.
    """

    def __init__(self, annotations_list, label_to_idx, max_frames=60, use_first_person=True, used_labels=None):
        self.max_frames = max_frames
        self.label_to_idx = label_to_idx
        self.use_first_person = use_first_person
        self.used_labels = None if used_labels is None else set(used_labels)

        # Keep an annotation only if it passes the optional label filter and
        # its label can be translated to a class index.
        self.samples = [
            ann
            for ann in annotations_list
            if (self.used_labels is None or ann["label"] in self.used_labels)
            and ann["label"] in self.label_to_idx
        ]

        print("Total de muestras:", len(self.samples))

    def __len__(self):
        """Return the number of annotations that survived filtering."""
        return len(self.samples)

    def _process_keypoint(self, keypoint):
        """Turn a raw keypoint array into a ``(max_frames, V * C)`` array.

        Index 0 of the first axis is selected; the remaining three axes are
        treated as (frames, V, C). Sequences longer than ``max_frames`` keep
        only their leading frames, shorter ones are padded at the end with
        zeros of the same dtype.
        """
        frames = keypoint[0]
        n_frames, n_joints, n_coords = frames.shape

        missing = self.max_frames - n_frames
        if missing > 0:
            filler = np.zeros((missing, n_joints, n_coords), dtype=frames.dtype)
            frames = np.concatenate([frames, filler], axis=0)
        else:
            frames = frames[:self.max_frames]

        # Flatten the two trailing axes into one feature vector per frame.
        return frames.reshape(self.max_frames, n_joints * n_coords)

    def __getitem__(self, idx):
        """Return ``(sequence, class_index)`` for sample ``idx``.

        The sequence is a float32 tensor of shape ``(max_frames, V * C)``.
        """
        ann = self.samples[idx]
        frames = self._process_keypoint(ann["keypoint"])
        return torch.from_numpy(frames).float(), self.label_to_idx[ann["label"]]



In [8]:
# MAX_FRAMES and BATCH_SIZE are defined once in the configuration cell above.
# They are intentionally not re-declared here, so that changing the
# configuration actually changes what this cell builds.

# Training and validation datasets share the same preprocessing settings.
train_dataset = SkeletonDataset(
    annotations_list=train_ann,
    label_to_idx=label_to_idx,
    max_frames=MAX_FRAMES,
    use_first_person=True,
    used_labels=used_labels
)

val_dataset = SkeletonDataset(
    annotations_list=val_ann,
    label_to_idx=label_to_idx,
    max_frames=MAX_FRAMES,
    use_first_person=True,
    used_labels=used_labels
)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Pull one batch as a sanity check on the tensor layout.
batch_x, batch_y = next(iter(train_loader))
assert batch_x.shape[1] == MAX_FRAMES, "sequence length must equal MAX_FRAMES"
print("Batch X shape:", batch_x.shape)
print("Batch Y shape:", batch_y.shape)


Total de muestras: 10656
Total de muestras: 2664
Batch X shape: torch.Size([32, 60, 34])
Batch Y shape: torch.Size([32])


In [9]:
# NOTE(review): this cell repeats the dataset/loader construction of the
# preceding cell with the same arguments; it looks redundant — confirm whether
# it can be dropped.
train_dataset, val_dataset = (
    SkeletonDataset(
        annotations_list=subset,
        label_to_idx=label_to_idx,
        max_frames=MAX_FRAMES,
        use_first_person=True,
        used_labels=used_labels,
    )
    for subset in (train_ann, val_ann)
)

# Shuffle training batches only; keep validation order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Inspect the layout of a single training batch.
batch_x, batch_y = next(iter(train_loader))
print("Batch X shape:", batch_x.shape)
print("Batch Y shape:", batch_y.shape)


Total de muestras: 10656
Total de muestras: 2664
Batch X shape: torch.Size([32, 60, 34])
Batch Y shape: torch.Size([32])


In [10]:
# Feature size per time step, read from axis 2 of the sample batch drawn earlier.
input_dim = batch_x.size(2)
# One output logit per label in use.
num_classes = len(used_labels)

class ActionBiLSTM(nn.Module):
    """Bidirectional LSTM classifier over a batch-first sequence of feature vectors.

    The final hidden states of the last layer's forward and backward
    directions are concatenated and projected to class logits.

    Args:
        input_dim: Size of the feature vector at each time step.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability applied between stacked LSTM layers.
            ``nn.LSTM`` has nothing to apply it to with a single layer (and
            warns about it), so it is set to 0 when ``num_layers`` is 1.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, dropout=0.3):
        super().__init__()
        # Inter-layer dropout only exists when there are at least two layers.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=True
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for ``x`` of shape ``(batch, time, input_dim)``."""
        _, (h_n, _) = self.lstm(x)
        # h_n stacks (layer, direction) along axis 0; the last two entries are
        # the forward and backward final states of the top layer.
        h_forward = h_n[-2]
        h_backward = h_n[-1]
        h_cat = torch.cat((h_forward, h_backward), dim=1)
        return self.fc(h_cat)

# Size of the recurrent encoder.
hidden_dim = 256
num_layers = 2

# Build the network and move it to the selected device.
model = ActionBiLSTM(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(DEVICE)

# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

print("Input dim:", input_dim)
print("Num classes:", num_classes)
print("Device modelo:", next(model.parameters()).device)



Input dim: 34
Num classes: 101
Device modelo: cuda:0


In [11]:
def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader``; update weights only when ``optimizer`` is given.

    Returns:
        ``(avg_loss, accuracy)`` where the loss is averaged per sample (each
        batch loss is weighted by its batch size).

    Raises:
        ValueError: If the dataloader yields no samples, since the averages
            would be undefined.
    """
    training = optimizer is not None
    # train(True) is train mode, train(False) is eval mode.
    model.train(training)

    total_loss = 0.0
    correct = 0
    total = 0
    for x, y in dataloader:
        x = x.to(device)
        y = y.to(device)

        if training:
            optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        if training:
            loss.backward()
            optimizer.step()

        # criterion returns a batch mean; weight it by the batch size so the
        # final average is per sample even if the last batch is smaller.
        total_loss += loss.item() * x.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)

    if total == 0:
        raise ValueError("dataloader produced no samples; cannot compute loss/accuracy")

    return total_loss / total, correct / total


def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one epoch and return ``(avg_loss, accuracy)`` on the training data.

    The model is put in train mode. Metrics are accumulated batch by batch
    while the weights are being updated.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    return _run_epoch(model, dataloader, criterion, device, optimizer=optimizer)


def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` without gradient tracking and return ``(avg_loss, accuracy)``.

    The model is put in eval mode and its parameters are not modified.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    with torch.no_grad():
        return _run_epoch(model, dataloader, criterion, device)


In [13]:
num_epochs = 100

# Per-epoch metric curves for this run, one list per metric.
history_bi = {metric: [] for metric in ("train_loss", "train_acc", "val_loss", "val_acc")}

for epoch in range(1, num_epochs + 1):
    # One optimisation pass over the training set, then a validation pass.
    train_loss, train_acc = train_one_epoch(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=DEVICE,
    )
    val_loss, val_acc = evaluate(
        model=model,
        dataloader=val_loader,
        criterion=criterion,
        device=DEVICE,
    )

    history_bi["train_loss"].append(train_loss)
    history_bi["val_loss"].append(val_loss)
    history_bi["train_acc"].append(train_acc)
    history_bi["val_acc"].append(val_acc)

    # Single progress line per epoch.
    print(
        f"Epoch {epoch:02d} | ",
        f"Train loss: {train_loss:.4f} acc: {train_acc:.4f} | ",
        f"Val loss: {val_loss:.4f} acc: {val_acc:.4f}",
        sep="",
    )


Epoch 01 | Train loss: 3.5450 acc: 0.1199 | Val loss: 3.4997 acc: 0.1126
Epoch 02 | Train loss: 3.4478 acc: 0.1319 | Val loss: 3.4127 acc: 0.1329
Epoch 03 | Train loss: 3.4248 acc: 0.1288 | Val loss: 3.3764 acc: 0.1434
Epoch 04 | Train loss: 3.3325 acc: 0.1498 | Val loss: 3.2863 acc: 0.1535
Epoch 05 | Train loss: 3.2795 acc: 0.1562 | Val loss: 3.4746 acc: 0.1250
Epoch 06 | Train loss: 3.2205 acc: 0.1635 | Val loss: 3.2909 acc: 0.1607
Epoch 07 | Train loss: 3.1630 acc: 0.1799 | Val loss: 3.1873 acc: 0.1682
Epoch 08 | Train loss: 3.0593 acc: 0.1949 | Val loss: 3.1546 acc: 0.1918
Epoch 09 | Train loss: 3.0027 acc: 0.2064 | Val loss: 3.1634 acc: 0.1689
Epoch 10 | Train loss: 2.9785 acc: 0.2080 | Val loss: 3.0804 acc: 0.1877
Epoch 11 | Train loss: 2.9222 acc: 0.2242 | Val loss: 3.1560 acc: 0.1783
Epoch 12 | Train loss: 2.8801 acc: 0.2349 | Val loss: 3.0223 acc: 0.2050
Epoch 13 | Train loss: 2.7951 acc: 0.2465 | Val loss: 3.0652 acc: 0.1967
Epoch 14 | Train loss: 2.7333 acc: 0.2631 | Val los

En este proyecto se implementó un modelo de reconocimiento de acciones sobre el dataset UCF101 en su versión de esqueletos 2D, que reduce la información visual de cada fotograma a las coordenadas de las articulaciones. El modelo baseline, un LSTM unidireccional con dimensión oculta de 128, alcanzó alrededor de un 14 % de accuracy en validación. Este valor supera ampliamente el azar (~1 % en un problema de 101 clases) y demuestra que, incluso con información reducida, es posible aprender patrones temporales significativos.

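Posteriormente, se mejoró la arquitectura utilizando un BiLSTM con mayor capacidad (hidden_dim=256, 2 capas) y un entrenamiento extendido (100 épocas), alcanzando aproximadamente un 40 % de accuracy en validación. Esta mejora sugiere que incrementar la capacidad del modelo y permitir el flujo bidireccional de información beneficia el desempeño en tareas de secuencias. Cabe señalar que la validación se realizó sobre una partición aleatoria 80/20 de las anotaciones y no sobre los splits oficiales de UCF101, por lo que estas cifras no son directamente comparables con resultados publicados. Con esa salvedad, los resultados finales muestran un aprendizaje sólido dentro de las limitaciones del formato esquelético y cumplen con los objetivos del portafolio.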