In [1]:
# Research Papers
# [Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition](https://arxiv.org/abs/1801.07455)
# [On loss functions and regret bounds for multi-category classification](https://arxiv.org/abs/2005.08155)

In [None]:
# Input: Keypoints [T, num_keypoints, 2] (e.g., [30, 7, 2])
#     ↓
# Graph Feature Extractor (GCN):
#     - Models relationships between body parts
#     - Outputs spatial embeddings [T, num_keypoints, d]
#     ↓
# Temporal Module (GRU or Attention Mechanism):
#     - Captures temporal dynamics in keypoint movement
#     - Outputs temporal embeddings [T, d]
#     ↓
# Global Average Pooling:
#     - Aggregates information across time
#     ↓
# Fully Connected Layers:
#     - Dense layers for classification
#     - Dropout for regularization
#     ↓
# Output: Behavior Class Probabilities


In [3]:
import sys
import os

# Put the parent of the current working directory on the import path
# (presumably where the project-local `sp_utils` package lives — confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
_parent_dir = os.path.dirname(os.getcwd())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [4]:
from sp_utils import update_config, pose_estimation, classification , save_model

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

from rich import print
import os

In [None]:
import torch
import torch.nn as nn

from torch.utils.data import DataLoader
from torchvision import transforms

In [7]:
# --- Input geometry and batching ---
# Width/height feed both the image resize and the keypoint normalizer below.
IMG_WIDTH, IMG_HEIGHT = 320, 240
NUM_KEYPOINTS = 7
NUM_BATCH = 16  # batch size used by both dataloaders

# --- Filesystem locations ---
CONFIG_PATH = "config.json"
MODEL_PATH = "models/classification"
DATASET_ROOT = "datasets"
TRAIN_DATASET_FILE = f"{DATASET_ROOT}/train_dataset.csv"
TEST_DATASET_FILE = f"{DATASET_ROOT}/test_dataset.csv"


# Read the train and test annotation tables (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATASET_FILE, TEST_DATASET_FILE)
)

In [8]:
# Preview the first five rows of the training table.
train_df.head(n=5)

Unnamed: 0,behavior,image_id,image_file,head_x,head_y,beak_base_x,beak_base_y,beak_tip_x,beak_tip_y,neck_x,neck_y,body1_x,body1_y,body2_x,body2_y,tail_base_x,tail_base_y
0,1,p_018,79-20151219231702-00.jpg,65.861956,53.596922,91.482298,103.698925,102.299776,124.76454,128.48946,96.866834,150.124416,158.355656,320.357358,127.041904,358.503202,38.794058
1,1,p_018,79-20151219231703-00.jpg,72.694047,29.115262,82.372843,87.188038,75.540752,116.224426,117.102641,88.896061,147.277711,158.355656,307.831858,130.45795,362.488588,34.808671
2,1,p_018,79-20151219231704-00.jpg,80.095479,14.881738,78.387457,72.954514,59.029864,95.158811,102.869117,87.188038,152.971121,161.202361,298.153061,137.290041,348.255065,47.903513
3,1,p_018,79-20151219231705-00.jpg,69.278001,29.115262,77.248775,82.63331,59.599205,96.866834,101.730435,90.604084,129.058801,158.924997,286.196902,146.399496,340.284291,33.100648
4,1,p_018,79-20151219231706-00.jpg,88.066253,9.757669,86.927571,68.969128,72.124706,84.341333,109.131868,105.976289,148.985734,165.187748,307.831858,136.7207,339.71495,33.100648


In [9]:
# Device-agnostic setup: use CUDA when it is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DeviceLikeType = 'cuda'
else:
    DeviceLikeType = 'cpu'

device = torch.device(DeviceLikeType)

# Report the selected device and whether cuDNN is enabled.
print(f"""
Device: {device}
Device CUDNN enabled: {torch.backends.cudnn.enabled}
""")

In [None]:
# Class names; the list position is the integer class id (0 -> nesting, 1 -> preening).
LABELS = ["nesting", "preening"]

# Count of distinct values in the training "behavior" column.
# dropna=False keeps the count equal to len(<column>.unique()).
NUM_CLASS = train_df["behavior"].nunique(dropna=False)

# Hidden dimension handed to the classifier constructor.
HIDDEN_DIM = 128


In [None]:
# Image transformation definition: resize to the configured size, then convert to a tensor.
img_transform = transforms.Compose([
    transforms.Resize((IMG_HEIGHT, IMG_WIDTH)),
    transforms.ToTensor(),
])

# Keypoint transform built from the same image dimensions
# (presumably rescales pixel coordinates by width/height — confirm in pose_estimation).
kp_transform = pose_estimation.NormalizeKeypoints(IMG_WIDTH, IMG_HEIGHT)

# Create datasets: both splits share the root folder and the two transforms;
# only the dataframe differs.
dataset_kwargs = dict(
    dataset_root_folder=DATASET_ROOT,
    img_transform=img_transform,
    kp_transform=kp_transform,
)
train_dataset = classification.BehaviorDataset(dataframe=train_df, **dataset_kwargs)
test_dataset = classification.BehaviorDataset(dataframe=test_df, **dataset_kwargs)

# Create dataloaders.
# Pinned host memory only speeds up host-to-GPU copies, and PyTorch emits a
# warning when it is requested without an accelerator, so enable it only when
# the notebook actually runs on CUDA.
loader_kwargs = dict(
    batch_size=NUM_BATCH,
    num_workers=0,
    pin_memory=(device.type == "cuda"),
)
# Shuffle only the training split; keep evaluation order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Build the classifier from the keypoint count and hidden dimension,
# then move it to the selected device.
model = classification.BirdBehaviorClassifier(NUM_KEYPOINTS, HIDDEN_DIM)
model = model.to(device)

In [14]:
# Number of keypoints in the skeleton.
num_keypoints = 7

# Keypoint names; a name's position in this list is its node index.
keypoint_names = [
    "head", "beak_base", "beak_tip", "neck", "body1", "body2", "tail_base"
]
keypoint_index_map = dict(zip(keypoint_names, range(len(keypoint_names))))

# Skeleton: every entry joins two keypoints by name.
skeleton = [
    ["head", "beak_base"],
    ["beak_base", "beak_tip"],
    ["head", "neck"],
    ["neck", "body1"],
    ["body1", "body2"],
    ["body2", "tail_base"]
]

# Translate each named connection into a pair of node indices.
edge_pairs = [
    [keypoint_index_map[src], keypoint_index_map[dst]]
    for src, dst in skeleton
]

# Transposing the (num_edges, 2) list gives a (2, num_edges) tensor,
# which is then moved to the active device.
edges = torch.tensor(edge_pairs).t().to(device)

# Show the result for a quick visual check.
print("Edges:", edges)

In [15]:
# Training hyperparameters used by the training cell below.
# Number of passes over train_loader.
EPOCHS = 50
# Passed to Adam as `lr`.
LEARNING_RATE = 1e-4
# Passed to Adam as `weight_decay`.
WEIGHT_DECAY = 1e-4

In [None]:
from tqdm import tqdm

# Define optimizer and criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# BCEWithLogitsLoss applies the sigmoid itself, so the model output is fed in
# as raw logits (assumes one logit per sample, shape (batch, 1) — confirm).
criterion = nn.BCEWithLogitsLoss()

# Training loop: one optimizer step per batch, metrics reported once per epoch.
for epoch in range(EPOCHS):
    model.train()  # Set model to training mode
    total_loss = 0.0
    all_preds = []
    all_labels = []

    # Iterate the progress bar itself so it advances once per batch without
    # manual update() calls.
    with tqdm(train_loader, desc=f'Epoch {epoch + 1}/{EPOCHS}', unit='batch') as pbar:
        # Each batch is a 4-tuple; only the keypoints (2nd) and labels (4th)
        # are used here.
        for _, keypoints, _, labels in pbar:
            keypoints, labels = keypoints.to(device), labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass. squeeze(1) drops only dim 1, so a final batch of
            # size 1 still keeps its batch axis.
            outputs = model(keypoints, edges).squeeze(1)

            # Compute loss (BCEWithLogitsLoss requires float targets).
            loss = criterion(outputs, labels.float())

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the running total and the bar.
            batch_loss = loss.item()
            total_loss += batch_loss

            # logit >= 0 is the same decision as sigmoid(logit) >= 0.5.
            preds = (outputs >= 0).int()
            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())

            # Show per-batch loss on the progress bar.
            pbar.set_postfix(batch_loss=batch_loss)

    # Epoch metrics: mean of the per-batch losses and accuracy over all samples seen.
    epoch_loss = total_loss / len(train_loader)
    epoch_accuracy = accuracy_score(all_labels, all_preds)

    print(f"Epoch [{epoch + 1}/{EPOCHS}] - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")


Epoch 1/50: 100%|██████████| 66/66 [00:04<00:00, 13.39batch/s, batch_loss=0.624]


Epoch 2/50: 100%|██████████| 66/66 [00:03<00:00, 21.42batch/s, batch_loss=0.792]


Epoch 3/50: 100%|██████████| 66/66 [00:03<00:00, 20.66batch/s, batch_loss=0.667]


Epoch 4/50: 100%|██████████| 66/66 [00:03<00:00, 20.82batch/s, batch_loss=0.707]


Epoch 5/50: 100%|██████████| 66/66 [00:03<00:00, 21.12batch/s, batch_loss=0.68] 


Epoch 6/50: 100%|██████████| 66/66 [00:03<00:00, 21.52batch/s, batch_loss=0.701]


Epoch 7/50: 100%|██████████| 66/66 [00:03<00:00, 21.15batch/s, batch_loss=0.696]


Epoch 8/50: 100%|██████████| 66/66 [00:03<00:00, 21.81batch/s, batch_loss=0.731]


Epoch 9/50: 100%|██████████| 66/66 [00:03<00:00, 19.95batch/s, batch_loss=0.681]


Epoch 10/50: 100%|██████████| 66/66 [00:03<00:00, 17.44batch/s, batch_loss=0.745]


Epoch 11/50: 100%|██████████| 66/66 [00:03<00:00, 20.29batch/s, batch_loss=0.693]


Epoch 12/50: 100%|██████████| 66/66 [00:03<00:00, 20.93batch/s, batch_loss=0.641]


Epoch 13/50: 100%|██████████| 66/66 [00:03<00:00, 20.77batch/s, batch_loss=0.693]


Epoch 14/50: 100%|██████████| 66/66 [00:03<00:00, 21.52batch/s, batch_loss=0.531]


Epoch 15/50: 100%|██████████| 66/66 [00:03<00:00, 20.94batch/s, batch_loss=0.695]


Epoch 16/50: 100%|██████████| 66/66 [00:03<00:00, 20.47batch/s, batch_loss=0.578]


Epoch 17/50: 100%|██████████| 66/66 [00:03<00:00, 20.36batch/s, batch_loss=0.49] 


Epoch 18/50: 100%|██████████| 66/66 [00:03<00:00, 20.23batch/s, batch_loss=0.871]


Epoch 19/50: 100%|██████████| 66/66 [00:03<00:00, 20.47batch/s, batch_loss=0.671]


Epoch 20/50: 100%|██████████| 66/66 [00:03<00:00, 20.38batch/s, batch_loss=0.689]


Epoch 21/50: 100%|██████████| 66/66 [00:03<00:00, 21.13batch/s, batch_loss=0.381]


Epoch 22/50: 100%|██████████| 66/66 [00:03<00:00, 20.21batch/s, batch_loss=0.601]


Epoch 23/50: 100%|██████████| 66/66 [00:03<00:00, 20.45batch/s, batch_loss=0.717]


Epoch 24/50: 100%|██████████| 66/66 [00:03<00:00, 21.50batch/s, batch_loss=0.621]


Epoch 25/50: 100%|██████████| 66/66 [00:03<00:00, 21.55batch/s, batch_loss=0.497]


Epoch 26/50: 100%|██████████| 66/66 [00:02<00:00, 22.11batch/s, batch_loss=0.37] 


Epoch 27/50: 100%|██████████| 66/66 [00:03<00:00, 20.89batch/s, batch_loss=0.705]


Epoch 28/50: 100%|██████████| 66/66 [00:03<00:00, 20.22batch/s, batch_loss=0.526]


Epoch 29/50: 100%|██████████| 66/66 [00:03<00:00, 20.52batch/s, batch_loss=0.727]


Epoch 30/50: 100%|██████████| 66/66 [00:03<00:00, 20.75batch/s, batch_loss=0.305]


Epoch 31/50: 100%|██████████| 66/66 [00:03<00:00, 20.52batch/s, batch_loss=0.586]


Epoch 32/50: 100%|██████████| 66/66 [00:03<00:00, 21.44batch/s, batch_loss=0.559]


Epoch 33/50: 100%|██████████| 66/66 [00:03<00:00, 21.90batch/s, batch_loss=0.336]


Epoch 34/50: 100%|██████████| 66/66 [00:03<00:00, 20.92batch/s, batch_loss=0.328]


Epoch 35/50: 100%|██████████| 66/66 [00:03<00:00, 19.91batch/s, batch_loss=0.392]


Epoch 36/50: 100%|██████████| 66/66 [00:03<00:00, 20.12batch/s, batch_loss=0.687]


Epoch 37/50: 100%|██████████| 66/66 [00:03<00:00, 19.94batch/s, batch_loss=0.492]


Epoch 38/50: 100%|██████████| 66/66 [00:03<00:00, 17.89batch/s, batch_loss=0.492]


Epoch 39/50: 100%|██████████| 66/66 [00:03<00:00, 18.81batch/s, batch_loss=0.87] 


Epoch 40/50: 100%|██████████| 66/66 [00:03<00:00, 18.52batch/s, batch_loss=0.361]


Epoch 41/50: 100%|██████████| 66/66 [00:03<00:00, 20.77batch/s, batch_loss=0.725]


Epoch 42/50: 100%|██████████| 66/66 [00:03<00:00, 20.79batch/s, batch_loss=0.384]


Epoch 43/50: 100%|██████████| 66/66 [00:03<00:00, 20.28batch/s, batch_loss=0.987]


Epoch 44/50: 100%|██████████| 66/66 [00:03<00:00, 19.48batch/s, batch_loss=0.331]


Epoch 45/50: 100%|██████████| 66/66 [00:03<00:00, 20.74batch/s, batch_loss=0.577]


Epoch 46/50: 100%|██████████| 66/66 [00:03<00:00, 20.79batch/s, batch_loss=0.804]


Epoch 47/50: 100%|██████████| 66/66 [00:03<00:00, 20.18batch/s, batch_loss=0.979]


Epoch 48/50: 100%|██████████| 66/66 [00:03<00:00, 20.67batch/s, batch_loss=0.483]


Epoch 49/50: 100%|██████████| 66/66 [00:03<00:00, 20.63batch/s, batch_loss=0.667]


Epoch 50/50: 100%|██████████| 66/66 [00:03<00:00, 20.95batch/s, batch_loss=0.378]


In [None]:
# Model Save
# Save the trained model; the value returned here is written to the config
# as "model_file" in a later cell.
cls_model = save_model(
    model,
    model.name,
    MODEL_PATH,
    EPOCHS,
    LEARNING_RATE,
    "cls",
)

## Classification Testing

In [None]:
# Evaluate on the test loader with the training criterion and edge tensor.
# Kept as the final expression so the returned metrics are displayed.
# NOTE(review): meaning of the trailing positional True is not visible here — confirm.
classification.evaluate(
    model,
    test_loader,
    device,
    criterion,
    edges,
    True,
)

{'accuracy': 0.9732142857142857,
 'loss': 0.12085052047457014,
 'precision': 1.0,
 'recall': 0.9032258064516129,
 'f1_score': 0.9491525423728814}

In [26]:

# Collect everything recorded about this classification run.
cls_settings = {
    "model_file": cls_model,
    "architecture": model.name,
    "labels": LABELS,
    "hidden_dim": HIDDEN_DIM,
    # Tensor converted to nested lists so it can be stored as plain data.
    "edges": edges.tolist(),
    "epochs": EPOCHS,
    "batch_size": NUM_BATCH,
    "learning_rate": LEARNING_RATE,
    # Class names only, e.g. the optimizer/loss types used in training.
    "optimizer": type(optimizer).__name__,
    "criterion": type(criterion).__name__,
    "evaluation_metric": "precision_recall_fscore",
    "device_like_type": DeviceLikeType,
}

# Hand the settings to update_config as "model" -> "classification".
update_config(CONFIG_PATH, "model", {"classification": cls_settings})

Updated config.json model successfully.
