In [1]:
import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
# Read the per-frame landmark table and order it by video, then by frame,
# so that consecutive rows of one video are consecutive frames.
df = (
    pd.read_csv("pose_landmarks_all.csv")
    .sort_values(by=["video_name", "frame"])
    .reset_index(drop=True)
)

# Every column that is not bookkeeping metadata is treated as a pose feature.
meta_cols = ["frame", "video_name", "rep_counter", "phase"]
pose_cols = [col for col in df.columns if col not in meta_cols]

# Encode the string phase names as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(df["phase"])
df["phase_label"] = label_encoder.transform(df["phase"])

# Standardize the pose features column-wise.
# NOTE(review): the scaler is fit on every row, i.e. before any train/val/test
# split happens, so held-out statistics influence the scaling.
scaler = StandardScaler()
scaler.fit(df[pose_cols])
df[pose_cols] = scaler.transform(df[pose_cols])

print(df.loc[:, ["phase", "phase_label"]].head())


✅ Data prepared.
  phase  phase_label
0    S1            0
1    S1            0
2    S1            0
3    S1            0
4    S1            0


In [None]:
def create_windows_for_video(data, labels, window_size=30, stride=5):
    """Cut one video's frame sequence into fixed-length, possibly overlapping windows.

    Each window is labelled with the label of its middle frame, i.e. the frame
    at index ``start + window_size // 2``.

    Args:
        data: Array-like of per-frame features, indexed by frame along axis 0.
        labels: Array-like holding one label per frame (same length as ``data``).
        window_size: Number of consecutive frames in each window.
        stride: Step, in frames, between the starts of consecutive windows.

    Returns:
        Tuple ``(X, y)`` of numpy arrays. ``X`` has shape
        ``(n_windows, window_size, *data.shape[1:])`` and ``y`` has shape
        ``(n_windows,)``. When the video is shorter than ``window_size`` both
        arrays are empty but keep those trailing dimensions, so they can still
        be concatenated with the windows of other videos.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive, or if
            ``data`` and ``labels`` differ in length.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if window_size <= 0 or stride <= 0:
        raise ValueError("window_size and stride must be positive")
    if len(data) != len(labels):
        raise ValueError("data and labels must have the same number of frames")

    # "+ 1" keeps the last full window (the one ending on the final frame);
    # without it a video of exactly `window_size` frames produces no window.
    starts = np.arange(0, len(data) - window_size + 1, stride)
    if starts.size == 0:
        empty_X = np.empty((0, window_size) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=labels.dtype)
        return empty_X, empty_y

    X = np.stack([data[s:s + window_size] for s in starts])
    y = labels[starts + window_size // 2]  # label of the middle frame of each window
    return X, y

# Windowing hyper-parameters, named once instead of repeated as literals.
WINDOW_SIZE = 30
STRIDE = 5

X_parts, y_parts = [], []

# Group by video so that no window spans two different videos.
for _, group in df.groupby("video_name"):
    X_tmp, y_tmp = create_windows_for_video(
        group[pose_cols].values,
        group["phase_label"].values,
        window_size=WINDOW_SIZE,
        stride=STRIDE,
    )
    # A video too short for a single window contributes nothing; skipping it
    # also avoids concatenating an empty array of a different rank.
    if len(y_tmp) == 0:
        continue
    X_parts.append(X_tmp)
    y_parts.append(y_tmp)

if not X_parts:
    raise ValueError(
        f"No video has enough frames for a window of {WINDOW_SIZE} frames."
    )

X_all = np.concatenate(X_parts, axis=0)
y_all = np.concatenate(y_parts, axis=0)

print(f"Size of windowed data: X={X_all.shape}, y={y_all.shape}")

✅ Created windowed data: X=(2283, 30, 99), y=(2283,)


In [None]:
# Split / loader settings, named once instead of repeated as literals.
SEED = 42
BATCH_SIZE = 32

# 70 % train, then the remaining 30 % is halved into validation and test.
# Both splits are stratified on the window label.
# NOTE(review): windows are 30 frames long with stride 5, so neighbouring
# windows of the same video share most of their frames. Splitting windows at
# random therefore puts near-duplicate samples into train, val and test;
# consider splitting by video instead and re-checking the reported accuracy.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all, test_size=0.3, random_state=SEED, stratify=y_all
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Convert to PyTorch tensors (float32 features, int64 class ids).
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
X_val_t   = torch.tensor(X_val, dtype=torch.float32)
y_val_t   = torch.tensor(y_val, dtype=torch.long)
X_test_t  = torch.tensor(X_test, dtype=torch.float32)
y_test_t  = torch.tensor(y_test, dtype=torch.long)

# Datasets
train_ds = TensorDataset(X_train_t, y_train_t)
val_ds = TensorDataset(X_val_t, y_val_t)
test_ds = TensorDataset(X_test_t, y_test_t)

# Only the training loader shuffles; a dedicated seeded generator makes the
# batch order reproducible across kernel restarts.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Train: (1598, 30, 99), Val: (342, 30, 99), Test: (343, 30, 99)


In [None]:
class TransformerAutoencoderClassifier(nn.Module):
    """Transformer encoder shared by a reconstruction head and a classification head.

    The input is expected as ``(batch, seq_len, num_features)``: every
    transformer layer is built with ``batch_first=True`` and the classifier
    flattens exactly ``seq_len * d_model`` values per sample.

    Args:
        num_features: Size of each frame's feature vector.
        seq_len: Number of frames per input window (fixed by the classifier head).
        d_model: Width of the transformer embeddings.
        nhead: Number of attention heads in every layer.
        num_layers: Depth of both the encoder and the decoder stack.
        num_classes: Number of output classes.

    NOTE(review): no positional encoding is added to the embeddings, and the
    decoder receives the projected input itself as its target sequence with no
    mask, so the reconstruction task may be close to a copy operation.
    """

    def __init__(self, num_features, seq_len, d_model=128, nhead=8, num_layers=3, num_classes=4):
        super().__init__()
        self.seq_len = seq_len

        # Frame features -> model width, followed by the encoder stack.
        self.input_proj = nn.Linear(num_features, d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_layers,
        )

        # Reconstruction branch: decoder stack, then model width -> frame features.
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_layers,
        )
        self.output_proj = nn.Linear(d_model, num_features)

        # Classification branch: flatten the whole encoded window, then a 2-layer MLP.
        flat_dim = d_model * seq_len
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_dim, d_model),
            nn.ReLU(),
            nn.Linear(d_model, num_classes),
        )

    def forward(self, x):
        """Return ``(recon_out, class_out)`` for a batch of windows.

        ``recon_out`` has the same shape as ``x``; ``class_out`` holds one row
        of ``num_classes`` unnormalized scores per sample.
        """
        embedded = self.input_proj(x)
        encoded = self.encoder(embedded)
        # The decoder attends over the encoder output, with the embedded input as target.
        recon_out = self.output_proj(self.decoder(embedded, encoded))
        class_out = self.classifier(encoded)
        return recon_out, class_out


In [None]:
# Seed PyTorch before the model is built so that weight initialisation (and
# everything else drawing from the global torch RNG) is reproducible.
torch.manual_seed(42)

seq_len = X_train.shape[1]
num_features = X_train.shape[2]
# Take the class count from the encoder rather than from the labels that
# happen to occur in the windows: label ids run up to len(classes_) - 1, so a
# class with no window would otherwise leave the output layer too small.
num_classes = len(label_encoder.classes_)

model = TransformerAutoencoderClassifier(num_features, seq_len, num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
mse_loss = nn.MSELoss()  # reconstruction loss
ce_loss = nn.CrossEntropyLoss()  # classification loss

epochs = 30


In [None]:
# Fixed weight of the classification loss relative to the reconstruction loss.
CLASS_LOSS_WEIGHT = 0.5


def _run_epoch(model, loader, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; when it is ``None`` the model is put in eval mode and no
    gradients are computed. The loss is the reconstruction MSE plus
    ``CLASS_LOSS_WEIGHT`` times the classification cross-entropy, averaged
    over the number of batches.
    """
    is_train = optimizer is not None
    model.train(is_train)
    loss_sum, correct, seen = 0.0, 0, 0

    with torch.set_grad_enabled(is_train):
        # Batch names are deliberately distinct from X_val / y_val etc. so the
        # arrays produced by the split cell are not overwritten.
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            if is_train:
                optimizer.zero_grad()
            recon_out, class_out = model(batch_x)

            loss_recon = mse_loss(recon_out, batch_x)
            loss_class = ce_loss(class_out, batch_y)
            loss = loss_recon + CLASS_LOSS_WEIGHT * loss_class

            if is_train:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            preds = torch.argmax(class_out, dim=1)
            correct += (preds == batch_y).sum().item()
            seen += batch_y.size(0)

    return loss_sum / len(loader), correct / seen


for epoch in range(epochs):
    train_loss, train_acc = _run_epoch(model, train_loader, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader)

    print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | Train Acc: {train_acc:.3f} | Val Acc: {val_acc:.3f}")


Epoch [1/30] | Train Loss: 0.9498 | Val Loss: 0.5377 | Train Acc: 0.714 | Val Acc: 0.825
Epoch [2/30] | Train Loss: 0.4724 | Val Loss: 0.3794 | Train Acc: 0.844 | Val Acc: 0.877
Epoch [3/30] | Train Loss: 0.3441 | Val Loss: 0.2903 | Train Acc: 0.880 | Val Acc: 0.880
Epoch [4/30] | Train Loss: 0.2895 | Val Loss: 0.2608 | Train Acc: 0.888 | Val Acc: 0.889
Epoch [5/30] | Train Loss: 0.2283 | Val Loss: 0.2201 | Train Acc: 0.916 | Val Acc: 0.889
Epoch [6/30] | Train Loss: 0.1939 | Val Loss: 0.1764 | Train Acc: 0.931 | Val Acc: 0.921
Epoch [7/30] | Train Loss: 0.1800 | Val Loss: 0.1744 | Train Acc: 0.936 | Val Acc: 0.912
Epoch [8/30] | Train Loss: 0.1608 | Val Loss: 0.2074 | Train Acc: 0.937 | Val Acc: 0.883
Epoch [9/30] | Train Loss: 0.1513 | Val Loss: 0.1536 | Train Acc: 0.934 | Val Acc: 0.915
Epoch [10/30] | Train Loss: 0.1332 | Val Loss: 0.1814 | Train Acc: 0.942 | Val Acc: 0.898
Epoch [11/30] | Train Loss: 0.1363 | Val Loss: 0.1337 | Train Acc: 0.933 | Val Acc: 0.936
Epoch [12/30] | Tra

In [None]:
# Evaluate the trained model on the held-out test windows.
model.eval()
true_batches, pred_batches = [], []

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        # Only the classification output is needed; the reconstruction is discarded.
        _, class_out = model(x_batch.to(device))
        pred_batches.append(torch.argmax(class_out, dim=1).cpu())
        true_batches.append(y_batch)  # labels stay on the CPU

y_true = torch.cat(true_batches).numpy()
y_pred = torch.cat(pred_batches).numpy()

print("Test Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:")
# Pass the full label set explicitly: without it the report raises when some
# class is absent from both y_true and y_pred, because the number of
# target_names no longer matches the number of labels found.
print(classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))


✅ Test Accuracy: 0.9533527696793003

Classification Report:
              precision    recall  f1-score   support

          S1       0.96      0.98      0.97       163
          S2       0.90      0.92      0.91        49
          S3       0.96      0.98      0.97       104
          S4       0.95      0.78      0.86        27

    accuracy                           0.95       343
   macro avg       0.95      0.91      0.93       343
weighted avg       0.95      0.95      0.95       343



In [None]:
# Persist the trained weights (state_dict only, not the module object).
torch.save(model.state_dict(), "transformer_multi_task.pth")

# Persist the fitted preprocessors alongside the weights.
for artifact, path in ((scaler, "pose_scaler.pkl"), (label_encoder, "phase_encoder.pkl")):
    joblib.dump(artifact, path)

print("Model and preprocessors saved.")


✅ Model and preprocessors saved.
