In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [17]:
def calculate_angle(a, b, c):
    """
    Calculates the angle in degrees between three points (a, b, c),
    where 'b' is the vertex.
    """
    a, b, c = np.array(a), np.array(b), np.array(c)
    ba, bc = a - b, c - b
    dot_product = np.dot(ba, bc)
    norm_ba, norm_bc = np.linalg.norm(ba), np.linalg.norm(bc)
    if norm_ba == 0 or norm_bc == 0:
        return 0.0
    cosine_angle = np.clip(dot_product / (norm_ba * norm_bc), -1.0, 1.0)
    return np.degrees(np.arccos(cosine_angle))


def add_pose_angles(df):

    df = df.copy()


    left_knee_angles, right_knee_angles = [], []
    left_hip_angles, right_hip_angles = [], []
    left_torso_angles, right_torso_angles = [], []
    left_ankle_angles, right_ankle_angles = [], []

    for _, row in df.iterrows():
        # --- Left landmarks ---
        left_shoulder = [row['LEFT_SHOULDER_x'], row['LEFT_SHOULDER_y']]
        left_hip = [row['LEFT_HIP_x'], row['LEFT_HIP_y']]
        left_knee = [row['LEFT_KNEE_x'], row['LEFT_KNEE_y']]
        left_ankle = [row['LEFT_ANKLE_x'], row['LEFT_ANKLE_y']]
        left_foot_index = [row['LEFT_FOOT_INDEX_x'], row['LEFT_FOOT_INDEX_y']]

        # --- Right landmarks ---
        right_shoulder = [row['RIGHT_SHOULDER_x'], row['RIGHT_SHOULDER_y']]
        right_hip = [row['RIGHT_HIP_x'], row['RIGHT_HIP_y']]
        right_knee = [row['RIGHT_KNEE_x'], row['RIGHT_KNEE_y']]
        right_ankle = [row['RIGHT_ANKLE_x'], row['RIGHT_ANKLE_y']]
        right_foot_index = [row['RIGHT_FOOT_INDEX_x'], row['RIGHT_FOOT_INDEX_y']]

        # --- Calculate angles ---
        left_knee_angle = calculate_angle(left_hip, left_knee, left_ankle)
        right_knee_angle = calculate_angle(right_hip, right_knee, right_ankle)
        left_hip_angle = calculate_angle(left_shoulder, left_hip, left_knee)
        right_hip_angle = calculate_angle(right_shoulder, right_hip, right_knee)

        left_hip_vertical = [left_hip[0], left_hip[1] - 1]
        right_hip_vertical = [right_hip[0], right_hip[1] - 1]
        left_torso_angle = calculate_angle(left_shoulder, left_hip, left_hip_vertical)
        right_torso_angle = calculate_angle(right_shoulder, right_hip, right_hip_vertical)

        left_ankle_angle = calculate_angle(left_knee, left_ankle, left_foot_index)
        right_ankle_angle = calculate_angle(right_knee, right_ankle, right_foot_index)

        # --- Append to lists ---
        left_knee_angles.append(left_knee_angle)
        right_knee_angles.append(right_knee_angle)
        left_hip_angles.append(left_hip_angle)
        right_hip_angles.append(right_hip_angle)
        left_torso_angles.append(left_torso_angle)
        right_torso_angles.append(right_torso_angle)
        left_ankle_angles.append(left_ankle_angle)
        right_ankle_angles.append(right_ankle_angle)

    # --- Add new columns ---
    df['left_knee_angle'] = left_knee_angles
    df['right_knee_angle'] = right_knee_angles
    df['left_hip_angle'] = left_hip_angles
    df['right_hip_angle'] = right_hip_angles
    df['left_torso_angle'] = left_torso_angles
    df['right_torso_angle'] = right_torso_angles
    df['left_ankle_angle'] = left_ankle_angles
    df['right_ankle_angle'] = right_ankle_angles

    return df

In [18]:
df = pd.read_csv("last_csv_all.csv")
df = add_pose_angles(df)
print(df.head())
df = df.sort_values(by=["video_name", "frame"]).reset_index(drop=True)

df.drop(columns=['NOSE_x','NOSE_y','NOSE_visibility','LEFT_EYE_INNER_x','LEFT_EYE_INNER_y','LEFT_EYE_INNER_visibility','LEFT_EYE_x','LEFT_EYE_y','LEFT_EYE_visibility','LEFT_EYE_OUTER_x','LEFT_EYE_OUTER_y','LEFT_EYE_OUTER_visibility','RIGHT_EYE_INNER_x','RIGHT_EYE_INNER_y','RIGHT_EYE_INNER_visibility','RIGHT_EYE_x','RIGHT_EYE_y','RIGHT_EYE_visibility','RIGHT_EYE_OUTER_x','RIGHT_EYE_OUTER_y','RIGHT_EYE_OUTER_visibility','LEFT_EAR_x','LEFT_EAR_y','LEFT_EAR_visibility','RIGHT_EAR_x','RIGHT_EAR_y','RIGHT_EAR_visibility','MOUTH_LEFT_x','MOUTH_LEFT_y','MOUTH_LEFT_visibility','MOUTH_RIGHT_x','MOUTH_RIGHT_y','MOUTH_RIGHT_visibility'],inplace=True)



meta_cols = ["frame", "video_name", "rep_counter", "phase"]
pose_cols = [c for c in df.columns if c not in meta_cols]


label_encoder = LabelEncoder()
df["phase_label"] = label_encoder.fit_transform(df["phase"])

# Scale numeric features
scaler = StandardScaler()
df[pose_cols] = scaler.fit_transform(df[pose_cols])

print(df[["phase", "phase_label"]].head())


   frame   video_name  rep_counter  NOSE_x  NOSE_y  NOSE_visibility  \
0      1  plus233.mp4            0     234      87         0.999974   
1      2  plus233.mp4            0     234      84         0.999972   
2      3  plus233.mp4            0     233      82         0.999969   
3      4  plus233.mp4            0     233      82         0.999968   
4      5  plus233.mp4            0     235      82         0.999968   

   LEFT_EYE_INNER_x  LEFT_EYE_INNER_y  LEFT_EYE_INNER_visibility  LEFT_EYE_x  \
0               231                77                   0.999970         231   
1               230                74                   0.999968         230   
2               229                72                   0.999967         229   
3               229                73                   0.999966         229   
4               231                73                   0.999965         231   

   ...  RIGHT_FOOT_INDEX_visibility  phase  left_knee_angle  right_knee_angle  \
0  ...     

In [19]:
def create_windows_for_video(data, labels, window_size=30, stride=5):
    X, y = [], []
    for i in range(0, len(data) - window_size, stride):
        X.append(data[i:i+window_size])
        y.append(labels[i+window_size//2])  # get the middle frame of th window
    return np.array(X), np.array(y)

X_all, y_all = [], []

# group by videos so windows doesn't overlap between videos
for _, group in df.groupby("video_name"):
    X_tmp, y_tmp = create_windows_for_video(
        group[pose_cols].values,
        group["phase_label"].values,
        window_size=30,
        stride=5
    )
    X_all.append(X_tmp)
    y_all.append(y_tmp)

X_all = np.concatenate(X_all, axis=0)
y_all = np.concatenate(y_all, axis=0)

print(f"Size of windowed data: X={X_all.shape}, y={y_all.shape}")

Size of windowed data: X=(7046, 30, 74), y=(7046,)


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X_all, y_all, test_size=0.2, random_state=42, stratify=y_all)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Convert to PyTorch tensors
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
X_val_t   = torch.tensor(X_val, dtype=torch.float32)
y_val_t   = torch.tensor(y_val, dtype=torch.long)
X_test_t  = torch.tensor(X_test, dtype=torch.float32)
y_test_t  = torch.tensor(y_test, dtype=torch.long)

# Dataloaders
train_ds = TensorDataset(X_train_t, y_train_t)
val_ds = TensorDataset(X_val_t, y_val_t)
test_ds = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)# shuffle is good for training so model does't see similar consecutive frames every epoch
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Train: (5636, 30, 74), Val: (705, 30, 74), Test: (705, 30, 74)


In [21]:
class TransformerAutoencoderClassifier(nn.Module):
    def __init__(self, num_features, seq_len, d_model=128, nhead=8, num_layers=3, num_classes=4):
        super().__init__()
        self.seq_len = seq_len

        # Encoder Layer
        self.input_proj = nn.Linear(num_features, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Decoder for reconstruction
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.output_proj = nn.Linear(d_model, num_features)

        # Classification Head
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model * seq_len, d_model),
            nn.ReLU(),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, x):
        z = self.input_proj(x)
        memory = self.encoder(z)
        reconstructed = self.decoder(z, memory)
        recon_out = self.output_proj(reconstructed)
        class_out = self.classifier(memory)
        return recon_out, class_out


In [22]:
seq_len = X_train.shape[1]
num_features = X_train.shape[2]
num_classes = len(np.unique(y_all))

model = TransformerAutoencoderClassifier(num_features, seq_len, num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
mse_loss = nn.MSELoss()# for reconstruction loss
ce_loss = nn.CrossEntropyLoss()# for classification loss

epochs = 30


In [23]:
for epoch in range(epochs):
    model.train()
    train_loss, train_correct, total = 0, 0, 0

    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        recon_out, class_out = model(x_batch)

        loss_recon = mse_loss(recon_out, x_batch)
        loss_class = ce_loss(class_out, y_batch)
        loss = loss_recon + 0.5 * loss_class# mix between losses to give more importance for each job that has higher loss

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = torch.argmax(class_out, dim=1)
        train_correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

    train_acc = train_correct / total

    # Validation
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val, y_val = x_val.to(device), y_val.to(device)
            recon_out, class_out = model(x_val)
            loss_recon = mse_loss(recon_out, x_val)
            loss_class = ce_loss(class_out, y_val)
            loss = loss_recon + 0.5 * loss_class

            val_loss += loss.item()
            preds = torch.argmax(class_out, dim=1)
            val_correct += (preds == y_val).sum().item()
            val_total += y_val.size(0)

    val_acc = val_correct / val_total

    print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Val Loss: {val_loss/len(val_loader):.4f} | Train Acc: {train_acc:.3f} | Val Acc: {val_acc:.3f}")


Epoch [1/30] | Train Loss: 0.5543 | Val Loss: 0.3370 | Train Acc: 0.845 | Val Acc: 0.877
Epoch [2/30] | Train Loss: 0.2747 | Val Loss: 0.2458 | Train Acc: 0.894 | Val Acc: 0.906
Epoch [3/30] | Train Loss: 0.1988 | Val Loss: 0.3053 | Train Acc: 0.909 | Val Acc: 0.870
Epoch [4/30] | Train Loss: 0.1668 | Val Loss: 0.1722 | Train Acc: 0.918 | Val Acc: 0.919
Epoch [5/30] | Train Loss: 0.1463 | Val Loss: 0.1724 | Train Acc: 0.921 | Val Acc: 0.913
Epoch [6/30] | Train Loss: 0.1284 | Val Loss: 0.1771 | Train Acc: 0.929 | Val Acc: 0.913
Epoch [7/30] | Train Loss: 0.1215 | Val Loss: 0.1581 | Train Acc: 0.930 | Val Acc: 0.925
Epoch [8/30] | Train Loss: 0.1090 | Val Loss: 0.1429 | Train Acc: 0.940 | Val Acc: 0.922
Epoch [9/30] | Train Loss: 0.1055 | Val Loss: 0.1738 | Train Acc: 0.939 | Val Acc: 0.912
Epoch [10/30] | Train Loss: 0.0992 | Val Loss: 0.1450 | Train Acc: 0.943 | Val Acc: 0.926
Epoch [11/30] | Train Loss: 0.0903 | Val Loss: 0.1868 | Train Acc: 0.948 | Val Acc: 0.913
Epoch [12/30] | Tra

In [24]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch = x_batch.to(device)
        _, class_out = model(x_batch)
        preds = torch.argmax(class_out, dim=1).cpu().numpy()
        y_true.extend(y_batch.numpy())
        y_pred.extend(preds)

print("Test Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))


Test Accuracy: 0.9560283687943263

Classification Report:
              precision    recall  f1-score   support

          S1       0.98      0.99      0.99       356
          S2       0.91      0.85      0.87        91
          S3       0.95      0.96      0.96       195
          S4       0.89      0.89      0.89        63

    accuracy                           0.96       705
   macro avg       0.93      0.92      0.93       705
weighted avg       0.96      0.96      0.96       705



In [25]:
torch.save(model.state_dict(), "transformer_multi_task.pth")
joblib.dump(scaler, "pose_scaler.pkl")
joblib.dump(label_encoder, "phase_encoder.pkl")
print("Model and preprocessors saved.")


Model and preprocessors saved.
