In [4]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split

# -----------------------
# DEVICE SETUP
# -----------------------
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# -----------------------
# LABELS
# -----------------------
label_map = {
    "good": 0,
    "bothknees": 1,
    "buttwink": 2,
    "halfsquat": 3,
    "leanforward": 4,
    "leftknee": 5,
    "rightknee": 6
}

# -----------------------
# CUSTOM DATASET
# -----------------------
class SquatDataset(Dataset):
    def __init__(self, csv_dir, label_map, target_len=74):
        self.csv_dir = csv_dir
        self.files = [f for f in os.listdir(csv_dir) if f.endswith(".csv")]
        self.label_map = label_map
        self.target_len = target_len

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        fname = self.files[idx]
        label_str = fname.split("_")[0].lower()
        label = self.label_map[label_str]

        df = pd.read_csv(os.path.join(self.csv_dir, fname))
        df = df.select_dtypes(include=[np.number])
        data = df.to_numpy()

        if data.shape[0] > self.target_len:
            data = data[:self.target_len]
        elif data.shape[0] < self.target_len:
            pad_len = self.target_len - data.shape[0]
            pad = np.zeros((pad_len, data.shape[1]))
            data = np.vstack((data, pad))

        data = torch.tensor(data, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)
        return data, label

# -----------------------
# MODEL
# -----------------------
class LSTMSquatClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# -----------------------
# TRAINING FUNCTION
# -----------------------
def train(model, train_loader, val_loader, optimizer, criterion, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(X_batch)

            if torch.isnan(output).any() or torch.isnan(X_batch).any():
                print("⚠️ NaNs detected — skipping batch")
                continue

            loss = criterion(output, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            preds = output.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

        train_acc = correct / total if total > 0 else 0
        val_acc = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

# -----------------------
# EVALUATION FUNCTION
# -----------------------
def evaluate(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = model(X_batch)
            preds = output.argmax(dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
    model.train()
    return correct / total if total > 0 else 0

# -----------------------
# MAIN
# -----------------------
if __name__ == "__main__":
    csv_dir = "../outputs_normalized"
    full_dataset = SquatDataset(csv_dir, label_map)

    # Stratified split: train (80%) + temp (20%)
    indices = list(range(len(full_dataset)))
    train_idx, temp_idx = train_test_split(
        indices,
        test_size=0.2,
        stratify=[label_map[f.split("_")[0].lower()] for f in full_dataset.files],
        random_state=42
    )

    # Stratified split: val (10%) + test (10%) from temp
    val_idx, test_idx = train_test_split(
        temp_idx,
        test_size=0.5,
        stratify=[label_map[full_dataset.files[i].split("_")[0].lower()] for i in temp_idx],
        random_state=42
    )

    # Datasets & Loaders
    train_dataset = Subset(full_dataset, train_idx)
    val_dataset = Subset(full_dataset, val_idx)
    test_dataset = Subset(full_dataset, test_idx)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Model
    sample_input, _ = full_dataset[0]
    input_size = sample_input.shape[1]

    model = LSTMSquatClassifier(input_size=input_size, hidden_size=64, num_classes=7).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    train(model, train_loader, val_loader, optimizer, criterion, epochs=200)

Using device: mps
Epoch 1/200 | Train Loss: 26.4863 | Train Acc: 0.3102 | Val Acc: 0.3333
Epoch 2/200 | Train Loss: 25.6953 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 3/200 | Train Loss: 25.5524 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 4/200 | Train Loss: 25.3633 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 5/200 | Train Loss: 25.2498 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 6/200 | Train Loss: 24.9527 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 7/200 | Train Loss: 24.6096 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 8/200 | Train Loss: 24.1742 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 9/200 | Train Loss: 23.8622 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 10/200 | Train Loss: 23.4393 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 11/200 | Train Loss: 23.0292 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 12/200 | Train Loss: 23.0066 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 13/200 | Train Loss: 22.8109 | Train Acc: 0.3333 | Val Acc: 0.3333
Epoch 14/200 | Train Loss: 22.7743 | Train

In [3]:
import os
import pandas as pd
import numpy as np

csv_dir = "../outputs_normalized"
bad_files = []

for fname in os.listdir(csv_dir):
    if not fname.endswith(".csv"):
        continue

    path = os.path.join(csv_dir, fname)
    try:
        df = pd.read_csv(path)

        # Keep only numeric columns
        df = df.select_dtypes(include=[np.number])

        if df.shape[1] == 0:
            print(f"❌ No numeric columns: {fname}")
            bad_files.append(fname)
            continue

        if df.isna().all(axis=1).any():
            print(f"❌ Row(s) full of NaNs: {fname}")
            bad_files.append(fname)
            continue

        if df.dropna().empty:
            print(f"❌ All rows have NaNs or are empty: {fname}")
            bad_files.append(fname)
            continue

    except Exception as e:
        print(f"❌ Failed to process {fname}: {e}")
        bad_files.append(fname)

print("\n🔍 Scan complete.")
print(f"Total files scanned: {len(os.listdir(csv_dir))}")
print(f"Bad files found: {len(bad_files)}")


🔍 Scan complete.
Total files scanned: 361
Bad files found: 0


In [2]:
import os
import shutil

# Path to the directory
csv_dir = "../outputs_normalized"

# Get list of all CSV files
all_files = [f for f in os.listdir(csv_dir) if f.endswith(".csv")]

# Track how many were duplicated
dup_count = 0

for filename in all_files:
    if not filename.lower().startswith("good_"):
        original_path = os.path.join(csv_dir, filename)

        # Construct a new filename
        name, ext = os.path.splitext(filename)
        new_filename = f"{name}_dup{ext}"
        new_path = os.path.join(csv_dir, new_filename)

        # Copy the file
        shutil.copyfile(original_path, new_path)
        dup_count += 1

print(f"✅ Duplicated {dup_count} files.")

✅ Duplicated 180 files.
