In [3]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
import numpy as np

# --- 1. CONFIGURATION ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SFM_CSV = "sfm_features.csv"
BATCH_SIZE = 8

# --- 2. RESTORE DATA & CLASSES ---
full_df = pd.read_csv(SFM_CSV)

label_encoder = LabelEncoder()
full_df['label_encoded'] = label_encoder.fit_transform(full_df['label_name'])
NUM_CLASSES = len(label_encoder.classes_) # <--- This fixes your NameError
print(f"‚úÖ Restored NUM_CLASSES: {NUM_CLASSES} ({label_encoder.classes_})")

# --- 3. RESTORE DATASET CLASS ---
class VoicePathologyDataset(Dataset):
    def __init__(self, df, sfm_scaler, sfm_cols):
        self.df = df.reset_index(drop=True)
        self.sfm_scaler = sfm_scaler
        self.sfm_cols = sfm_cols

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        file_path = row['file_path']
        # We only need the path for Audio-Only model, but we keep format consistent
        raw_sfm = row[self.sfm_cols].values.astype(np.float32)
        norm_sfm = self.sfm_scaler.transform([raw_sfm])[0]
        sfm_tensor = torch.tensor(norm_sfm, dtype=torch.float32)
        label = torch.tensor(row['label_encoded'], dtype=torch.long)
        return file_path, sfm_tensor, label

# --- 4. RESTORE LOADERS ---
sfm_cols = ['jitter_local', 'jitter_rap', 'shimmer_local', 'shimmer_apq3', 
            'hnr', 'f1', 'f2', 'f3', 'f4', 'f0_mean']
scaler = StandardScaler()
train_subset = full_df[full_df['split'] == 'train']
scaler.fit(train_subset[sfm_cols].values)

# Custom Collate (Required for Audio-MAE)
def custom_collate(batch):
    paths = [item[0] for item in batch] 
    sfms = torch.stack([item[1] for item in batch])
    labels = torch.stack([item[2] for item in batch])
    return paths, sfms, labels

train_ds = VoicePathologyDataset(full_df[full_df['split'] == 'train'], scaler, sfm_cols)
val_ds = VoicePathologyDataset(full_df[full_df['split'] == 'val'], scaler, sfm_cols)
test_ds = VoicePathologyDataset(full_df[full_df['split'] == 'test'], scaler, sfm_cols)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate)

print("‚úÖ Data Loaders Ready!")

‚úÖ Restored NUM_CLASSES: 6 (['Cysts_Structural' 'Dysarthia' 'Laryngitis' 'Vox senilis' 'parkinson'
 'spasmodische_dysphonie'])
‚úÖ Data Loaders Ready!


In [4]:
import torch.nn as nn
from transformers import AutoModel
from sklearn.metrics import accuracy_score, classification_report

# --- MODEL DEFINITION ---
class AudioOnlyModel(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        print("üéß Loading Audio-MAE (Audio Only)...")
        self.audio_encoder = AutoModel.from_pretrained(
            "hance-ai/audiomae", 
            trust_remote_code=True
        )
        self.hidden_size = 768 
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes)
        )

    def forward(self, audio_paths):
        audio_feats_list = []
        for path in audio_paths:
            feat = self.audio_encoder(path) 
            audio_feats_list.append(feat)
            
        audio_feats = torch.stack(audio_feats_list)
        device = self.classifier[0].weight.device
        audio_feats = audio_feats.to(device)
        
        # Global Average Pooling (Collapse 8x64 grid -> 1 vector)
        audio_emb = audio_feats.mean(dim=(2, 3)) 
        return self.classifier(audio_emb)

# --- TRAINING LOOP ---
AUDIO_ONLY_LR = 2e-5
EPOCHS = 15

audio_model = AudioOnlyModel(num_classes=NUM_CLASSES).to(DEVICE)
optimizer = torch.optim.AdamW(audio_model.parameters(), lr=AUDIO_ONLY_LR, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

print("\nüî• Starting Audio-Only Baseline Training...")
best_acc = 0.0

for epoch in range(EPOCHS):
    audio_model.train()
    total_loss = 0
    
    for paths, sfms, labels in train_loader:
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        logits = audio_model(paths)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    # Validation
    audio_model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for paths, sfms, labels in val_loader:
            labels = labels.to(DEVICE)
            logits = audio_model(paths)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
    val_acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f}")
    
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(audio_model.state_dict(), "baseline_audio_only.pth")

print(f"\nüèÜ Best Audio-Only Accuracy: {best_acc*100:.2f}%")

üéß Loading Audio-MAE (Audio Only)...





üî• Starting Audio-Only Baseline Training...
Epoch 1/15 | Loss: 1.6823 | Val Acc: 0.5216
Epoch 2/15 | Loss: 1.6010 | Val Acc: 0.5529
Epoch 3/15 | Loss: 1.5552 | Val Acc: 0.5769
Epoch 4/15 | Loss: 1.5060 | Val Acc: 0.5865
Epoch 5/15 | Loss: 1.4747 | Val Acc: 0.5865
Epoch 6/15 | Loss: 1.4379 | Val Acc: 0.5938
Epoch 7/15 | Loss: 1.4125 | Val Acc: 0.5986
Epoch 8/15 | Loss: 1.3877 | Val Acc: 0.6010
Epoch 9/15 | Loss: 1.3613 | Val Acc: 0.6106
Epoch 10/15 | Loss: 1.3413 | Val Acc: 0.6130
Epoch 11/15 | Loss: 1.3241 | Val Acc: 0.6202
Epoch 12/15 | Loss: 1.3074 | Val Acc: 0.6274
Epoch 13/15 | Loss: 1.2858 | Val Acc: 0.6130
Epoch 14/15 | Loss: 1.2675 | Val Acc: 0.6514
Epoch 15/15 | Loss: 1.2559 | Val Acc: 0.6538

üèÜ Best Audio-Only Accuracy: 65.38%


In [5]:
from sklearn.metrics import classification_report

# Load Best Baseline
audio_model.load_state_dict(torch.load("baseline_audio_only.pth"))
audio_model.eval()

print("üöÄ Evaluating Audio-Only Model on Test Set...")

# Use Test Loader (defined in previous steps)
all_preds = []
all_labels = []

with torch.no_grad():
    for paths, sfms, labels in test_loader:
        labels = labels.to(DEVICE)
        
        logits = audio_model(paths)
        preds = torch.argmax(logits, dim=1)
        
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Print Report
print(f"\nüìä Audio-Only Baseline Results:")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

üöÄ Evaluating Audio-Only Model on Test Set...

üìä Audio-Only Baseline Results:
                        precision    recall  f1-score   support

      Cysts_Structural       0.29      0.09      0.14        22
             Dysarthia       0.70      0.71      0.71        42
            Laryngitis       0.25      0.43      0.32        42
           Vox senilis       0.46      0.41      0.43        93
             parkinson       0.78      0.84      0.81       200
spasmodische_dysphonie       0.00      0.00      0.00        20

              accuracy                           0.61       419
             macro avg       0.41      0.41      0.40       419
          weighted avg       0.59      0.61      0.59       419



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
