# Tone Classifier

### Imports

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report



from data_loader import extract_features

### Dataset

In [2]:
# Directory that holds the .wav recordings (relative to the working directory).
data_dir = 'data/'

# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting
# makes everything derived from this list (DataFrame row order, and therefore
# the seeded train/test split further down) identical on every run and machine.
wav_files_list = sorted(f for f in os.listdir(data_dir) if f.endswith('.wav'))
# print(wav_files_list)

In [3]:
# Build one {'file', 'label'} record per recording. The label is the third
# underscore-separated token of the file name; names with fewer than three
# tokens are skipped.
data = []
for wav_name in wav_files_list:
    tokens = wav_name.split('_')
    if len(tokens) <= 2:
        continue
    data.append({'file': os.path.join(data_dir, wav_name), 'label': tokens[2]})

df = pd.DataFrame(data)
df.head()

Unnamed: 0,file,label
0,data/1001_DFA_ANG_XX.wav,ANG
1,data/1001_DFA_DIS_XX.wav,DIS
2,data/1001_DFA_FEA_XX.wav,FEA
3,data/1001_DFA_HAP_XX.wav,HAP
4,data/1001_DFA_NEU_XX.wav,NEU


## Tone Classification model

For lightweight tone classification we will use SVM and XGBoost.
For sequential tone analysis we use LSTMs.

In [4]:
# Every recording is forced to exactly max_len frames so the samples can be
# stacked into a single 3-D array.
max_len = 100
X = []
y = []

for record in df.itertuples(index=False):
    features = extract_features(record.file)  # shape: (time_steps, feature_dim)
    if features is None:
        # No features came back for this file: leave it out of X and y.
        continue

    feature_dim = features.shape[1]
    missing_frames = max_len - features.shape[0]

    if missing_frames < 0:
        # Too long: keep only the first max_len frames.
        features = features[:max_len, :]
    elif missing_frames > 0:
        # Too short: append all-zero frames at the end.
        zero_frames = np.zeros((missing_frames, feature_dim))
        features = np.vstack([features, zero_frames])

    X.append(features)
    y.append(record.label)

X = np.array(X)
y = np.array(y)


In [5]:
# Encode labels: map each emotion string in y to an integer class id (the loss
# used for training takes integer targets). `le` is kept so the ids can be
# mapped back to names through le.classes_.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [6]:
# X.shape = (samples, time_steps, feature_dim)
n_samples, time_steps, n_features = X.shape

# Split BEFORE fitting the scaler. Fitting StandardScaler on all samples and
# splitting afterwards lets the held-out test recordings influence the
# mean/std used to normalise the training data (data leakage), which makes the
# test score optimistic. The split arguments are unchanged, so the same
# samples land in train and test as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, stratify=y_encoded, random_state=42
)

# StandardScaler works on 2D input, so frames are flattened to
# (samples * time_steps, n_features); statistics are therefore per feature,
# computed over all training frames only.
scaler = StandardScaler()
scaler.fit(X_train_raw.reshape(-1, n_features))

# Apply the training statistics to both splits and restore the 3D layout.
X_train = scaler.transform(X_train_raw.reshape(-1, n_features)).reshape(X_train_raw.shape)
X_test = scaler.transform(X_test_raw.reshape(-1, n_features)).reshape(X_test_raw.shape)

# Kept so any later cell that refers to these names still works: the whole
# dataset normalised with the train-only statistics.
X_reshaped = X.reshape(-1, n_features)
X_scaled_reshaped = scaler.transform(X_reshaped)
X_scaled = X_scaled_reshaped.reshape(n_samples, time_steps, n_features)


In [None]:
# LSTM
# class CNNLSTMToneClassifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(CNNLSTMToneClassifier, self).__init__()

#         self.cnn = nn.Sequential(
#             nn.Conv1d(input_dim, 64, kernel_size=3, padding=1),
#             nn.ReLU(),
#             nn.BatchNorm1d(64),
            
#             nn.Conv1d(64, 128, kernel_size=3, padding=1),
#             nn.ReLU(),
#             nn.BatchNorm1d(128),
#             nn.MaxPool1d(kernel_size=2)
#         )

#         self.lstm = nn.LSTM(input_size=128, hidden_size=hidden_dim, num_layers=2,
#                             batch_first=True, dropout=0.3)

#         self.fc_layers = nn.Sequential(
#             nn.Linear(hidden_dim, hidden_dim),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(hidden_dim, output_dim)
#         )

#     def forward(self, x):
#         x = x.transpose(1, 2)  # (B, F, T) for CNN
#         x = self.cnn(x)  # (B, 128, T//2)
#         x = x.transpose(1, 2)  # (B, T//2, 128) for LSTM
#         _, (hn, _) = self.lstm(x)
#         out = self.fc_layers(hn[-1]) # Use output from last LSTM layer
#         return out

class AttentionPooling(nn.Module):
    """Collapse the time axis of a (B, T, F) tensor with learned attention.

    A single linear layer scores every time step; the scores are normalised
    with a softmax over the time axis and used as weights for a weighted sum
    of the frames.
    """

    def __init__(self, input_dim):
        """Create the scoring layer that maps `input_dim` features to one score."""
        super().__init__()
        self.attn = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the attention-weighted sum over time: (B, T, F) -> (B, F)."""
        scores = self.attn(x)             # (B, T, 1), one score per frame
        alpha = torch.softmax(scores, 1)  # normalise across the T frames
        pooled = torch.sum(alpha * x, dim=1)
        return pooled

class CNNAttentionToneClassifier(nn.Module):
    """Tone classifier: 1-D CNN over time, attention pooling, then an MLP head.

    Input is a batch of feature sequences shaped (B, T, input_dim); output is
    one row of `output_dim` class logits per sequence.
    """

    def __init__(self, input_dim, output_dim):
        """Build the three Conv1d-ReLU-BatchNorm stages, the pooling and the head."""
        super().__init__()

        # Channel widths of the three convolution stages: in -> 64 -> 128 -> 128.
        widths = (input_dim, 64, 128, 128)
        stages = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            stages.append(nn.Conv1d(c_in, c_out, kernel_size=3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.BatchNorm1d(c_out))
        self.cnn = nn.Sequential(*stages)

        self.attention_pool = AttentionPooling(input_dim=128)

        head = [
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, output_dim),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Map (B, T, F) input features to (B, output_dim) logits."""
        # Conv1d wants channels first, so swap to (B, F, T) going in and back
        # to (B, T, 128) coming out.
        conv_out = self.cnn(x.transpose(1, 2))
        frames = conv_out.transpose(1, 2)
        pooled = self.attention_pool(frames)  # (B, 128)
        return self.classifier(pooled)


In [None]:
# K-fold cross-validation on the training set.
#
# For every stratified fold a fresh CNNAttentionToneClassifier is trained with
# a class-weighted cross-entropy loss, Adam, ReduceLROnPlateau and early
# stopping on the validation loss. The weights from the epoch with the lowest
# validation loss are restored before the fold is scored, so the reported
# accuracy and `best_model` describe the best epoch rather than the last
# (already degrading) one.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

input_dim = X_train.shape[2]
hidden_dim = 32  # not used by CNNAttentionToneClassifier; kept for the commented-out CNNLSTMToneClassifier option
output_dim = len(np.unique(y_encoded))
batch_size = 16
epochs = 50

fold_accuracies = []
best_model = None
best_acc = 0

# Weighted loss. It depends only on y_train, so it is built once instead of
# being recomputed identically inside every fold.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=class_weights)


def _make_loader(features, targets, shuffle=False):
    """Wrap numpy features/targets in a DataLoader yielding (float32, long) batches."""
    dataset = TensorDataset(torch.tensor(features, dtype=torch.float32),
                            torch.tensor(targets, dtype=torch.long))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def _evaluate(net, loader, loss_fn=None):
    """Run `net` over `loader` in eval mode without gradients.

    Returns (mean_batch_loss, predictions, labels); mean_batch_loss is None
    when no `loss_fn` is given.
    """
    net.eval()
    preds, labels = [], []
    loss_total = 0.0
    with torch.no_grad():
        for xb, yb in loader:
            outputs = net(xb)
            if loss_fn is not None:
                loss_total += loss_fn(outputs, yb).item()
            _, predicted = torch.max(outputs, 1)
            preds.extend(predicted.cpu().numpy())
            labels.extend(yb.cpu().numpy())
    mean_loss = loss_total / len(loader) if loss_fn is not None else None
    return mean_loss, preds, labels


for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\n=== Fold {fold+1}/{n_splits} ===")

    # Loaders
    train_loader = _make_loader(X_train[train_idx], y_train[train_idx], shuffle=True)
    val_loader = _make_loader(X_train[val_idx], y_train[val_idx])

    # Model
    # model = CNNLSTMToneClassifier(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    model = CNNAttentionToneClassifier(input_dim=input_dim, output_dim=output_dim)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5)

    best_val_loss = float('inf')
    best_state = None  # snapshot of the weights at the best validation loss
    patience = 5
    patience_counter = 0
    all_preds, all_labels = [], []
    val_preds, val_labels = [], []

    for epoch in range(epochs):
        model.train()
        running_train_loss = 0.0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            outputs = model(xb)
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.item()

        avg_train_loss = running_train_loss / len(train_loader)

        # Training accuracy (measured in eval mode, after the epoch's updates)
        _, train_preds, train_labels = _evaluate(model, train_loader)
        train_acc = accuracy_score(train_labels, train_preds)

        # Validation
        avg_val_loss, val_preds, val_labels = _evaluate(model, val_loader, criterion)
        val_acc = accuracy_score(val_labels, val_preds)

        print(f"Epoch {epoch+1:02d}/{epochs} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

        scheduler.step(avg_val_loss)

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # state_dict() returns references to the live tensors, so clone
            # them; otherwise later optimizer steps would overwrite the snapshot.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            all_preds, all_labels = val_preds, val_labels
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    if best_state is None:
        # The validation loss never improved (e.g. it was NaN throughout):
        # fall back to the last epoch's predictions.
        all_preds, all_labels = val_preds, val_labels
    else:
        # Roll back to the best epoch so the model kept in `best_model`
        # matches the predictions scored below.
        model.load_state_dict(best_state)

    fold_acc = accuracy_score(all_labels, all_preds)
    fold_accuracies.append(fold_acc)

    if fold_acc > best_acc:
        best_acc = fold_acc
        best_model = model

    print("Classification Report:\n", classification_report(all_labels, all_preds, target_names=le.classes_))

print(f"\n=== Average Accuracy over {n_splits} folds: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")


=== Fold 1/5 ===
Epoch 01/50 | Train Loss: 1.6896 | Train Acc: 0.2070 | Val Loss: 1.7916 | Val Acc: 0.2113
Epoch 02/50 | Train Loss: 1.6461 | Train Acc: 0.3233 | Val Loss: 1.6003 | Val Acc: 0.3178
Epoch 03/50 | Train Loss: 1.5864 | Train Acc: 0.3640 | Val Loss: 1.5285 | Val Acc: 0.3563
Epoch 04/50 | Train Loss: 1.5983 | Train Acc: 0.3255 | Val Loss: 1.5847 | Val Acc: 0.3250
Epoch 05/50 | Train Loss: 1.5609 | Train Acc: 0.3589 | Val Loss: 1.5525 | Val Acc: 0.3581
Epoch 06/50 | Train Loss: 1.5308 | Train Acc: 0.3766 | Val Loss: 1.4907 | Val Acc: 0.3733
Epoch 07/50 | Train Loss: 1.5145 | Train Acc: 0.4068 | Val Loss: 1.4584 | Val Acc: 0.4029
Epoch 08/50 | Train Loss: 1.5417 | Train Acc: 0.3535 | Val Loss: 1.5170 | Val Acc: 0.3563
Epoch 09/50 | Train Loss: 1.5310 | Train Acc: 0.3963 | Val Loss: 1.4844 | Val Acc: 0.3966
Epoch 10/50 | Train Loss: 1.4983 | Train Acc: 0.3905 | Val Loss: 1.4714 | Val Acc: 0.3832
Epoch 11/50 | Train Loss: 1.4957 | Train Acc: 0.4066 | Val Loss: 1.4772 | Val Acc: