In [1]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import eval_metrics as em

### Padding and Data Loader

In [None]:

PADDING_VALUE = 0.0         
BATCH_SIZE = 32
LABEL_BONAFIDE = 1
LABEL_SPOOF = 0


# --- Process pitch sequences and match them with labels ---
class PitchDataset(Dataset):
    def __init__(self, feature_path, label_file_path):
        
        # NOTE: watch out for numpy compatibility 
        self.labels = torch.tensor(pd.read_pickle(label_file_path), dtype=torch.long)
        features = pd.read_pickle(feature_path)
        # NOTE: need to take the transpose: original shape (feature_dimension, timeframe)
        self.processed_features = [torch.tensor(arr, dtype=torch.torch.float32).T for arr in features]
        

    def __len__(self):
        """Returns the total number of matched samples in the dataset."""
        return len(self.processed_features)

    def __getitem__(self, idx):
        """
        Returns one sample from the dataset: a preprocessed pitch sequence and its label.
        """
        feature_sequence = self.processed_features[idx]
        label = self.labels[idx]
        return feature_sequence, label

# --- Custom Collate Function for Dynamic Padding  ---
def collate_fn(batch, padding_value=PADDING_VALUE):
    """
    Pads sequences within a batch to the same length.
    """
    sequences = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
    labels = torch.stack(labels)
    if padded_sequences.ndim == 2:
        padded_sequences = padded_sequences.unsqueeze(2)    # required dim: (batch_size, seq_len, input_dim)
    return padded_sequences, lengths, labels


>NOTE: ^Many PyTorch layers, especially those dealing with sequences like LSTMs, GRUs, or 1D Convolutional layers, expect input in a 3D format.If your pad_sequence results in a (batch_size, sequence_length) tensor because each time step in your sequence is represented by a single number, unsqueeze(2) reshapes it to (batch_size, sequence_length, 1). This '1' signifies that there is one feature (or channel) per time step in the sequence. This makes the tensor compatible with layers that expect this 3D input.

In [None]:
### For remote server
mfcc_train_path = '/home/users1/liqe/TeamLab_phonetics/features_qianru/mfcc_train_60_cnn.pkl'
mfcc_dev_path = '/home/users1/liqe/TeamLab_phonetics/features_qianru/mfcc_dev_60_cnn.pkl'

In [None]:
### for remote
label_train_path = '/home/users1/liqe/TeamLab_phonetics/features_qianru/mfcc_train_60_cnn_label.pkl'
label_dev_path = '/home/users1/liqe/TeamLab_phonetics/features_qianru/mfcc_dev_60_cnn_label.pkl'

In [9]:
mfcc_dataset_train = PitchDataset(feature_path=mfcc_train_path,
                                     label_file_path=label_train_path)

train_dataloader = DataLoader(
    mfcc_dataset_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

mfcc_dataset_dev = PitchDataset(feature_path=mfcc_dev_path,
                                     label_file_path=label_dev_path)

dev_dataloader = DataLoader(
    mfcc_dataset_dev, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

## For inspection
for i, batch_data in enumerate(train_dataloader):
    # batch_data is a tuple: (padded_sequences, lengths, labels)
    batch_sequences, batch_lengths, batch_labels = batch_data
    print(f"\n--- Batch {i+1} ---")
    print(f"  Padded Sequences Shape: {batch_sequences.shape}")
    print(f"  Original Lengths (first 5): {batch_lengths[:5]}")
    print(f"  Labels (first 5): {batch_labels[:5]}")
    

    if i == 0: # Break after the first batch for inspection
        break



--- Batch 1 ---
  Padded Sequences Shape: torch.Size([32, 221, 60])
  Original Lengths (first 5): tensor([ 72, 221, 199, 108, 111])
  Labels (first 5): tensor([0, 0, 0, 0, 0])


### Finding the weight (for weighted cross entropy)

is there different ways calculating weitghs?

In [12]:
labels =  labels = pd.read_pickle(label_train_path)
total = len(labels)
count_bonafide = class_count = sum(1 for value in labels if value == LABEL_BONAFIDE)
count_spoof =  total - count_bonafide
weight_bonafide = total / (count_bonafide * 2)
weight_spoof = total / (count_spoof * 2)

### LSTM classifier

In [None]:
class SimplePitchLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):

        super().__init__()
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        
        # 1. LSTM Layer
        self.lstm = nn.LSTM(input_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0,
                           batch_first=True) # Input/output tensors are (batch, seq, feature)
        
        # 2. Fully Connected Layer (Linear Layer)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        # 3. Dropout Layer (for regularization on the output of LSTM)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, sequences, sequence_lengths):

        # 1. Pack sequence
        packed_input = rnn_utils.pack_padded_sequence(sequences, sequence_lengths.cpu(), batch_first=True, enforce_sorted=False)
        
        # 2. Pass packed sequence through LSTM
        packed_output, (hidden, cell) = self.lstm(packed_input)

        # 3. Concatenate the final forward and backward hidden states (if bidirectional)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
            
        # 4. Pass the processed hidden state through the fully connected layer
        output = self.fc(hidden)
        
        return output

### Initiate the model

In [14]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

CUDA available: True
CUDA device count: 4


In [15]:
class_weights = torch.tensor([weight_bonafide, weight_spoof], dtype=torch.float32).to(DEVICE)

In [16]:
model = SimplePitchLSTMClassifier(input_dim=60, hidden_dim=128, output_dim=2, n_layers=1,
                 bidirectional=True, dropout=0.0).to(DEVICE)
criterion = torch.nn.CrossEntropyLoss(reduction='mean', weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### Evaluation

In [None]:
def evaluate_pitch_classifier(data_loader, model, criterion):

    model.eval()  # Set the model to evaluation mode (disables dropout, etc.)
    
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    scores_bonafide = []
    scores_spoof = []

    with torch.no_grad():  # Disable gradient calculations during evaluation
        for batch_sequences, batch_lengths, batch_labels in data_loader:
            # Move data to the same device as the model
            batch_sequences = batch_sequences.to(DEVICE)
            batch_labels = batch_labels.to(DEVICE)

            # Forward pass: Get model outputs (logits)
            logits = model(batch_sequences, batch_lengths)
            
            # Calculate loss for the current batch
            loss = criterion(logits, batch_labels)
            total_loss += loss.item() * batch_sequences.size(0) # Accumulate loss, weighted by batch size

            # for EER
            probabilities = torch.softmax(logits, dim=1)

            for i in range(len(batch_labels)):
                current_label = batch_labels[i]
                current_score = probabilities[i]

                if current_label == LABEL_BONAFIDE:
                    scores_bonafide.append(current_score[LABEL_BONAFIDE].cpu())     ## numpy is cpu only, need to move tensor from gpu
                elif current_label == LABEL_SPOOF:
                    scores_spoof.append(current_score[LABEL_BONAFIDE].cpu())
            
            total_samples += batch_labels.size(0) # Count number of samples in this batch

    average_loss = total_loss / total_samples if total_samples > 0 else 0.0

    scores_bonafide_np = np.array(scores_bonafide)    
    scores_spoof_np = np.array(scores_spoof)
    eer, threshold = em.compute_eer(scores_bonafide_np, scores_spoof_np)
    
    return average_loss, eer, threshold

### The training loop

In [None]:
def train_model(model, train_dataloader, dev_dataloader, criterion, optimizer, num_epochs, device):

    print(f"Training started on device: {device}")
    model.to(device) # Ensure model is on the correct device

    # Initial metric dictionary for the progress bar
    metric_dict = {'train_loss': 'N/A', 'val_loss': 'N/A', 'val_eer': 'N/A', 'val_threshold': 'N/A'}

    # Evaluate on validation set first to get a baseline
    print("Evaluating on validation set before training...")
    model.eval() # Set model to evaluation mode
    val_loss_initial, val_eer_initial, threshold_initial = evaluate_pitch_classifier(dev_dataloader, model, criterion)
    metric_dict.update({'val_loss': f'{val_loss_initial:.3f}', 'val_eer': f'{val_eer_initial*100:.2f}%', 'val_threshold': f'{threshold_initial*100:.2f}%'})
    print(f"Initial Validation - Loss: {val_loss_initial:.4f}, EER: {val_eer_initial*100:.2f}%, Threshold: {threshold_initial*100:.2f}%")

    # Progress bar setup
    # Total steps: number of epochs * number of batches per epoch
    total_steps = num_epochs * len(train_dataloader)
    pbar = tqdm(total=total_steps, initial=0, postfix=metric_dict, unit="batch")

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode (enables dropout, etc.)
        pbar.set_description(f"Epoch {epoch + 1}/{num_epochs}")
        
        running_train_loss = 0.0
        num_train_batches = 0

        for batch_sequences, batch_lengths, batch_labels in train_dataloader:
            # Move data to the specified device
            batch_sequences = batch_sequences.to(device)
            # batch_lengths are used by pack_padded_sequence which expects them on CPU
            batch_labels = batch_labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass: Get model outputs (logits)
            logits = model(batch_sequences, batch_lengths)
            
            # Calculate loss
            loss = criterion(logits, batch_labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Update statistics for progress bar and logging
            running_train_loss += loss.item()
            num_train_batches += 1
            
            pbar.update(1) # Increment progress bar by one batch
            metric_dict.update({'train_loss': f'{loss.item():.3f}'}) # Current batch loss
            pbar.set_postfix(metric_dict)
        
        # Calculate average training loss for the epoch
        avg_epoch_train_loss = running_train_loss / num_train_batches if num_train_batches > 0 else 0.0
        metric_dict.update({'train_loss': f'{avg_epoch_train_loss:.3f}'}) # Average epoch loss
        
        # Evaluate on validation set after each epoch
        avg_val_loss, val_eer, val_threshold = evaluate_pitch_classifier(dev_dataloader, model, criterion)
        
        metric_dict.update({'val_loss': f'{avg_val_loss:.3f}', 'val_eer': f'{val_eer*100:.2f}%', 'val_threshold': f'{val_threshold*100:.2f}%'})
        pbar.set_postfix(metric_dict) # Update with latest validation metrics
        
        # Optional: Print epoch summary
        print(f"\nEpoch {epoch+1} Summary: Avg Train Loss: {avg_epoch_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, EER: {val_eer*100:.2f}%, Threshold: {val_threshold*100:.2f}%")

    pbar.close()
    print("Training finished.")

In [19]:
NUM_EPOCHS = 20
train_model(model, train_dataloader, dev_dataloader, criterion, optimizer, NUM_EPOCHS, DEVICE)

Training started on device: cuda
Evaluating on validation set before training...
Initial Validation - Loss: 0.5367, EER: 53.22%, Threshold: 40.54%


  0%|          | 0/15880 [00:00<?, ?batch/s, train_loss=N/A, val_eer=53.22%, val_loss=0.537, val_threshold=40.…


Epoch 1 Summary: Avg Train Loss: 0.0619, Val Loss: 0.0617, EER: 19.31%, Threshold: 5.00%

Epoch 2 Summary: Avg Train Loss: 0.0589, Val Loss: 0.0556, EER: 18.17%, Threshold: 1.76%

Epoch 3 Summary: Avg Train Loss: 0.0542, Val Loss: 0.0605, EER: 18.52%, Threshold: 0.42%

Epoch 4 Summary: Avg Train Loss: 0.0499, Val Loss: 0.0476, EER: 12.20%, Threshold: 0.59%

Epoch 5 Summary: Avg Train Loss: 0.0437, Val Loss: 0.0450, EER: 8.83%, Threshold: 0.36%

Epoch 6 Summary: Avg Train Loss: 0.0375, Val Loss: 0.0480, EER: 11.82%, Threshold: 0.11%

Epoch 7 Summary: Avg Train Loss: 0.0350, Val Loss: 0.0463, EER: 16.56%, Threshold: 3.01%

Epoch 8 Summary: Avg Train Loss: 0.0345, Val Loss: 0.0352, EER: 12.01%, Threshold: 0.16%

Epoch 9 Summary: Avg Train Loss: 0.0349, Val Loss: 0.0400, EER: 10.47%, Threshold: 1.53%

Epoch 10 Summary: Avg Train Loss: 0.0374, Val Loss: 0.0712, EER: 18.60%, Threshold: 0.03%

Epoch 11 Summary: Avg Train Loss: 0.0340, Val Loss: 0.0417, EER: 13.08%, Threshold: 0.17%

Epoch 12