In [14]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import eval_metrics as em

### Padding and Data Loader

In [None]:
PITCH_COLUMN = 'PITCH'
HNR_COLUMN = 'HNR'
AUDIO_ID_COLUMN = 'AUDIO_ID'
NAN_REPLACEMENT_VALUE = 0.0  
PADDING_VALUE = 0.0         
BATCH_SIZE = 32
LABEL_BONAFIDE = 1
LABEL_SPOOF = 0


# --- Load Labels from Text File ---
def load_labels_from_file(label_file_path):
    """
    Returns:
        dict: A dictionary mapping AUDIO_ID (str) to numerical label (int).
              e.g., {'LA_T_9351820': 1, 'LA_T_1004644': 0}
    """
    labels_map = {}
    try:
        with open(label_file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                #if len(parts) >= 3: # Speaker_ID AUDIO_ID ... Label
                audio_id = parts[1]
                label_str = parts[-1].lower() # Get the last part as label
                
                if label_str == 'bonafide':
                    labels_map[audio_id] = LABEL_BONAFIDE
                elif label_str == 'spoof':
                    labels_map[audio_id] = LABEL_SPOOF
                # Else: unknown label string, ignore or log
            # Else: line format incorrect, ignore or log
    except FileNotFoundError:
        print(f"Error: Label file not found at {label_file_path}")
    return labels_map

# --- Process pitch sequences and match them with labels ---
class PitchDataset(Dataset):
    def __init__(self, dataframe, pitch_col, audio_id_col, label_file_path, nan_replacement=NAN_REPLACEMENT_VALUE):
       
        #self.nan_replacement = nan_replacement
        
        labels_map = load_labels_from_file(label_file_path)
        
        self.processed_pitches = []
        self.labels = []
        
        print(f"Attempting to match {len(dataframe)} entries from DataFrame with labels from '{label_file_path}'...")
        found_count = 0
        # Iterate through the DataFrame and match with loaded labels
        for index, row in dataframe.iterrows():    
            audio_id = row[audio_id_col]
            if audio_id in labels_map:
                pitch_sequence_raw = row[pitch_col]
                processed_seq = np.nan_to_num(pitch_sequence_raw, nan=nan_replacement)
                
                self.processed_pitches.append(torch.tensor(processed_seq, dtype=torch.float32))
                self.labels.append(labels_map[audio_id])
                found_count += 1
        
        if not self.processed_pitches:
            raise ValueError("No samples were successfully matched and processed. Check your AUDIO_IDs and label file.")

        self.labels = torch.tensor(self.labels, dtype=torch.long) # Assuming labels are integers for classification
        print(f"Successfully matched and processed {found_count} samples out of {len(dataframe)} DataFrame entries.")


    def __len__(self):
        """Returns the total number of matched samples in the dataset."""
        return len(self.processed_pitches)

    def __getitem__(self, idx):
        """
        Returns one sample from the dataset: a preprocessed pitch sequence and its label.
        """
        pitch_sequence = self.processed_pitches[idx]
        label = self.labels[idx]
        return pitch_sequence, label

# --- Custom Collate Function for Dynamic Padding  ---
def collate_fn(batch, padding_value=PADDING_VALUE):
    """
    Pads sequences within a batch to the same length.
    """
    sequences = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
    labels = torch.stack(labels)
    if padded_sequences.ndim == 2:
        padded_sequences = padded_sequences.unsqueeze(2)
    return padded_sequences, lengths, labels


>NOTE: ^Many PyTorch layers, especially those dealing with sequences like LSTMs, GRUs, or 1D Convolutional layers, expect input in a 3D format.If your pad_sequence results in a (batch_size, sequence_length) tensor because each time step in your sequence is represented by a single number, unsqueeze(2) reshapes it to (batch_size, sequence_length, 1). This '1' signifies that there is one feature (or channel) per time step in the sequence. This makes the tensor compatible with layers that expect this 3D input.

In [None]:
### For remote server
train_features_path = '/home/users1/liqe/TeamLab/prosody_features_train.parquet'
dev_features_path = '/home/users1/liqe/TeamLab/prosody_features_dev.parquet'

df_train = pd.read_parquet(train_features_path, engine='pyarrow')
df_dev = pd.read_parquet(dev_features_path, engine='pyarrow')

### For local
# train_features_path = r'C:\Users\ivyap\Desktop\25SU\TEAMLAB\prosody_features\prosody_features_train.parquet'
# dev_features_path = r'C:\Users\ivyap\Desktop\25SU\TEAMLAB\prosody_features\prosody_features_dev.parquet'

# df_train = pd.read_pickle(train_features_path)
# df_dev = pd.read_pickle(dev_features_path)

In [None]:
### for remote
labels_train = '/home/users1/liqe/TeamLab/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt'
labels_dev = '/home/users1/liqe/TeamLab/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt'

### For local 
# labels_train = r'C:\Users\ivyap\Desktop\25SU\TEAMLAB\LA\ASVspoof2019_LA_cm_protocols\ASVspoof2019.LA.cm.train.trn.txt'
# labels_dev = r'C:\Users\ivyap\Desktop\25SU\TEAMLAB\LA\ASVspoof2019_LA_cm_protocols\ASVspoof2019.LA.cm.dev.trl.txt'


# print(df.loc[0, PITCH_COLUMN])
# print(type(df.loc[0, PITCH_COLUMN]))

In [18]:
pitch_dataset_train = PitchDataset(dataframe=df_train,
                                     pitch_col=PITCH_COLUMN,
                                     audio_id_col=AUDIO_ID_COLUMN,
                                     label_file_path=labels_train,
                                     nan_replacement=NAN_REPLACEMENT_VALUE)

train_dataloader = DataLoader(
    pitch_dataset_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

pitch_dataset_dev = PitchDataset(dataframe=df_dev,
                                     pitch_col=PITCH_COLUMN,
                                     audio_id_col=AUDIO_ID_COLUMN,
                                     label_file_path=labels_dev,
                                     nan_replacement=NAN_REPLACEMENT_VALUE)

dev_dataloader = DataLoader(
    pitch_dataset_dev, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

### For inspection
# for i, batch_data in enumerate(train_dataloader):
#     # batch_data is a tuple: (padded_sequences, lengths, labels)
#     batch_sequences, batch_lengths, batch_labels = batch_data
#     print(f"\n--- Batch {i+1} ---")
#     print(f"  Padded Sequences Shape: {batch_sequences.shape}")
#     print(f"  Original Lengths (first 5): {batch_lengths[:5]}")
#     print(f"  Labels (first 5): {batch_labels[:5]}")
    

#     if i == 0: # Break after the first batch for inspection
#         break


Attempting to match 25379 entries from DataFrame with labels from '/home/users1/liqe/TeamLab/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt'...
Successfully matched and processed 25379 samples out of 25379 DataFrame entries.
Attempting to match 24986 entries from DataFrame with labels from '/home/users1/liqe/TeamLab/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt'...
Successfully matched and processed 24844 samples out of 24986 DataFrame entries.


### Finding the weight (for weighted cross entropy)

is there different ways calculating weitghs?

In [19]:
labels = load_labels_from_file(labels_train)
total = len(labels)
count_bonafide = class_count = sum(1 for value in labels.values() if value == LABEL_BONAFIDE)
count_spoof =  total - count_bonafide
weight_bonafide = total / (count_bonafide * 2)
weight_spoof = total / (count_spoof * 2)

### LSTM classifier

In [20]:
class SimplePitchLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        """
        Initialize the model.

        Args:
            input_dim (int): The number of features for each time step in the input sequence
                             (e.g., 1 if pitch is a single value like MIDI note).
            hidden_dim (int): The dimensionality of the hidden state in the LSTM.
            output_dim (int): The dimensionality of the output (e.g., 2 for binary classification).
            n_layers (int): The number of LSTM layers.
            bidirectional (bool): If True, becomes a bidirectional LSTM.
            dropout (float): The dropout probability.
        """
        super().__init__()
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        
        # 1. LSTM Layer
        self.lstm = nn.LSTM(input_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0,
                           batch_first=True) # Input/output tensors are (batch, seq, feature)
        
        # 2. Fully Connected Layer (Linear Layer)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        # 3. Dropout Layer (for regularization on the output of LSTM)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, sequences, sequence_lengths):
        """
        Defines the forward pass of the model.

        Args:
            sequences (torch.Tensor): A tensor of pitch sequences.
                                      Shape: (batch_size, seq_len, input_dim)
                                      If input_dim is 1, and your data is (batch_size, seq_len),
                                      you'll need to unsqueeze it: sequences.unsqueeze(2).
            sequence_lengths (torch.Tensor): A tensor of the actual lengths of each sequence in the batch.
                                             Shape: (batch_size)
        
        Returns:
            torch.Tensor: The model's predictions (logits). Shape: (batch_size, output_dim)
        """
        
        # sequences shape: (batch_size, seq_len, input_dim)
        # Example: If input_dim is 1 (e.g. MIDI note numbers), and your input `sequences`
        # has shape (batch_size, seq_len), you should do:
        # if self.input_dim == 1 and sequences.ndim == 2:
        #     sequences = sequences.unsqueeze(2)
        # For this example, we assume `sequences` is already (batch_size, seq_len, input_dim)

        # 1. Pack sequence
        # This is important for efficiency with variable length sequences.
        # `enforce_sorted=False` as lengths might not be pre-sorted.
        # `sequence_lengths` needs to be on CPU for `pack_padded_sequence`.
        packed_input = rnn_utils.pack_padded_sequence(sequences, sequence_lengths.cpu(), batch_first=True, enforce_sorted=False)
        
        # 2. Pass packed sequence through LSTM
        # packed_output: contains all hidden states for each time step from the last LSTM layer.
        # hidden: final hidden state for each element in the batch.
        # cell: final cell state for each element in the batch.
        packed_output, (hidden, cell) = self.lstm(packed_input)
        
        # `hidden` shape: (num_layers * num_directions, batch_size, hidden_dim)
        # `cell` shape: (num_layers * num_directions, batch_size, hidden_dim)

        # 3. Concatenate the final forward and backward hidden states (if bidirectional)
        # We take the hidden state from the last LSTM layer.
        if self.lstm.bidirectional:
            # hidden[-2,:,:] is the last forward hidden state
            # hidden[-1,:,:] is the last backward hidden state
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            # hidden[-1,:,:] is the last hidden state (from the single direction)
            hidden = self.dropout(hidden[-1,:,:])
            
        # `hidden` shape after processing: (batch_size, hidden_dim * num_directions)
        
        # 4. Pass the processed hidden state through the fully connected layer
        output = self.fc(hidden)
        # `output` shape: (batch_size, output_dim)
        
        return output

### Initiate the model

In [21]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

CUDA available: True
CUDA device count: 4


In [22]:
class_weights = torch.tensor([weight_bonafide, weight_spoof], dtype=torch.float32).to(DEVICE)

In [27]:
model = SimplePitchLSTMClassifier(input_dim=1, hidden_dim=128, output_dim=2, n_layers=1,
                 bidirectional=True, dropout=0.0).to(DEVICE)
criterion = torch.nn.CrossEntropyLoss(reduction='mean', weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### Evaluation

In [24]:
def evaluate_pitch_classifier(data_loader, model, criterion):
    """
    Evaluates the SimplePitchLSTMClassifier.

    Args:
        data_loader (DataLoader): DataLoader for the evaluation dataset.
        model (nn.Module): The trained SimplePitchLSTMClassifier model.
        criterion (nn.Module): The loss function (e.g., CrossEntropyLoss).

    Returns:
        tuple: (average_loss, accuracy)
    """
    model.eval()  # Set the model to evaluation mode (disables dropout, etc.)
    
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    scores_bonafide = []
    scores_spoof = []

    with torch.no_grad():  # Disable gradient calculations during evaluation
        for batch_sequences, batch_lengths, batch_labels in data_loader:
            # Move data to the same device as the model
            batch_sequences = batch_sequences.to(DEVICE)
            # batch_lengths are used by pack_padded_sequence which expects them on CPU,
            # but the labels should be on the same device as model outputs for loss calculation.
            batch_labels = batch_labels.to(DEVICE)

            # Forward pass: Get model outputs (logits)
            # The model expects sequences and their original lengths
            logits = model(batch_sequences, batch_lengths)
            
            # Calculate loss for the current batch
            loss = criterion(logits, batch_labels)
            total_loss += loss.item() * batch_sequences.size(0) # Accumulate loss, weighted by batch size

            # for EER
            probabilities = torch.softmax(logits, dim=1)
            
            
            for i in range(len(batch_labels)):
                current_label = batch_labels[i]
                current_score = probabilities[i]

                if current_label == LABEL_BONAFIDE:
                    scores_bonafide.append(current_score[LABEL_BONAFIDE].cpu())     ## numpy is cpu only, need to move tensor from gpu
                elif current_label == LABEL_SPOOF:
                    scores_spoof.append(current_score[LABEL_BONAFIDE].cpu())
            

            # Get predicted classes by finding the index of the max logit
            # Logits shape: (batch_size, num_classes)
            # Predicted_classes shape: (batch_size)
            # predicted_classes = torch.argmax(logits, dim=1)
            
            # Compare predictions with true labels
            # correct_predictions += (predicted_classes == batch_labels).sum().item()
            total_samples += batch_labels.size(0) # Count number of samples in this batch

    average_loss = total_loss / total_samples if total_samples > 0 else 0.0
    #accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0

    scores_bonafide_np = np.array(scores_bonafide)    
    scores_spoof_np = np.array(scores_spoof)
    eer, threshold = em.compute_eer(scores_bonafide_np, scores_spoof_np)
    
    return average_loss, eer, threshold

### The training loop

In [25]:
def train_model(model, train_dataloader, dev_dataloader, criterion, optimizer, num_epochs, device):
    """
    Trains the SimplePitchLSTMClassifier model.

    Args:
        model (nn.Module): The model to train.
        train_dataloader (DataLoader): DataLoader for the training set.
        dev_dataloader (DataLoader): DataLoader for the validation set.
        criterion (nn.Module): The loss function.
        optimizer (torch.optim.Optimizer): The optimizer.
        num_epochs (int): Number of epochs to train for.
        device (torch.device): The device to train on (e.g., 'cuda' or 'cpu').
    """
    
    print(f"Training started on device: {device}")
    model.to(device) # Ensure model is on the correct device

    # Initial metric dictionary for the progress bar
    metric_dict = {'train_loss': 'N/A', 'val_loss': 'N/A', 'val_eer': 'N/A', 'val_threshold': 'N/A'}

    # Evaluate on validation set first to get a baseline
    print("Evaluating on validation set before training...")
    model.eval() # Set model to evaluation mode
    val_loss_initial, val_eer_initial, threshold_initial = evaluate_pitch_classifier(dev_dataloader, model, criterion)
    metric_dict.update({'val_loss': f'{val_loss_initial:.3f}', 'val_eer': f'{val_eer_initial*100:.2f}%', 'val_threshold': f'{threshold_initial*100:.2f}%'})
    print(f"Initial Validation - Loss: {val_loss_initial:.4f}, EER: {val_eer_initial*100:.2f}%, Threshold: {threshold_initial*100:.2f}%")

    # Progress bar setup
    # Total steps: number of epochs * number of batches per epoch
    total_steps = num_epochs * len(train_dataloader)
    pbar = tqdm(total=total_steps, initial=0, postfix=metric_dict, unit="batch")

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode (enables dropout, etc.)
        pbar.set_description(f"Epoch {epoch + 1}/{num_epochs}")
        
        running_train_loss = 0.0
        num_train_batches = 0

        for batch_sequences, batch_lengths, batch_labels in train_dataloader:
            # Move data to the specified device
            batch_sequences = batch_sequences.to(device)
            # batch_lengths are used by pack_padded_sequence which expects them on CPU
            batch_labels = batch_labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass: Get model outputs (logits)
            # The model expects sequences and their original lengths
            logits = model(batch_sequences, batch_lengths)
            
            # Calculate loss
            # Logits shape: (batch_size, num_classes)
            # batch_labels shape: (batch_size)
            loss = criterion(logits, batch_labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Update statistics for progress bar and logging
            running_train_loss += loss.item()
            num_train_batches += 1
            
            pbar.update(1) # Increment progress bar by one batch
            metric_dict.update({'train_loss': f'{loss.item():.3f}'}) # Current batch loss
            pbar.set_postfix(metric_dict)
        
        # Calculate average training loss for the epoch
        avg_epoch_train_loss = running_train_loss / num_train_batches if num_train_batches > 0 else 0.0
        metric_dict.update({'train_loss': f'{avg_epoch_train_loss:.3f}'}) # Average epoch loss
        
        # Evaluate on validation set after each epoch
        model.eval() # Set model to evaluation mode
        avg_val_loss, val_eer, val_threshold = evaluate_pitch_classifier(dev_dataloader, model, criterion)
        
        metric_dict.update({'val_loss': f'{avg_val_loss:.3f}', 'val_eer': f'{val_eer*100:.2f}%', 'val_threshold': f'{val_threshold*100:.2f}%'})
        pbar.set_postfix(metric_dict) # Update with latest validation metrics
        
        # Optional: Print epoch summary
        print(f"\nEpoch {epoch+1} Summary: Avg Train Loss: {avg_epoch_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, EER: {val_eer*100:.2f}%, Threshold: {val_threshold*100:.2f}%")

    pbar.close()
    print("Training finished.")

In [26]:
NUM_EPOCHS = 20
train_model(model, train_dataloader, dev_dataloader, criterion, optimizer, NUM_EPOCHS, DEVICE)

Training started on device: cuda
Evaluating on validation set before training...
Initial Validation - Loss: 0.6792, EER: 83.75%, Threshold: 49.19%


  0%|          | 0/15880 [00:00<?, ?batch/s, train_loss=N/A, val_eer=83.75%, val_loss=0.679, val_threshold=49.…


Epoch 1 Summary: Avg Train Loss: 0.0660, Val Loss: 0.0620, EER: 34.57%, Threshold: 2.30%

Epoch 2 Summary: Avg Train Loss: 0.0648, Val Loss: 0.0625, EER: 31.75%, Threshold: 2.04%

Epoch 3 Summary: Avg Train Loss: 0.0661, Val Loss: 0.0644, EER: 29.74%, Threshold: 2.55%

Epoch 4 Summary: Avg Train Loss: 0.0650, Val Loss: 0.0637, EER: 28.88%, Threshold: 2.30%

Epoch 5 Summary: Avg Train Loss: 0.0643, Val Loss: 0.0640, EER: 29.08%, Threshold: 3.20%

Epoch 6 Summary: Avg Train Loss: 0.0644, Val Loss: 0.0653, EER: 30.77%, Threshold: 1.19%

Epoch 7 Summary: Avg Train Loss: 0.0629, Val Loss: 0.0610, EER: 31.52%, Threshold: 2.88%

Epoch 8 Summary: Avg Train Loss: 0.0628, Val Loss: 0.0623, EER: 30.77%, Threshold: 3.79%

Epoch 9 Summary: Avg Train Loss: 0.0627, Val Loss: 0.0681, EER: 35.67%, Threshold: 1.01%

Epoch 10 Summary: Avg Train Loss: 0.0634, Val Loss: 0.0612, EER: 31.16%, Threshold: 2.54%

Epoch 11 Summary: Avg Train Loss: 0.0629, Val Loss: 0.0616, EER: 36.65%, Threshold: 2.20%

Epoch 1