In [3]:
import copy
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split

In [4]:
def audio_to_mel_spectrogram(audio_path, sr=22050, n_mels=128, hop_length=512):
    """Load an audio file and return its mel spectrogram converted to decibels.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``; the rate it returns
            is the one used for the mel computation.
        n_mels: Number of mel bands passed to ``librosa.feature.melspectrogram``.
        hop_length: Hop length passed to ``librosa.feature.melspectrogram``.

    Returns:
        The output of ``librosa.power_to_db`` applied to the mel spectrogram
        with ``ref=np.max``.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=sr)
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=n_mels,
        hop_length=hop_length,
    )
    # ref=np.max makes the dB values relative to the spectrogram's own maximum.
    return librosa.power_to_db(mel_power, ref=np.max)

def normalize_mel_spectrogram(mel_spectrogram):
    """Min-max scale a spectrogram into the range [0, 1].

    Args:
        mel_spectrogram: Array-like object exposing ``.min()`` / ``.max()``
            and supporting element-wise arithmetic (e.g. a NumPy array).

    Returns:
        ``(mel_spectrogram - min) / (max - min)``. When every value is
        identical (``max == min``) the result is all zeros instead of the
        NaNs a 0/0 division would produce.
    """
    lowest = mel_spectrogram.min()
    value_range = mel_spectrogram.max() - lowest
    if value_range == 0:
        # Constant input: the numerator is already all zeros, so dividing by 1
        # yields zeros while keeping the same result dtype as the normal path.
        value_range = 1
    return (mel_spectrogram - lowest) / value_range

In [5]:
class SpectrogramImageDataset(Dataset):
    """Dataset of spectrogram PNG images labelled from their filenames.

    Every immediate subdirectory of ``root_dir`` is scanned for ``.png`` files
    (extension matched case-insensitively). The label is derived from the
    lower-cased filename:

    * contains ``l_c_ach_``                      -> 0 (control)
    * contains ``l_ach_`` but not ``l_c_ach_``   -> 1 (hypernasality)

    Files starting with ``._`` (AppleDouble), files containing
    ``:Zone.Identifier`` and non-PNG files are ignored silently; PNG files that
    match neither pattern are skipped with a printed notice.

    Directory listings are sorted so that sample indices are the same on every
    run and on every filesystem (``os.listdir`` returns arbitrary order).

    Args:
        root_dir: Directory whose subfolders contain the images.
        transform: Optional callable applied to each RGB ``PIL.Image`` in
            ``__getitem__``.

    Raises:
        FileNotFoundError: ``root_dir`` does not exist.
        NotADirectoryError: ``root_dir`` exists but is not a directory.
        ValueError: No PNG file matching the naming patterns was found.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []  # 0 for control, 1 for positive

        print("--- Starting dataset initialization ---")
        print(f"Root directory provided: '{root_dir}'")

        if not os.path.exists(root_dir):
            raise FileNotFoundError(f"Error: Root directory '{root_dir}' not found. Please check your path.")
        if not os.path.isdir(root_dir):
            raise NotADirectoryError(f"Error: Path '{root_dir}' is not a directory.")

        # Sorted listings keep index -> sample mapping stable across runs.
        for vowel_folder in sorted(os.listdir(root_dir)):
            vowel_folder_path = os.path.join(root_dir, vowel_folder)
            if not os.path.isdir(vowel_folder_path):
                continue  # Skip non-directory items

            for image_name in sorted(os.listdir(vowel_folder_path)):
                # Skip AppleDouble files created by macOS.
                if image_name.startswith('._'):
                    continue
                # Skip Windows alternate-data-stream marker files.
                if ':Zone.Identifier' in image_name:
                    continue

                lower_image_name = image_name.lower()
                # Extension check uses the lower-cased name, like the label
                # patterns below, so '.PNG' files are not dropped.
                if not lower_image_name.endswith('.png'):
                    continue

                label = self._label_from_filename(lower_image_name)
                if label is None:
                    print(f"    SKIPPING: '{image_name}' - Did not match any expected pattern (l_ach_ or l_c_ach_).")
                    continue

                self.image_paths.append(os.path.join(vowel_folder_path, image_name))
                self.labels.append(label)

        print("\n--- Finished scanning directories ---")
        if not self.image_paths:
            raise ValueError(f"No valid PNG images found matching expected patterns in '{root_dir}' or its subfolders. Please double-check your directory path and filename conventions.")

        print(f"Total samples successfully loaded into dataset: {len(self.image_paths)}")

    @staticmethod
    def _label_from_filename(lower_image_name):
        """Return 0 (control), 1 (hypernasality) or None for a lower-cased filename."""
        # The control pattern takes precedence when both patterns appear.
        if 'l_c_ach_' in lower_image_name:
            return 0
        if 'l_ach_' in lower_image_name:
            return 1
        return None

    def __len__(self):
        """Return the number of labelled images found during the scan."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is opened, converted to RGB and, if a transform was given,
        passed through it. The label is the integer 0 or 1.
        """
        img_path = self.image_paths[idx]
        label = self.labels[idx]

        # convert() returns a new in-memory image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_path) as source_image:
            image = source_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

def _resize_tensor_normalize_pipeline():
    """Build a fresh resize -> tensor -> per-channel normalize pipeline.

    Images are resized to 224x224, converted to tensors and normalized with the
    fixed per-channel mean/std below (presumably the ImageNet statistics that
    go with the IMAGENET1K_V1 weights used by the model - confirm).
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)


# Training currently uses exactly the same steps as validation (no augmentation);
# each name still gets its own pipeline object.
train_transforms = _resize_tensor_normalize_pipeline()
val_transforms = _resize_tensor_normalize_pipeline()

In [6]:
class HypernasalityDetectorResNet18(nn.Module):
    """ResNet-18 classifier whose final layer is resized to ``num_classes`` outputs.

    The backbone is created with torchvision's ``IMAGENET1K_V1`` weights and
    its ``fc`` layer is swapped for a new ``nn.Linear`` with the same input
    width and ``num_classes`` outputs (2 by default: control vs. hypernasality).
    """

    def __init__(self, num_classes=2):
        super().__init__()

        # Start from the pretrained network...
        backbone = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        # ...and replace its classification head with one sized for this task.
        head_input_width = backbone.fc.in_features
        backbone.fc = nn.Linear(head_input_width, num_classes)

        self.resnet18 = backbone

    def forward(self, x):
        """Return the backbone's output for the input batch ``x``."""
        logits = self.resnet18(x)
        return logits
    

# Location of the spectrogram folders; adjust to your own file structure.
dataset_root_dir = '../Data'

# Create the dataset instance (this object keeps the validation transforms).
hypernasality_dataset = SpectrogramImageDataset(root_dir=dataset_root_dir, transform=val_transforms)

# This will print the final count, which should match the one from inside the class
print(f"Final Total samples loaded (from dataset object): {len(hypernasality_dataset)}")

# 80/20 train/validation split.
train_size = int(0.8 * len(hypernasality_dataset))
val_size = len(hypernasality_dataset) - train_size

# Seed the split so the same samples land in train/validation on every run.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
train_split, val_dataset = random_split(
    hypernasality_dataset, [train_size, val_size], generator=split_generator
)

# Both subsets returned by random_split wrap the SAME dataset object, so
# assigning `.dataset.transform` through either one changes both. Give the
# training subset its own shallow copy (sharing the path/label lists) so each
# split keeps its own transform.
train_source_dataset = copy.copy(hypernasality_dataset)
train_source_dataset.transform = train_transforms
train_dataset = torch.utils.data.Subset(train_source_dataset, train_split.indices)
# val_dataset is still backed by hypernasality_dataset, i.e. val_transforms.

# Create DataLoaders
batch_size = 32 # Adjust based on your GPU memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4) # num_workers can speed up data loading
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

print(f"\n--- Dataset Split Information ---")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Train batches: {len(train_dataloader)}")
print(f"Validation batches: {len(val_dataloader)}")


--- Starting dataset initialization ---
Root directory provided: '../Data'

--- Finished scanning directories ---
Total samples successfully loaded into dataset: 72
Final Total samples loaded (from dataset object): 72

--- Dataset Split Information ---
Training samples: 57
Validation samples: 15
Train batches: 2
Validation batches: 1


In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-class classifier, moved onto the selected device.
model = HypernasalityDetectorResNet18(num_classes=2)
model.to(device)

# Cross-entropy over the two output logits.
criterion = nn.CrossEntropyLoss()

# Adam over every parameter that still requires gradients.
optimizer = optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=0.001,
    weight_decay=1e-4,
)

In [None]:
def run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one full pass over ``dataloader`` and return ``(mean_loss, accuracy_percent)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in evaluation mode and
    gradients are disabled.

    The loss is averaged over the samples actually seen, which equals the
    dataset length as long as the loader does not drop batches.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    running_loss = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(is_training):
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()  # Clear gradients left from the previous batch

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            samples_in_batch = labels.size(0)
            # criterion returns the batch mean, so weight it by the batch size.
            running_loss += loss.item() * samples_in_batch
            predicted = outputs.argmax(dim=1)
            total += samples_in_batch
            correct += (predicted == labels).sum().item()

    return running_loss / total, 100 * correct / total


num_epochs = 10
# Start below any achievable accuracy so the first epoch always writes a
# checkpoint; with 0.0 and a strict '>' a run stuck at 0% would never save one.
best_val_accuracy = float("-inf")
best_epoch = -1
patience = 5 # Number of epochs to wait for improvement before stopping
epochs_no_improve = 0

print(f"\n--- Starting Training ---")
for epoch in range(num_epochs):
    # Training phase (updates weights), then validation phase (no gradients).
    epoch_train_loss, epoch_train_accuracy = run_epoch(
        model, train_dataloader, criterion, device, optimizer=optimizer
    )
    epoch_val_loss, epoch_val_accuracy = run_epoch(
        model, val_dataloader, criterion, device
    )

    print(f"Epoch {epoch+1}/{num_epochs}: "
        f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.2f}% | "
        f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.2f}%")

    # --- Early Stopping Logic ---
    if epoch_val_accuracy > best_val_accuracy:
        best_val_accuracy = epoch_val_accuracy
        best_epoch = epoch + 1
        epochs_no_improve = 0
        # Save the model with the best validation accuracy
        torch.save(model.state_dict(), 'best_hypernasality_resnet18.pth')
        print(f"  --> Saved best model at Epoch {best_epoch} with Val Acc: {best_val_accuracy:.2f}%")
    else:
        epochs_no_improve += 1
        # '>=' so the stop still fires if the counter ever overshoots patience.
        if epochs_no_improve >= patience:
            print(f"  --> Early stopping triggered! No improvement for {patience} epochs.")
            print(f"  --> Best Validation Accuracy: {best_val_accuracy:.2f}% at Epoch {best_epoch}")
            break # Exit the training loop

print(f"\n--- Training Complete ---")

# No further evaluation is run here; the messages below are informational only.
# The best weights were already written to 'best_hypernasality_resnet18.pth' above.

print(f"\nModel training and evaluation setup complete. You can run this script to train your model.")
print(f"Consider saving your trained model's weights using: torch.save(model.state_dict(), 'hypernasality_resnet18.pth')")


--- Starting Training ---
Epoch 1/10: Train Loss: 0.7510, Train Acc: 54.39% | Val Loss: 1.6234, Val Acc: 40.00%
  --> Saved best model at Epoch 1 with Val Acc: 40.00%
Epoch 2/10: Train Loss: 0.6284, Train Acc: 70.18% | Val Loss: 3.3079, Val Acc: 60.00%
  --> Saved best model at Epoch 2 with Val Acc: 60.00%
Epoch 3/10: Train Loss: 0.1936, Train Acc: 92.98% | Val Loss: 4.4135, Val Acc: 60.00%
Epoch 4/10: Train Loss: 0.2990, Train Acc: 91.23% | Val Loss: 5.4081, Val Acc: 60.00%
Epoch 5/10: Train Loss: 0.1129, Train Acc: 91.23% | Val Loss: 7.6204, Val Acc: 60.00%
Epoch 6/10: Train Loss: 0.0308, Train Acc: 100.00% | Val Loss: 8.3255, Val Acc: 60.00%
Epoch 7/10: Train Loss: 0.1192, Train Acc: 96.49% | Val Loss: 8.0739, Val Acc: 60.00%
  --> Early stopping triggered! No improvement for 5 epochs.
  --> Best Validation Accuracy: 60.00% at Epoch 2

--- Training Complete ---

Model training and evaluation setup complete. You can run this script to train your model.
Consider saving your trained m

In [None]:
# Rebuild the architecture so the saved parameters have a matching module to load into.
model = HypernasalityDetectorResNet18(num_classes=2)

# Checkpoint location, kept in a variable so it can be set for use through APIs.
model_path = 'best_hypernasality_resnet18.pth'

# Map every tensor to the CPU so the checkpoint loads on machines without a GPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust,
# and consider weights_only=True where the installed torch version supports it.
cpu_device = torch.device('cpu')
saved_state = torch.load(model_path, map_location=cpu_device)
model.load_state_dict(saved_state)

# Set the model to evaluation mode (as the last expression, this also displays the model).
model.eval()


HypernasalityDetectorResNet18(
  (resnet18): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1,