Loading the Data

In [5]:
import torch
import torchaudio
import os
from torchaudio.datasets import SPEECHCOMMANDS
class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./", download=True)
        
        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]

train_set = SubsetSC("training")
test_set = SubsetSC("testing")

waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

print(f"Shape of waveform: {waveform.size()}")
print(f"Label: {label}")
print(f"Sample Rate: {sample_rate}")

Shape of waveform: torch.Size([1, 16000])
Label: backward
Sample Rate: 16000


For the recognition, we will try to just recognize what a word looks like using a convolutional neural network. So we can transform the audio from raw audio into a 2d spectogram then input that into the cnn. Additionally, for shorter words, the cnn can find the word shape wherever it is in the word.

In [19]:
transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=16000,
    n_mels=64
)
spectrogram = transform(waveform)
print("Shape: ", spectrogram.shape)

Shape:  torch.Size([1, 64, 81])


Showing what a spectogram looks like

In [10]:
import matplotlib.pyplot as plt
import numpy as np
plt.plot(spectrogram[0][0])

NameError: name 'spectrogram' is not defined

Making labels for training. ready_data function is used to standardize the data and 

In [9]:
labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
label_to_index = {label: index for index, label in enumerate(labels)}
def ready_data(batch):
    #A batch is a list of objects in the train set
    tensors, targets = [],[]
    for waveform, _, label, *_ in batch:
        tensors += [waveform.squeeze(0)]
        targets += [label_to_index[label]]
    # makes all samples the same size
    tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)

    #makes targets into single tensor
    targets = torch.stack([torch.tensor(t) for t in targets])
    return tensors, targets
    

In [12]:
num_labels = len(set(labels))
print(len(labels))
print(train_set[4][2])
print(num_labels)
print()

35
backward
35



Using the cnn we used for homework

In [1]:
from torchvision import models
import torch.nn as nn

In [2]:
model = models.mobilenet_v2(pretrained=True)



In [3]:
model.features

Sequential(
  (0): Conv2dNormActivation(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU6(inplace=True)
  )
  (1): InvertedResidual(
    (conv): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (2): InvertedResidual(
    (conv): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (

Im training using my mac so this is for using mac gpus.

In [6]:
import torch.optim as optim
from tqdm import tqdm
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device found. Using MPS.")
else:
    device = torch.device("cpu")
    print("MPS not available. Using CPU.")

MPS device found. Using MPS.


training preperation

In [7]:
num_features = model.features[-1].out_channels
print(num_features)

1280


In [15]:
train_loader = torch.utils.data.DataLoader(
    train_set, 
    batch_size=32, 
    shuffle=True, 
    collate_fn=ready_data,
    num_workers=0
)

test_loader = torch.utils.data.DataLoader(
    test_set, 
    batch_size=32, 
    shuffle=False, 
    collate_fn=ready_data,
    num_workers=0
)

num_hidden = 100


model.classifier = nn.Sequential(
nn.Dropout(0.2),
nn.Linear(in_features=num_features, out_features=num_hidden),
nn.ReLU(),
nn.Linear(in_features=num_hidden, out_features=num_labels)
)
# Making process run on gpu
model = model.to(device)

transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

Train step

In [16]:
nepochs = 5
for epoch in range(nepochs):
    model.train()
    running_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)
    for images, labels in loop:
        images = images.to(device)
        labels = labels.to(device)

        #making image 3d to match rgb
        spectrogram = transform(images)
        spectrogram = spectrogram.unsqueeze(1)
        spectrogram = spectrogram.repeat(1,3,1,1)

        optimizer.zero_grad()
        outputs = model(spectrogram)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1}, Training Loss: {running_loss:.4f}")

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        loop = tqdm(test_loader, desc="Evaluating", leave=True)
        for images, labels in loop:

            images = images.to(device)
            labels = labels.to(device)
            spectrogram = transform(images)
            spectrogram = spectrogram.unsqueeze(1)
            spectrogram = spectrogram.repeat(1,3,1,1)
            outputs = model(spectrogram)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            accuracy = 100 * correct / total 
            loop.set_postfix({'Accuracy': f'{accuracy:.2f}%'})
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}, Test Accuracy: {accuracy:.2f}%\n")


Epoch 1: 100%|██████████| 2652/2652 [06:12<00:00,  7.13it/s, loss=0.398]  


Epoch 1, Training Loss: 769.5584


Evaluating: 100%|██████████| 344/344 [00:36<00:00,  9.36it/s, Accuracy=60.75%]


Epoch 1, Test Accuracy: 60.75%



Epoch 2: 100%|██████████| 2652/2652 [05:58<00:00,  7.40it/s, loss=0.137]  


Epoch 2, Training Loss: 442.9202


Evaluating: 100%|██████████| 344/344 [00:35<00:00,  9.67it/s, Accuracy=68.88%]


Epoch 2, Test Accuracy: 68.88%



Epoch 3: 100%|██████████| 2652/2652 [06:10<00:00,  7.15it/s, loss=0.275]  


Epoch 3, Training Loss: 380.3444


Evaluating: 100%|██████████| 344/344 [00:35<00:00,  9.63it/s, Accuracy=83.47%]


Epoch 3, Test Accuracy: 83.47%



Epoch 4: 100%|██████████| 2652/2652 [06:11<00:00,  7.14it/s, loss=0.0102] 


Epoch 4, Training Loss: 338.0705


Evaluating: 100%|██████████| 344/344 [00:35<00:00,  9.58it/s, Accuracy=69.72%]


Epoch 4, Test Accuracy: 69.72%



Epoch 5: 100%|██████████| 2652/2652 [06:02<00:00,  7.32it/s, loss=0.0424]  


Epoch 5, Training Loss: 304.3359


Evaluating: 100%|██████████| 344/344 [00:34<00:00, 10.07it/s, Accuracy=67.72%]

Epoch 5, Test Accuracy: 67.72%






In [17]:
import torch
import torchaudio

def predict_single_file(filepath, model, transform, class_names, device):
    # 1. Load the audio
    waveform, sample_rate = torchaudio.load(filepath)
    
    # Optional: Resample if your file isn't 16000Hz
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform).to(device)

    # 2. Transform to Spectrogram
    # Input: [1, Time] -> Output: [1, Freq, Time]
    spectrogram = transform(waveform)
    
    # 3. Add Batch and Channel Dimensions
    # We need shape [1, 1, Freq, Time] for the model
    spectrogram = spectrogram.unsqueeze(0) 
    
    # Note: If you are using the 'AudioMobileNet' class we defined earlier, 
    # it handles the repetition to 3 channels internally. 
    # If using a raw model, you might need: spectrogram = spectrogram.repeat(1, 3, 1, 1)

    # 4. Move to GPU/CPU
    spectrogram = spectrogram.to(device)
    spectrogram = spectrogram.repeat(1,3,1,1)

    # 5. Predict
    model.eval() # Switch to eval mode (turns off dropout, etc.)
    with torch.no_grad():
        output = model(spectrogram)
        
        # Get the class with the highest score
        prediction_index = output.argmax(dim=1).item()
        predicted_label = class_names[prediction_index]
        confidence = output.softmax(dim=1).max().item()

    return predicted_label, confidence

In [18]:
# 1. Get your list of class names (ensure this matches your training!)
# If you used the SpeechCommands code earlier, it's likely this:
# class_names = sorted(list(set(datapoint[2] for datapoint in train_set)))

# 2. Define the path to your wav file
wav_file_path = "batHappy.wav" # <--- Replace this with your file path

# 3. Run prediction
prediction, conf = predict_single_file(
    wav_file_path, 
    model, 
    transform, 
    labels, 
    device
)

print(f"Prediction: {prediction}")
print(f"Confidence: {conf * 100:.2f}%")
print(label_to_index["happy"])

Prediction: 10
Confidence: 99.99%
12


Next, I will try a 1D CNN that is better accustomed for this task.

In [None]:
#same as ready data command but for 1D data
def ready_data_array(batch):
    batch = [item for item in batch]
    tensors = [item[0].permute(1,0) for item in batch]
    tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
    tensors = tensors.permute(0, 2, 1)

    targets = torch.tensor([label_to_index[item[2]] for item in batch])

    return tensors, targets

In [None]:
# New Data Loader
batch_size = 256

train_loader2 = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, collate_fn=ready_data_array, num_workers=1
)

test_loader2 = torch.utils.data.DataLoader(
    test_set, batch_size=batch_size, shuffle=True, collate_fn=ready_data_array, num_workers=1
)

I will be using the M5 model for this. It appears that M5 is known for processing raw audio.

In [None]:
class M5(nn.Module):
    def __init__(self, n_input=1, _output=35, stride=16, n_channel=32):
        super().__init__()

        # Block 1
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatcdhNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)

        # Block 2
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatcdhNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)

        # Block 3
        self