<a href="https://colab.research.google.com/github/WyattRoss/csci4170/blob/main/Homework_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch import optim

In [44]:
# Recording files are named {digit}_{speaker}_{index}.wav
speakers = [
  "jackson",
  "nicolas",
  "theo",
  "yweweler",
  "george",
  "lucas",
]
# Recordings per digit; the {index} part of the file name starts at 0.
recordings_per_digit = 50

def load_recording(path, max_frames=20):
  """Load one audio file and return a fixed-width MFCC matrix.

  Args:
    path: Path of the audio file, passed to ``librosa.load``.
    max_frames: Number of time frames (columns) in the returned matrix.
      Shorter recordings are zero-padded on the right; longer ones are
      truncated. Defaults to 20.

  Returns:
    A 2-D array with 13 rows (MFCC coefficients) and ``max_frames`` columns.
  """
  wave, _ = librosa.load(path, sr=None, mono=True)
  wave = wave[::3]  # keep every third sample to make the data more manageable
  # NOTE(review): sr is hard-coded to 8000 even though the waveform was just
  # decimated by 3 - confirm this is the intended rate for the mel filterbank.
  mfcc_wave = librosa.feature.mfcc(y=wave, sr=8000, n_mfcc=13, n_fft=300)
  # Truncate first so the pad width below can never be negative
  # (np.pad raises ValueError on negative widths).
  mfcc_wave = mfcc_wave[:, :max_frames]
  missing = max(0, max_frames - mfcc_wave.shape[1])
  mfcc_wave = np.pad(mfcc_wave, ((0, 0), (0, missing)), mode="constant")
  return mfcc_wave

def generate_dataset(basepath):
  """Load every recording under ``basepath`` into a DataFrame.

  File names are formed as ``basepath + "{digit}_{speaker}_{index}.wav"``, so
  ``basepath`` must already end with a path separator.

  Args:
    basepath: Directory prefix (including trailing separator) of the recordings.

  Returns:
    A DataFrame with columns ``digit``, ``speaker``, ``recording_index`` and
    ``waveform`` (the array returned by ``load_recording``), one row per
    recording, ordered by speaker, then digit, then index.
  """
  columns = ["digit", "speaker", "recording_index", "waveform"]
  # Collect plain records and build the frame once; appending to a DataFrame
  # row by row re-allocates on every insert.
  records = []
  for speaker in speakers:
    for digit in range(10):
      for i in range(recordings_per_digit):
        path = basepath + f"{digit}_{speaker}_{i}.wav"
        records.append({
          "digit": digit,
          "speaker": speaker,
          "recording_index": i,
          "waveform": load_recording(path),
        })
  return pd.DataFrame(records, columns=columns)


In [42]:
# Build the MFCC dataset. The trailing slash matters: generate_dataset
# concatenates each file name directly onto this string.
# NOTE(review): the path looks like a mounted Google Drive folder, but no mount
# step is visible in this notebook - confirm it exists before Run All.
recordings = generate_dataset("./drive/MyDrive/recordings/")

In [70]:
class AudioRNN(nn.Module):
  """Recurrent classifier: a stacked RNN followed by a linear layer.

  The recurrent layers are built with ``batch_first=True``, so inputs are
  expected as ``(batch, time, input_size)``. Only the output at the last time
  step is fed to the linear classifier.

  Args:
    input_size: Number of features per time step.
    hidden_size: Width of each recurrent layer.
    num_layers: Number of stacked recurrent layers.
    num_classes: Number of output logits.
    rnn_type: ``"LSTM"`` or ``"GRU"`` (case-insensitive) select that cell
      type; any other value, including ``None`` and the default
      ``"DEFAULT"``, selects a vanilla ``nn.RNN``.
  """

  def __init__(self, input_size=13, hidden_size=128, num_layers=2, num_classes=10, rnn_type="DEFAULT"):
    super().__init__()

    # None has no .upper(); treat it like any other "not LSTM/GRU" value.
    kind = "" if rnn_type is None else str(rnn_type).upper()
    if kind == "LSTM":
      rnn_cls = nn.LSTM
    elif kind == "GRU":
      rnn_cls = nn.GRU
    else:
      rnn_cls = nn.RNN

    self.rnn = rnn_cls(input_size, hidden_size, num_layers, batch_first=True, bidirectional=False)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """Return class logits of shape ``(batch, num_classes)`` for input ``x``."""
    out, _ = self.rnn(x)
    # Classify from the output at the final time step only.
    out = self.fc(out[:, -1, :])
    return out

  def train_model(self, train_loader, num_epochs=10, learning_rate=0.001, device='cpu', verbose=True, checkpoints=10):
    """Train with Adam and cross-entropy loss.

    Args:
      train_loader: Iterable yielding ``(inputs, labels)`` batches.
      num_epochs: Number of passes over ``train_loader``.
      learning_rate: Adam learning rate.
      device: Device the model and batches are moved to.
      verbose: If true, print the mean batch loss at logging epochs.
      checkpoints: Approximate number of progress lines to print; the loss is
        logged every ``num_epochs // checkpoints`` epochs (at least every
        epoch when ``num_epochs < checkpoints``).
    """
    self.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    # Clamp to >= 1 so the modulo below never divides by zero when
    # num_epochs < checkpoints (or checkpoints <= 0).
    log_every = max(1, num_epochs // max(1, checkpoints))

    self.train()
    for epoch in range(num_epochs):
      total_loss = 0.0
      num_batches = 0
      for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = self(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

      if verbose and epoch % log_every == 0:
        # Counting batches in the loop avoids requiring len(train_loader)
        # and guards against an empty loader.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

  def test_model(self, data_loader, device='cpu', verbose=True):
    """Evaluate classification accuracy.

    Switches the model to eval mode (it is not switched back afterwards).

    Args:
      data_loader: Iterable yielding ``(inputs, labels)`` batches.
      device: Device the model and batches are moved to.
      verbose: If true, print the accuracy.

    Returns:
      Accuracy as a percentage in ``[0, 100]``; ``0.0`` if ``data_loader``
      yields no samples.
    """
    self.eval()
    self.to(device)

    correct = 0
    total = 0

    with torch.no_grad():
      for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = self(inputs)
        _, predicted = torch.max(outputs, dim=1)

        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Avoid ZeroDivisionError on an empty loader.
    accuracy = 100 * correct / total if total else 0.0

    if verbose:
      print(f"Accuracy: {accuracy:.2f}%")

    return accuracy



In [48]:
class TorchAudioDataset(Dataset):
    """Expose a recordings DataFrame as ``(features, label)`` tensor pairs.

    Each row must provide a ``waveform`` entry (an array whose transpose is
    used as the feature tensor) and a ``digit`` entry (the class label).
    """

    def __init__(self, dataframe):
        # Kept by reference; rows are converted to tensors on access.
        self.dataframe = dataframe

    def __len__(self):
        n_rows = len(self.dataframe)
        return n_rows

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]
        mfcc = record['waveform']
        # Transpose so that time is the leading axis: [time, features]
        features = torch.tensor(mfcc.T, dtype=torch.float32)
        label = torch.tensor(record['digit'], dtype=torch.long)
        return features, label


In [49]:
# Loader over every recording, with no hold-out.
# NOTE: `train_loader` is rebound to the training split in the following cell.
dataset = TorchAudioDataset(recordings)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
)

In [55]:
# 80/20 train/validation split, stratified on the digit label, with a fixed seed.
df_train, df_val = train_test_split(
    recordings,
    test_size=0.2,
    random_state=42,
    stratify=recordings["digit"],
)

# Only the training loader shuffles; validation keeps the split's row order.
train_loader = DataLoader(
    dataset=TorchAudioDataset(df_train),
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=TorchAudioDataset(df_val),
    batch_size=32,
)

In [71]:
# Baseline: AudioRNN with its default cell type, trained for 100 epochs on CPU.
model = AudioRNN()
model.train_model(
    train_loader,
    num_epochs=100,
    learning_rate=0.001,
    device="cpu",
    verbose=True,
)
# Last expression of the cell: the validation accuracy (%) is displayed.
model.test_model(val_loader, device="cpu", verbose=True)

Epoch [1/100], Loss: 2.3087
Epoch [11/100], Loss: 1.7860
Epoch [21/100], Loss: 1.4840
Epoch [31/100], Loss: 1.3052
Epoch [41/100], Loss: 1.1906
Epoch [51/100], Loss: 0.9962
Epoch [61/100], Loss: 0.9614
Epoch [71/100], Loss: 0.8829
Epoch [81/100], Loss: 0.7948
Epoch [91/100], Loss: 0.8506
Accuracy: 59.67%


59.666666666666664

In [76]:
# LSTM variant, trained for 10 epochs on CPU.
modelLSTM = AudioRNN(rnn_type="LSTM")
modelLSTM.train_model(
    train_loader,
    num_epochs=10,
    learning_rate=0.001,
    device="cpu",
    verbose=True,
)
# Last expression of the cell: the validation accuracy (%) is displayed.
modelLSTM.test_model(val_loader, device="cpu", verbose=True)

Epoch [1/10], Loss: 2.3042
Epoch [2/10], Loss: 2.1321
Epoch [3/10], Loss: 1.8841
Epoch [4/10], Loss: 1.7743
Epoch [5/10], Loss: 1.6714
Epoch [6/10], Loss: 1.4921
Epoch [7/10], Loss: 1.4184
Epoch [8/10], Loss: 1.3371
Epoch [9/10], Loss: 1.2606
Epoch [10/10], Loss: 1.0969
Accuracy: 61.33%


61.333333333333336

In [73]:
# GRU variant, trained for 10 epochs on CPU.
modelGRU = AudioRNN(rnn_type="GRU")
modelGRU.train_model(
    train_loader,
    num_epochs=10,
    learning_rate=0.001,
    device="cpu",
    verbose=True,
)
# Last expression of the cell: the validation accuracy (%) is displayed.
modelGRU.test_model(val_loader, device="cpu", verbose=True)

Epoch [1/10], Loss: 2.2746
Epoch [2/10], Loss: 1.8537
Epoch [3/10], Loss: 1.3010
Epoch [4/10], Loss: 1.0416
Epoch [5/10], Loss: 0.8505
Epoch [6/10], Loss: 0.7091
Epoch [7/10], Loss: 0.6081
Epoch [8/10], Loss: 0.5073
Epoch [9/10], Loss: 0.4408
Epoch [10/10], Loss: 0.3992
Accuracy: 82.00%


82.0

- Vanilla RNNs suffer from the vanishing/exploding gradient problem, which makes it hard to learn long-term dependencies in sequences.
- They rely solely on a single hidden state updated at each time step, without any mechanism to control what information is remembered or forgotten.
- During backpropagation, gradients shrink or grow exponentially over time steps, leading to unstable or very slow training.
- LSTMs introduce gates (input, forget, output) and a dedicated cell state that allow the network to learn what information to keep or discard across time.
- GRUs simplify LSTMs by using fewer gates (update and reset), combining hidden and cell states, and typically converge faster with similar or better performance on smaller datasets.
- Both LSTMs and GRUs preserve gradient flow better, leading to more stable and faster training than vanilla RNNs.

A traditional feed-forward network could probably be made to work for this problem, but it would likely perform worse. Every input would have to be the same size (here the MFCC matrices are already padded to 20 frames) and then be fully flattened into a single vector. Even then, the network would have no built-in notion of temporal order, so it would lose the key context that the RNN variants thrive on.


The dataset used is the [Free Spoken Digit Dataset (FSDD)](https://github.com/Jakobovski/free-spoken-digit-dataset/tree/master).