# Install Dependencies & Import

In [99]:
# %pip install git+https://github.com/openai/whisper.git
%pip install torchaudio




In [100]:
#imports
import os
import re
import string
import numpy as np

# Fixes dead kernel issue on my computer
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Load train and test dataset

Load train and test datasets (LibriTTS):

In [101]:
import torch
import torchaudio
import pickle

# Stabilize across runs
torch.manual_seed(7)


# train_url = "train-clean-100"
# test_url = "test-clean"

# train_data = torchaudio.datasets.LIBRITTS(root="data", url=train_url, download=True)
# test_data = torchaudio.datasets.LIBRITTS(root="data", url=test_url, download=True)

with open("prepared-data/dataset_4000.pkl", "rb") as f:
  train_data = pickle.load(f)

print(train_data[100][0])
print(len(train_data))

with open("prepared-data/dataset_1000.pkl", "rb") as f:
  test_data = pickle.load(f)

print(len(test_data))


tensor([[1.6849e-10, 8.7600e-12, 4.0567e-13,  ..., 3.2867e-14, 8.8765e-12,
         6.9121e-11],
        [1.9077e-10, 3.2339e-09, 5.5547e-10,  ..., 2.8855e-09, 3.2846e-10,
         1.0438e-09],
        [1.2463e-09, 9.3136e-09, 9.9012e-10,  ..., 2.9423e-10, 1.1114e-09,
         8.3715e-10],
        ...,
        [2.9192e-12, 5.3809e-12, 1.6127e-12,  ..., 5.1795e-12, 3.2359e-13,
         5.3989e-12],
        [1.2314e-12, 6.2185e-13, 7.1538e-12,  ..., 6.6053e-13, 9.6694e-12,
         5.6124e-12],
        [1.0694e-11, 5.9805e-12, 2.6338e-12,  ..., 2.6154e-12, 7.9533e-13,
         6.5433e-12]])
4000
1000


We pass the datasets as an argument to DataLoader.

In [102]:
# data_loader = torch.utils.data.DataLoader(data, batch_size=1, shuffle=True)
import torch.utils.data as data
from torchvision.transforms import ToTensor

batch_size = 20

train_loader = data.DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              )
test_loader = data.DataLoader(dataset=test_data,
                              batch_size=batch_size,
                              shuffle=False,
                              )

# Create RNN model

In [103]:
from torch import nn

 # Get ideal device (CPU, GPU, or MPS for Apple Silicon)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using {device} device")

# Simple RNN model
class RNeuralNetwork(nn.Module):
     def __init__(self):
        super(RNeuralNetwork, self).__init__()
        self.RNN = nn.RNN(input_size=1, hidden_size=1, num_layers=3, nonlinearity = "relu", dropout = 0.01)
        
     def forward(self, x):
        x = self.RNN(x)
        return x

model = RNeuralNetwork().to(device)
print(model)

Using cpu device
RNeuralNetwork(
  (RNN): RNN(1, 1, num_layers=3, dropout=0.01)
)


# Optimize the Model Parameters


In [104]:
# Use cross-entropy loss as the loss function
loss_fn = nn.CrossEntropyLoss()
learning_rate = 1e-2

# Define a pytorch optimizer using stochastic gradient descent (SGD)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Train model using training dataset

In [105]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    
    # set the model to train mode
    model.train()
    
    # keep track of loss every batch
    losses = []
    
    for batch_idx, _data in enumerate(dataloader):
        spectrograms, labels = _data[0], _data[1] 
        torch.flatten(spectrograms)
        torch.reshape(spectrograms, (129))
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        output = model(spectrograms)  # (batch, time, n_class)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1) # (time, batch, n_class)

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()

        # print('loss', loss.item())
        # print('learning_rate', scheduler.get_lr())

        optimizer.step()

        loss = loss.item()
        if batch_idx % 10 == 0 or batch_idx == data_len:
          print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(spectrograms), data_len,
            100. * batch_idx / len(train_loader), loss.item())
          )
#         if batch % 100 == 0:
#             print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        losses.append(loss)
        
    return np.array(losses).mean()

# Test model using test dataset

In [106]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    
    # Set the model to eval mode
    model.eval()
    
    test_loss, correct = 0, 0
    with torch.no_grad():# no_grad mode doesn't compute gradients
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X) # compute predictions from X
            test_loss += loss_fn(pred, y).item() # compute the test loss
            correct += (pred.argmax(1) == y).type(torch.float).sum().item() # number of correct predictions
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

In [107]:
epochs = 10

# for plotting the training loss
history = {'losses': [], 'accuracies': []}
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    history['losses'].append(train(train_loader, model, loss_fn, optimizer))
    history['accuracies'].append(test(test_loader, model, loss_fn))
    
    plt.clf()
    fig1 = plt.figure()
    plt.plot(history['losses'], 'r-', lw=2, label='loss')
    plt.legend()
    display.clear_output(wait=True)
    display.display(plt.gcf())

    plt.clf()
    fig2 = plt.figure()
    plt.plot(history['accuracies'], 'b-', lw=1, label='accuracy')
    plt.legend()
#     display.clear_output(wait=True)
    display.display(plt.gcf())
print("Done!")

Epoch 1
-------------------------------


RuntimeError: stack expects each tensor to be equal size, but got [100, 305] at entry 0 and [100, 589] at entry 1

### Notes
- Evaluation is hard. What about slight differences?
- Ground truths are possibly noisy.