In [None]:
import librosa
from librosa import feature
from librosa import display
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
!wget http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
!mkdir ./speech_commands/
!tar -xf /content/speech_commands_v0.02.tar.gz -C ./speech_commands

--2022-10-13 23:57:29--  http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
Resolving download.tensorflow.org (download.tensorflow.org)... 74.125.20.128, 2607:f8b0:400e:c07::80
Connecting to download.tensorflow.org (download.tensorflow.org)|74.125.20.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2428923189 (2.3G) [application/gzip]
Saving to: ‘speech_commands_v0.02.tar.gz’


2022-10-13 23:57:49 (118 MB/s) - ‘speech_commands_v0.02.tar.gz’ saved [2428923189/2428923189]



In [172]:
# Load every clip of the selected categories and compute its MFCC features.
# `dataset` maps each category name to a list of per-clip dicts holding the raw
# signal, the MFCC matrix and its first- and second-order deltas.
dataset = {}
exclude = ["_background_noise_"]
BASEPATH = "/content/speech_commands/"
categories = ["one", "two", "three"]
# Maximum number of clips to load per category; None loads every clip.
# Set it to a small integer for a quick smoke run.
LIMIT = None
for label in categories:
  dataset[label] = []
  label_dir = os.path.join(BASEPATH, label)
  if (not os.path.isdir(label_dir) or label in exclude):
    continue
  # Sort the listing so the load order does not depend on the filesystem.
  wav_files = sorted(name for name in os.listdir(label_dir) if name.endswith(".wav"))
  for file in wav_files[:LIMIT]:
    y, sr = librosa.load(os.path.join(label_dir, file), sr=16000)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
    dataset[label].append({
        "signal": y,
        "mfcc": mfccs,
        "delta": mfcc_delta,
        "delta2": mfcc_delta2
    })

# Show how many clips were loaded per category instead of dumping every array.
{label: len(samples) for label, samples in dataset.items()}

In [190]:
# Turn each category's list of sample dicts into a DataFrame, tag every row
# with the category name in a 'label' column, then stack the per-category
# frames into one table.
dfs = [
    pd.DataFrame(samples).assign(label=name)
    for name, samples in dataset.items()
]

df = pd.concat(dfs)

In [191]:
def visualize_row(row):
  """Draw the MFCC, delta and delta-delta matrices of one sample as three stacked panels.

  row: object indexable by "mfcc", "delta" and "delta2"; each value is handed
    unchanged to ``display.specshow``.
  """
  panels = (
      ("mfcc", 'MFCC'),
      ("delta", r'MFCC-$\Delta$'),
      ("delta2", r'MFCC-$\Delta^2$'),
  )
  fig, axes = plt.subplots(nrows=3, sharex=True, sharey=True, figsize=(15, 15))
  bottom = len(panels) - 1
  images = []
  for position, (key, title) in enumerate(panels):
    axis = axes[position]
    images.append(display.specshow(row[key], ax=axis, x_axis='time'))
    axis.set(title=title)
    if position != bottom:
      # label_outer() is applied to every panel except the bottom one.
      axis.label_outer()
  # One colorbar per panel, attached once all three images are drawn.
  for image, axis in zip(images, axes):
    fig.colorbar(image, ax=[axis])


# Clasificador

In [192]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn as nn

In [193]:
def pad_tensor(tensor, max_size=32):
  """Zero-pad or truncate a 2-D tensor along its last axis to ``max_size`` columns.

  Args:
    tensor: 2-D tensor of shape (rows, cols).
    max_size: number of columns of the returned tensor.

  Returns:
    A new tensor of shape (rows, max_size) created with ``torch.zeros``. Its
    first ``min(cols, max_size)`` columns are copied from ``tensor`` and any
    remaining columns stay zero.
  """
  rows, cols = tensor.size()
  cut_value = min(cols, max_size)
  # Size the target from the input and from max_size rather than fixed constants.
  target = torch.zeros(rows, max_size)
  # Slice the source as well, so inputs wider than max_size are truncated
  # instead of raising a shape-mismatch error.
  target[:, :cut_value] = tensor[:, :cut_value]
  return target


In [194]:
# Lookup tables over the sorted unique label names:
# c2i maps a category name to its integer index, i2c maps the index back.
categories = sorted(df["label"].unique())
c2i = {name: index for index, name in enumerate(categories)}
i2c = dict(enumerate(categories))


class SCData(Dataset):
  """In-memory dataset of stacked MFCC features and integer labels.

  Every row of ``df`` is converted eagerly in the constructor: the 'mfcc',
  'delta' and 'delta2' matrices are each passed through ``pad_tensor`` and
  concatenated along dim 0, and the row's 'label' is mapped to an integer
  through ``c2i``.
  """

  def __init__(self, df, labels, c2i, i2c):
    """Build the feature tensors and label indices for every row of ``df``.

    Args:
      df: DataFrame with 'mfcc', 'delta', 'delta2' and 'label' columns.
      labels: collection of category names, stored on the instance.
      c2i: mapping from category name to integer index.
      i2c: mapping from integer index to category name.
    """
    self.df = df
    self.data = []
    self.labels = []
    self.categories = labels
    # Misspelled attribute kept as an alias so existing code reading it still works.
    self.caegories = labels
    self.c2i = c2i
    self.i2c = i2c
    # Walk the four columns in lockstep instead of a positional lookup per row.
    rows = zip(df['mfcc'], df['delta'], df['delta2'], df['label'])
    for mfcc, delta, delta2, label in tqdm(rows, total=len(df)):
      features = [pad_tensor(torch.Tensor(matrix)) for matrix in (mfcc, delta, delta2)]
      self.data.append(torch.cat(features, 0))
      self.labels.append(self.c2i[label])

  def __len__(self):
    """Return the number of samples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the (features, label_index) pair stored at position ``idx``."""
    return self.data[idx], self.labels[idx]

In [195]:
import numpy as np

# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame into 60% train / 20% validation / 20% test by position.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.6 * len(shuffled))
valid_end = int(.8 * len(shuffled))
train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [196]:
# Wrap each split in an SCData dataset (features are built eagerly in the
# constructor) and expose it through a DataLoader.
train_data = SCData(train, categories, c2i, i2c)
test_data = SCData(test, categories, c2i, i2c)
valid_data = SCData(validation, categories, c2i, i2c)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# The test loader has to read the held-out test split; wrapping the training
# data here would make the reported test accuracy a training accuracy.
# Evaluation loaders do not need shuffling.
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=False)

100%|██████████| 6898/6898 [00:01<00:00, 3451.01it/s]
100%|██████████| 2300/2300 [00:01<00:00, 1779.51it/s]
100%|██████████| 2299/2299 [00:00<00:00, 4775.50it/s]


In [197]:
class RNN(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq, input_size)."""
        # Create the zero initial state on the input's own device and dtype
        # rather than on a notebook-level `device` global, so the module works
        # wherever the input tensor lives.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)

        out, _ = self.rnn(x, h0)
        # Keep only the output of the final time step.
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [198]:
class LSTM(nn.Module):
    """Stacked ``nn.LSTM`` followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the hidden and cell states.
        num_layers: number of stacked recurrent layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq, input_size)."""
        # Create the zero hidden and cell states on the input's own device and
        # dtype rather than on a notebook-level `device` global.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        # Keep only the output of the final time step.
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [199]:
class GRU(nn.Module):
    """Stacked ``nn.GRU`` followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRU, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ``x`` of shape (batch, seq, input_size)."""
        # Create the zero initial state on the input's own device and dtype
        # rather than on a notebook-level `device` global.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)

        out, _ = self.gru(x, h0)
        # Keep only the output of the final time step.
        out = out[:, -1, :]

        out = self.fc(out)
        return out

In [200]:
# Train the RNN classifier on train_loader, then report accuracy on test_loader.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_classes = len(categories)
num_epochs = 50
# NOTE(review): batch_size is not read anywhere in this cell; the loaders
# iterated below were constructed with their own batch size.
batch_size = 100
learning_rate = 0.001

input_size = 60       # features per time step handed to the recurrent layer
sequence_length = 32  # time steps per sample after the reshape below
hidden_size = 128
num_layers = 2

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
model.train()
for epoch in range(num_epochs):
    for i, (audios, labels) in enumerate(train_loader):
        # NOTE(review): reshape() re-reads the stored values as
        # (batch, sequence_length, input_size); it does not swap axes. If the
        # dataset yields (input_size, sequence_length) matrices a transpose may
        # be what is intended. Confirm, and change every cell that feeds this
        # model at the same time.
        audios = audios.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        outputs = model(audios)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for audios, labels in test_loader:
        audios = audios.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(audios)

        # Predicted class is the index of the largest logit.
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    # Report the number of samples actually evaluated; an empty loader gives NaN
    # instead of a division by zero.
    acc = 100.0 * n_correct / n_samples if n_samples else float('nan')
    print(f'Accuracy of the network on the {n_samples} test audios: {acc} %')

Epoch [1/50], Step [100/432], Loss: 1.0962
Epoch [1/50], Step [200/432], Loss: 0.9889
Epoch [1/50], Step [300/432], Loss: 1.0540
Epoch [1/50], Step [400/432], Loss: 1.0333
Epoch [2/50], Step [100/432], Loss: 0.6854
Epoch [2/50], Step [200/432], Loss: 0.4360
Epoch [2/50], Step [300/432], Loss: 0.6951
Epoch [2/50], Step [400/432], Loss: 0.1760
Epoch [3/50], Step [100/432], Loss: 0.5290
Epoch [3/50], Step [200/432], Loss: 0.2574
Epoch [3/50], Step [300/432], Loss: 0.2852
Epoch [3/50], Step [400/432], Loss: 0.7481
Epoch [4/50], Step [100/432], Loss: 0.0783
Epoch [4/50], Step [200/432], Loss: 0.4697
Epoch [4/50], Step [300/432], Loss: 0.3167
Epoch [4/50], Step [400/432], Loss: 0.1192
Epoch [5/50], Step [100/432], Loss: 0.0149
Epoch [5/50], Step [200/432], Loss: 0.0676
Epoch [5/50], Step [300/432], Loss: 0.4225
Epoch [5/50], Step [400/432], Loss: 0.0568
Epoch [6/50], Step [100/432], Loss: 0.1706
Epoch [6/50], Step [200/432], Loss: 0.2514
Epoch [6/50], Step [300/432], Loss: 0.0427
Epoch [6/50

In [201]:
# Keep a handle on the trained RNN, then count correct (b) and incorrect (m)
# predictions on at most the first 1000 validation samples.
RNN_model = model
b = 0
m = 0
model.eval()
# Inference only: no autograd bookkeeping is needed.
with torch.no_grad():
  # Cap the range at the dataset size so a small validation split cannot raise IndexError.
  for i in range(min(1000, len(valid_data))):
    d, l = valid_data[i]
    # Add a batch axis and move the sample to the device the model was placed on.
    y_hat = model(d.reshape(-1, sequence_length, input_size).to(device))
    y_hat = valid_data.i2c[torch.argmax(y_hat).item()]
    y = valid_data.i2c[l]
    if y_hat == y:
      b += 1
    else:
      m += 1
print(b, m)

904 96


In [202]:
# Train the LSTM classifier on train_loader, then report accuracy on test_loader.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_classes = len(categories)
num_epochs = 50
# NOTE(review): batch_size is not read anywhere in this cell; the loaders
# iterated below were constructed with their own batch size.
batch_size = 100
learning_rate = 0.001

input_size = 60       # features per time step handed to the recurrent layer
sequence_length = 32  # time steps per sample after the reshape below
hidden_size = 128
num_layers = 2

model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
model.train()
for epoch in range(num_epochs):
    for i, (audios, labels) in enumerate(train_loader):
        # NOTE(review): reshape() re-reads the stored values as
        # (batch, sequence_length, input_size); it does not swap axes. If the
        # dataset yields (input_size, sequence_length) matrices a transpose may
        # be what is intended. Confirm, and change every cell that feeds this
        # model at the same time.
        audios = audios.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        outputs = model(audios)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for audios, labels in test_loader:
        audios = audios.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(audios)

        # Predicted class is the index of the largest logit.
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    # Report the number of samples actually evaluated; an empty loader gives NaN
    # instead of a division by zero.
    acc = 100.0 * n_correct / n_samples if n_samples else float('nan')
    print(f'Accuracy of the network on the {n_samples} test audios: {acc} %')

Epoch [1/50], Step [100/432], Loss: 0.5429
Epoch [1/50], Step [200/432], Loss: 0.1333
Epoch [1/50], Step [300/432], Loss: 0.0659
Epoch [1/50], Step [400/432], Loss: 0.3097
Epoch [2/50], Step [100/432], Loss: 0.4896
Epoch [2/50], Step [200/432], Loss: 0.0628
Epoch [2/50], Step [300/432], Loss: 0.0171
Epoch [2/50], Step [400/432], Loss: 0.2995
Epoch [3/50], Step [100/432], Loss: 0.2159
Epoch [3/50], Step [200/432], Loss: 0.0374
Epoch [3/50], Step [300/432], Loss: 0.0193
Epoch [3/50], Step [400/432], Loss: 0.0516
Epoch [4/50], Step [100/432], Loss: 0.0038
Epoch [4/50], Step [200/432], Loss: 0.0134
Epoch [4/50], Step [300/432], Loss: 0.0228
Epoch [4/50], Step [400/432], Loss: 0.0043
Epoch [5/50], Step [100/432], Loss: 0.0011
Epoch [5/50], Step [200/432], Loss: 0.0058
Epoch [5/50], Step [300/432], Loss: 0.0017
Epoch [5/50], Step [400/432], Loss: 0.0445
Epoch [6/50], Step [100/432], Loss: 0.0399
Epoch [6/50], Step [200/432], Loss: 0.2865
Epoch [6/50], Step [300/432], Loss: 0.0579
Epoch [6/50

In [203]:
# Keep a handle on the trained LSTM, then count correct (b) and incorrect (m)
# predictions on at most the first 1000 validation samples.
LSTM_model = model
b = 0
m = 0
model.eval()
# Inference only: no autograd bookkeeping is needed.
with torch.no_grad():
  # Cap the range at the dataset size so a small validation split cannot raise IndexError.
  for i in range(min(1000, len(valid_data))):
    d, l = valid_data[i]
    # Add a batch axis and move the sample to the device the model was placed on.
    y_hat = model(d.reshape(-1, sequence_length, input_size).to(device))
    y_hat = valid_data.i2c[torch.argmax(y_hat).item()]
    y = valid_data.i2c[l]
    if y_hat == y:
      b += 1
    else:
      m += 1
print(b, m)

929 71


In [204]:
# Train the GRU classifier on train_loader, then report accuracy on test_loader.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_classes = len(categories)
num_epochs = 50
# NOTE(review): batch_size is not read anywhere in this cell; the loaders
# iterated below were constructed with their own batch size.
batch_size = 100
learning_rate = 0.001

input_size = 60       # features per time step handed to the recurrent layer
sequence_length = 32  # time steps per sample after the reshape below
hidden_size = 128
num_layers = 2

model = GRU(input_size, hidden_size, num_layers, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
model.train()
for epoch in range(num_epochs):
    for i, (audios, labels) in enumerate(train_loader):
        # NOTE(review): reshape() re-reads the stored values as
        # (batch, sequence_length, input_size); it does not swap axes. If the
        # dataset yields (input_size, sequence_length) matrices a transpose may
        # be what is intended. Confirm, and change every cell that feeds this
        # model at the same time.
        audios = audios.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        outputs = model(audios)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for audios, labels in test_loader:
        audios = audios.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(audios)

        # Predicted class is the index of the largest logit.
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    # Report the number of samples actually evaluated; an empty loader gives NaN
    # instead of a division by zero.
    acc = 100.0 * n_correct / n_samples if n_samples else float('nan')
    print(f'Accuracy of the network on the {n_samples} test audios: {acc} %')

Epoch [1/50], Step [100/432], Loss: 0.4938
Epoch [1/50], Step [200/432], Loss: 0.2204
Epoch [1/50], Step [300/432], Loss: 0.3470
Epoch [1/50], Step [400/432], Loss: 0.1790
Epoch [2/50], Step [100/432], Loss: 0.1508
Epoch [2/50], Step [200/432], Loss: 0.1027
Epoch [2/50], Step [300/432], Loss: 0.0536
Epoch [2/50], Step [400/432], Loss: 0.0446
Epoch [3/50], Step [100/432], Loss: 0.0074
Epoch [3/50], Step [200/432], Loss: 0.1189
Epoch [3/50], Step [300/432], Loss: 0.0058
Epoch [3/50], Step [400/432], Loss: 0.0235
Epoch [4/50], Step [100/432], Loss: 0.0106
Epoch [4/50], Step [200/432], Loss: 0.0318
Epoch [4/50], Step [300/432], Loss: 0.0186
Epoch [4/50], Step [400/432], Loss: 0.1974
Epoch [5/50], Step [100/432], Loss: 0.0525
Epoch [5/50], Step [200/432], Loss: 0.2970
Epoch [5/50], Step [300/432], Loss: 0.0152
Epoch [5/50], Step [400/432], Loss: 0.0128
Epoch [6/50], Step [100/432], Loss: 0.0029
Epoch [6/50], Step [200/432], Loss: 0.5586
Epoch [6/50], Step [300/432], Loss: 0.0129
Epoch [6/50

In [205]:
# Keep a handle on the trained GRU, then count correct (b) and incorrect (m)
# predictions on at most the first 1000 validation samples.
GRU_model = model
b = 0
m = 0
model.eval()
# Inference only: no autograd bookkeeping is needed.
with torch.no_grad():
  # Cap the range at the dataset size so a small validation split cannot raise IndexError.
  for i in range(min(1000, len(valid_data))):
    d, l = valid_data[i]
    # Add a batch axis and move the sample to the device the model was placed on.
    y_hat = model(d.reshape(-1, sequence_length, input_size).to(device))
    y_hat = valid_data.i2c[torch.argmax(y_hat).item()]
    y = valid_data.i2c[l]
    if y_hat == y:
      b += 1
    else:
      m += 1
print(b, m)

927 73


In [207]:
# Write each trained network object to disk with torch.save.
# NOTE(review): whole module objects are saved rather than state_dicts, which
# presumably ties the files to these class definitions at load time. Confirm
# that is intended.
for trained, path in (
    (RNN_model, "/content/rnn.pt"),
    (LSTM_model, "/content/lstm.pt"),
    (GRU_model, "/content/gru.pt"),
):
  torch.save(trained, path)
