In [96]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import torchaudio


import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [98]:
#Hyper parameters

epochs = 75
lr = 0.001
batch_size = 16




In [99]:
class VoiceDataset(Dataset):
    def __init__(self, root):
        self.dir_path = root
        self.classes = os.listdir(self.dir_path)

        print(self.classes)

        self.data_path = []
        self.labels = []

        for root, dirs, files in os.walk(self.dir_path):
            for file in files:
                label = os.path.basename(root)
                data_path = os.path.join(root, file)
                self.data_path.append(data_path)
                self.labels.append(self.classes.index(label))
          

        print(f'{len(self.labels)} data loaded from {len(set(self.labels))} classes')        

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        data_path = self.data_path[index]
        label = self.labels[index]
        
        signal, sample_rate = torchaudio.load(data_path)
        signal = torch.mean(signal, dim = 0, keepdim = True)
        transform = torchaudio.transforms.Resample(sample_rate, 8000)
        signal = transform(signal)

        return signal, label


In [100]:
dataset = VoiceDataset('dataset')

['alireza', 'amir', 'benyamin', 'hossein', 'maryam', 'mohammad', 'morteza', 'nahid', 'parisa', 'zahra', 'zeynab']
1355 data loaded from 11 classes


In [101]:
train_size = int(len(dataset)* 0.8)
test_size = len(dataset) - train_size
print(f'Train_size : {train_size} , ',f'Test_size : {test_size}')

Train_size : 1084 ,  Test_size : 271


In [102]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size,test_size])

train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = True)

In [103]:
class M5(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)


    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = torch.flatten(x, start_dim = 1)
        x = self.fc1(x)
        x = F.softmax(x, dim = 1)
        return x

    def accuracy(self, preds, labels):
        maxs, indices = torch.max(preds, 1)
        acc = torch.sum(indices == labels) / len(preds)
        return acc.cpu()
            

In [104]:
model = M5(n_output = 11).to(device)
print(model)

n = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters: %s" %n)


M5(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(16,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=11, bias=True)
)
Numbe

In [105]:
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
loss_function = nn.CrossEntropyLoss()


In [106]:
#train

model.train()

for epoch in range(epochs):
    train_loss = 0.0
    train_acc = 0.0

    for audio, labels in(train_data_loader):
        audio,labels = audio.to(device), labels.to(device)
        labels_one_hot = torch.nn.functional.one_hot(labels, num_classes = 11).type(torch.FloatTensor).to(device)

        optimizer.zero_grad()
        preds = model(audio)
        loss = loss_function(preds, labels_one_hot)
        loss.backward()
        optimizer.step()
        train_loss += loss
        train_acc += model.accuracy(preds, labels)

    
    total_loss = train_loss / len(train_data_loader)
    total_acc = train_acc / len(train_data_loader)

    if epoch % 10 == 0 :
    
        print(f'Epoch : {epoch}, Loss : {total_loss}, Accuracy : {total_acc}')
        



Epoch : 0, Loss : 2.3141930103302, Accuracy : 0.2705269753932953
Epoch : 10, Loss : 1.8498220443725586, Accuracy : 0.7113970518112183
Epoch : 20, Loss : 1.7521463632583618, Accuracy : 0.805759847164154
Epoch : 30, Loss : 1.637529730796814, Accuracy : 0.9276960492134094
Epoch : 40, Loss : 1.605129599571228, Accuracy : 0.953125
Epoch : 50, Loss : 1.5905132293701172, Accuracy : 0.9623161554336548
Epoch : 60, Loss : 1.5764187574386597, Accuracy : 0.9742646813392639
Epoch : 70, Loss : 1.5785526037216187, Accuracy : 0.9724264740943909


<function matplotlib.pyplot.plot(*args, scalex=True, scaley=True, data=None, **kwargs)>

In [110]:
#evaluation
model.eval()


test_loss = 0.0
test_acc = 0.0

for audio, labels in(test_data_loader):
    audio,labels = audio.to(device), labels.to(device)
    labels_one_hot = torch.nn.functional.one_hot(labels, num_classes = 11).type(torch.FloatTensor).to(device)

    
    preds = model(audio)
    loss = loss_function(preds, labels_one_hot)

    test_loss += loss
    test_acc += model.accuracy(preds, labels)


total_loss = test_loss / len(test_data_loader)
total_acc = test_acc / len(test_data_loader)

print(f'Epoch : {epoch}, Loss : {total_loss}, Accuracy : {total_acc}')


        

    

Epoch : 74, Loss : 1.661403775215149, Accuracy : 0.8889706134796143


In [108]:
torch.save(model.state_dict(), 'weights.pth')

In [109]:
#inference
signal, sample_rate = torchaudio.load('input/test.wav')


signal = torch.mean(signal, dim = 0, keepdim = True)
transform = torchaudio.transforms.Resample(sample_rate, 8000)
signal = transform(signal)

tensor = signal.unsqueeze(0).to(device)

preds = model(tensor)

preds = preds.cpu().detach().numpy()

labels = ['alireza', 'amir', 'benyamin', 'hossein', 'maryam', 'mohammad', 'morteza', 'nahid', 'parisa', 'zahra', 'zeynab']
output = np.argmax(preds)

print(labels[output])




parisa
