In [11]:
import librosa
import numpy as np

import torch.nn as nn
from torch import optim

from x_vectors.models.LDE import LDE
from x_vectors.models.angleloss import AngleLinear
from x_vectors.models.tdnn import TDNN
import torch
from torch.utils.data import DataLoader

from utils import utils
from utils.utils import speech_collate

## Helpers

In [34]:
def load_wav(audio_filepath, sr=16000, min_dur_sec=4):
    """Load an audio file and zero-pad it up to a minimum duration.

    Args:
        audio_filepath: Path passed to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load``.
        min_dur_sec: Minimum duration of the returned signal, in seconds.

    Returns:
        The loaded samples. A signal shorter than ``int(min_dur_sec * sr)``
        samples is right-padded with zeros up to that length; a longer signal
        is returned unchanged. The dtype is the one ``librosa.load`` produced,
        whether or not padding was applied.
    """
    audio_data, _ = librosa.load(audio_filepath, sr=sr)
    missing = int(min_dur_sec * sr) - len(audio_data)

    if missing > 0:
        # np.pad keeps the input dtype. Concatenating with np.zeros (float64)
        # would upcast short clips only, making the dtype depend on clip length.
        return np.pad(audio_data, (0, missing), mode='constant')
    return audio_data

def lin_spectogram_from_wav(wav, hop_length, win_length, n_fft=512):
    """Compute the linear STFT of ``wav`` and return it transposed.

    Args:
        wav: Audio samples passed to ``librosa.stft``.
        hop_length: Hop length forwarded to ``librosa.stft``.
        win_length: Window length forwarded to ``librosa.stft``.
        n_fft: FFT size forwarded to ``librosa.stft``.

    Returns:
        The transpose of the matrix returned by ``librosa.stft``.
    """
    stft_matrix = librosa.stft(
        wav,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
    )
    return stft_matrix.T

def load_data(filepath, sr=16000, min_dur_sec=4, win_length=400, hop_length=160, n_mels=40, spec_len=400, mode='train'):
    """Load an audio file and return its normalized magnitude spectrogram.

    Args:
        filepath: Path of the audio file, passed to ``load_wav``.
        sr: Sampling rate passed to ``load_wav``.
        min_dur_sec: Minimum duration (seconds) passed to ``load_wav``.
        win_length: STFT window length.
        hop_length: STFT hop length.
        n_mels: Unused; kept so existing callers keep working.
        spec_len: Number of frames in the crop taken when ``mode == 'train'``.
        mode: ``'train'`` returns a random crop of ``spec_len`` consecutive
            frames; any other value returns all frames.

    Returns:
        2-D array laid out as (frequency bins, frames). Each frame (column) is
        normalized to zero mean and unit standard deviation across its
        frequency bins.

    Raises:
        ValueError: In ``'train'`` mode, if the spectrogram has fewer than
            ``spec_len`` frames.
    """
    audio_data = load_wav(filepath, sr=sr, min_dur_sec=min_dur_sec)
    linear_spect = lin_spectogram_from_wav(audio_data, hop_length, win_length, n_fft=512)
    # Only the magnitude is kept; the phase is discarded.
    # TODO (open question from the author): why use the magnitude rather than
    # the real part (`linear_spect.real`)?
    mag, _ = librosa.magphase(linear_spect)
    # lin_spectogram_from_wav returns the STFT transposed; transpose it back.
    mag_T = mag.T

    if mode == 'train':
        n_frames = mag_T.shape[1]
        if n_frames < spec_len:
            raise ValueError(
                "spectrogram has %d frames, fewer than spec_len=%d" % (n_frames, spec_len))
        # randint's upper bound is exclusive, so +1 makes the last full window
        # (start == n_frames - spec_len) selectable and keeps the call valid
        # when n_frames == spec_len.
        randtime = np.random.randint(0, n_frames - spec_len + 1)
        spec_mag = mag_T[:, randtime:randtime + spec_len]
    else:
        spec_mag = mag_T

    # Per-frame normalization: mean and std are taken along axis 0.
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    return (spec_mag - mu) / (std + 1e-5)


class SpeechDataGenerator():
    """Indexable collection of (spectrogram features, label) samples.

    Item ``idx`` is built from ``audio_links[idx]`` and ``labels[idx]``; the
    features are computed on access by ``utils.load_data``.
    """

    def __init__(self, audio_links, labels, mode='train', sr=16000):
        """Store the audio paths, their labels and the loading options.

        Args:
            audio_links: Sequence of audio file paths.
            labels: Sequence of labels, indexed in parallel with ``audio_links``.
            mode: Forwarded to ``utils.load_data`` as ``mode``.
            sr: Forwarded to ``utils.load_data`` as ``sr``.
        """
        self.audio_links, self.labels = audio_links, labels
        self.mode, self.sr = mode, sr

    def __len__(self):
        """Number of samples, i.e. the number of audio paths."""
        return len(self.audio_links)

    def __getitem__(self, idx):
        """Return a dict with tensor-valued ``'features'`` and ``'labels'``."""
        path, label = self.audio_links[idx], self.labels[idx]
        features = utils.load_data(path, sr=self.sr, mode=self.mode)
        # np.ascontiguousarray gives torch.from_numpy a contiguous array.
        return {
            'features': torch.from_numpy(np.ascontiguousarray(features)),
            'labels': torch.from_numpy(np.ascontiguousarray(label)),
        }


class X_vector(nn.Module):
    """TDNN stack + statistics pooling + linear layers producing class scores.

    Args:
        input_dim: Feature dimension fed to the first TDNN layer.
        num_classes: Size of the output layer.
        pooling: Stored on the instance; ``forward`` always applies the
            mean/variance pooling below regardless of this value.
        use_angular: When true, an ``AngleLinear`` head is added on top of the
            raw class scores.
        device: Forwarded to ``AngleLinear``.
    """

    def __init__(self, input_dim=257, num_classes=8, pooling='stat', use_angular=True, device='cpu'):
        super(X_vector, self).__init__()

        # Frame-level layers.
        self.tdnn1 = TDNN(input_dim=input_dim, output_dim=512, context_size=5, dilation=1, dropout_p=0.5)
        self.tdnn2 = TDNN(input_dim=512, output_dim=512, context_size=3, dilation=1, dropout_p=0.5)
        self.tdnn3 = TDNN(input_dim=512, output_dim=512, context_size=2, dilation=2, dropout_p=0.5)
        self.tdnn4 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5)
        self.tdnn5 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=3, dropout_p=0.5)
        self.pooling = pooling
        # Segment-level layers; 1024 = mean (512) concatenated with variance (512).
        self.segment6 = nn.Linear(1024, 512)
        self.segment7 = nn.Linear(512, 512)
        self.output = nn.Linear(512, num_classes)
        # Not used by forward(); kept as an attribute for existing callers.
        self.softmax = nn.Softmax(dim=1)
        # Attribute name keeps its original (misspelled) form so that code
        # reading `use_angluar` continues to work.
        self.use_angluar = use_angular
        if self.use_angluar:
            self.fc2 = AngleLinear(num_classes, num_classes, device=device)

    def forward(self, inputs):
        """Run the network on a batch.

        Args:
            inputs: Batch of feature sequences; pooling reduces dim 1, so the
                layout is assumed to be (batch, frames, input_dim) — TODO
                confirm against the TDNN implementation.

        Returns:
            ``(predictions_raw, predictions, x_vec)`` when the angular head is
            enabled, otherwise ``(predictions_raw, x_vec)``.
            ``predictions_raw`` is the output of the final linear layer,
            ``predictions`` is whatever ``AngleLinear`` returns for it, and
            ``x_vec`` is the output of ``segment7``.
        """
        tdnn1_out = self.tdnn1(inputs)
        tdnn2_out = self.tdnn2(tdnn1_out)
        tdnn3_out = self.tdnn3(tdnn2_out)
        tdnn4_out = self.tdnn4(tdnn3_out)
        tdnn5_out = self.tdnn5(tdnn4_out)

        # Statistics pooling over dim 1.
        # NOTE(review): the second statistic is the variance (torch.var), not
        # the standard deviation; behavior is kept unchanged.
        mean = torch.mean(tdnn5_out, 1)
        var = torch.var(tdnn5_out, 1)
        stat_pooling = torch.cat((mean, var), 1)

        segment6_out = self.segment6(stat_pooling)
        x_vec = self.segment7(segment6_out)
        predictions_raw = self.output(x_vec)
        if self.use_angluar:
            predictions = self.fc2(predictions_raw)
            return predictions_raw, predictions, x_vec
        # Without the angular head the raw scores are the predictions; the
        # original returned an unassigned name here and raised.
        return predictions_raw, x_vec


## Load data

In [35]:
# load_data defaults to mode='train', which returns a random crop of spec_len frames.
audio_features = load_data(
    'data_emotion/01_01_01_01_dogs-sitting_fear.wav'
)

In [36]:
# Layout is (frequency bins, frames).
audio_features.shape

(257, 400)

## Data loader

In [37]:
# Toy dataset: the same clip twice, both with label 0, so one batch of two
# items can be drawn.
data_gen = SpeechDataGenerator(
    ['data_emotion/01_01_01_01_dogs-sitting_fear.wav',
     'data_emotion/01_01_01_01_dogs-sitting_fear.wav'],
    [0, 0],
)

dataloader = DataLoader(
    data_gen,
    batch_size=2,
    shuffle=True,
    collate_fn=speech_collate,
)

## Model

In [38]:
# Model with its default constructor arguments.
model = X_vector()

optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.98),
    eps=1e-9,
    weight_decay=0.0,
)

In [39]:
# model

In [42]:
 for i_batch, sample_batched in enumerate(dataloader):
     # Forward pass only: no loss, backward() or optimizer.step() is run here.
     # Each feature tensor is transposed, then all are stacked into one float batch.
     features = torch.stack([item.T for item in sample_batched[0]]).float()
     # First element of every label tensor, stacked into a single tensor.
     labels = torch.stack([item[0] for item in sample_batched[1]])
     features.requires_grad = True
     optimizer.zero_grad()
     pred_logits_raw, pred_logits, x_vec = model(features)

In [44]:
# Output of the model's final linear layer (`output`) for the last batch.
pred_logits_raw

tensor([[-0.0302, -0.0151,  0.0400, -0.0293,  0.0261,  0.0170,  0.0447, -0.0045],
        [-0.0303, -0.0148,  0.0399, -0.0291,  0.0266,  0.0172,  0.0448, -0.0042]],
       grad_fn=<AddmmBackward>)

In [43]:
# Result of the AngleLinear head (`fc2`) applied to pred_logits_raw.
pred_logits

(tensor([[-0.0186, -0.0076,  0.0062,  0.0255, -0.0269, -0.0188,  0.0059,  0.0215],
         [-0.0181, -0.0076,  0.0061,  0.0255, -0.0274, -0.0194,  0.0062,  0.0217]],
        grad_fn=<MulBackward0>),
 tensor([[-0.2758, -0.2492, -0.2398, -0.1858, -0.3070, -0.2766, -0.2401, -0.2011],
         [-0.2745, -0.2494, -0.2402, -0.1859, -0.3093, -0.2786, -0.2400, -0.2007]],
        grad_fn=<MulBackward0>))

In [33]:
# Output of the `segment7` layer for the last batch.
x_vec

tensor([[-0.0382, -0.0146, -0.0213,  ..., -0.0019, -0.0155,  0.0205],
        [-0.0378, -0.0135, -0.0225,  ..., -0.0026, -0.0147,  0.0219]],
       grad_fn=<AddmmBackward>)