### Spoken Language Processing
В этом задании предлагается обучить классификатор класса возраста по голосу (пример с тем, как это можно сделать для пола см. в семинаре)

Подумайте, как лучше предсказывать возраст (может быть разбить на группы?) и какой лосс использовать

P.S. не забудьте, что если то вы работает в Colab, то вы можете поменять среду выполнения на GPU/TPU!

Вопросы по заданию/материалам: @Nestyme

In [None]:
!pip3 install timit-utils==0.9.0
!pip3 install torchaudio
! wget https://ndownloader.figshare.com/files/10256148 
!unzip -q 10256148

In [1]:
import timit_utils as tu
import os
import random
import numpy as np
from tqdm.notebook import tqdm, trange

import torch
import torchaudio
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

import IPython
_TIMIT_PATH = 'data/lisa/data/timit/raw/TIMIT'

In [2]:
SEED = 1234


def enable_reproducibility(
        seed=SEED, raise_if_no_deterministic=True,
        cudnn_deterministic=True, disable_cudnn_benchmarking=True):
    # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
    torch.use_deterministic_algorithms(raise_if_no_deterministic)

    # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"
    
    torch.backends.cudnn.benchmark = not disable_cudnn_benchmarking
    torch.backends.cudnn.deterministic = cudnn_deterministic

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

enable_reproducibility()

## Задание 1
Загрузите данные для обучения. Для этого:
1. Скачайте датасет TIMIT (см семинар)
2. Соберите пары "голос"  — "класс возраста" также, как на семинаре собирались пары "голос"  — "пол". Аудиодорожки сконвертируйте в мелспектрограммы при помощи `torchaudio либо` `librosa`

P.S. вы можете использовать свою реализацию, а можете предложенную (см следующие ячейки)

In [181]:
import timit_utils as tu
import os
import numpy as np
from torch.utils.data import DataLoader, Dataset

def spec_to_image(spec, eps=1e-6):
      mean = spec.mean()
      std = spec.std()
      spec_norm = (spec - mean) / (std + eps)
      spec_min, spec_max = spec_norm.min(), spec_norm.max()
      spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
      spec_scaled = spec_scaled.astype(np.uint8)
      return spec_scaled

class TimitDataset(Dataset):
    def __init__(self, data_path=_TIMIT_PATH, mode='train'):
        self.doc_file_path = os.path.join(data_path, 'DOC', 'SPKRINFO.TXT')
        self.corpus = tu.Corpus(data_path)
        with open(self.doc_file_path) as f:
            self.id_age_dict = dict(
                [(tmp.split(' ')[0], 86 - int(tmp.split('  ')[5].split('/')[-1].replace('??', '50'))) \
                 for tmp in f.readlines()[39:]])
        
        self.data, self.target, self.age_classes = self.create_dataset(mode)

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        age_class = self.age_classes[idx]
        age = self.target[idx]
        voice = self.data[idx]
        return voice, age, age_class

    def return_age(self, id):
        return self.id_age_dict[id]

    def create_dataset(self, mode):
        global people
        assert mode in ['train', 'valid', 'test']
        if mode == 'train':
            people = [self.corpus.train.person_by_index(i) for i in range(350)]
        if mode == 'valid':
            people = [self.corpus.train.person_by_index(i) for i in range(350, 400)]
        if mode == 'test':
            people = [self.corpus.test.person_by_index(i) for i in range(150)]
        spectrograms = []
        targets = []
        age_classes = []

        for person in tqdm(people):
                target = torch.tensor(self.return_age(person.name), dtype=torch.float)
                for i in range(len(person.sentences)):
                    spectrograms.append(
                          self.preprocess_sample(person.sentence_by_index(i).raw_audio)                          
                        )
                    targets.append(target)
                    age_classes.append(self.clasterize_by_age(target))

        return spectrograms, targets, age_classes

    def clasterize_by_age(self, age):
        if age <= 30:
            return 0
        if 30 < age <= 50:
            return 1
        if age > 50:
            return 2

    def preprocess_sample(self, amplitudes, sr=16000, max_length=150): 
        amplitudes = torch.tensor(amplitudes, dtype=torch.float) 
        spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=128, sample_rate=sr, f_min=1, f_max=8192)(amplitudes)[:, :max_length]
        spectrogram = np.pad(spectrogram, [[0, 0], [0, max(0, max_length - spectrogram.shape[1])]], mode='constant')
        return torch.tensor(spec_to_image(np.float32(spectrogram)), dtype=torch.float)

In [182]:
train = TimitDataset(mode='train')
valid = TimitDataset(mode='valid')
test = TimitDataset(mode='test')

  0%|          | 0/350 [00:00<?, ?it/s]

  "At least one mel filterbank has all zero values. "


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

In [183]:
BATCH_SIZE = 64

train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
test_loader = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

Простая сверточная сеть, ее можно дотюнить или поменять по желанию

In [204]:
class Model(nn.Module):
    def __init__(self, window_sizes=(3, 4, 5), num_classes=1):
        super(Model, self).__init__()

        self.convs = nn.ModuleList([
            nn.Conv2d(1, 128, (window_size, 150), padding=(window_size - 1, 0))
            for window_size in window_sizes
        ])

        
        self.fc = nn.Linear(128 * len(window_sizes) * 2, 128 * len(window_sizes) * 2)
        self.bn = nn.BatchNorm1d(128 * len(window_sizes) * 2)

        self.mse_fc = nn.Linear(128 * len(window_sizes) * 2, 1)
        self.ce_fc = nn.Linear(128 * len(window_sizes) * 2, num_classes)

    def forward(self, x):
        x = torch.unsqueeze(x, 1)  # [B, C, T, E] Add a channel dim.
        xs = []
        for conv in self.convs:
            x2 = F.relu(conv(x))  # [B, T, F, 1]
            x2 = torch.squeeze(x2, -1)  # [B, T, F]
            max_x = F.max_pool1d(x2, x2.size(2))  # [B, T, 1]
            avg_x = F.avg_pool1d(x2, x2.size(2))

            xs.append(max_x)
            xs.append(avg_x)

        x = torch.cat(xs, 2)  # [B, T, window]

        # FC
        x = x.view(x.size(0), -1) # [B, T * window]
        mse_out = self.mse_fc(x).view(-1)  # [B, value]

        # FC
        ce_out = self.ce_fc(x)  # [B, classes]

        
        return mse_out, ce_out

In [205]:
def training(model, criterions, optimizer, num_epochs, train_loader, valid_loader=None, sheduler=None, max_grad_norm=2, patience=5):
    best_loss = 1000
    cnt = 0
    mse = criterions[0]
    cross_entropy = criterions[1]

    for e in trange(num_epochs):

        model.train()
        
        valid_acc = 0
        num_iter = 0
        train_loss = 0
        correct = 0
        num_objs = 0

        pbar = tqdm(train_loader, leave=False)
        for batch in pbar:
            optimizer.zero_grad()
            labels = batch[1].to(device)
            age_classes = batch[2].to(device)
            mse_pred, ce_pred = model(batch[0].to(device))

            loss = mse(mse_pred, labels) + cross_entropy(ce_pred, age_classes)
            
            loss.backward()

            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            train_loss += loss
            correct += (age_classes == ce_pred.argmax(-1)).float().sum()
            num_objs += len(labels)
            num_iter += 1

        if sheduler is not None:
            sheduler.step()
        
        train_acc = correct/num_objs
        print(f"Epoch: {e}")
        print(f"Train Loss: {train_loss / num_iter}, accuracy: {train_acc}")

        if valid_loader:
          valid_loss = 0
          num_iter = 0
          model.eval()
          with torch.no_grad():
              correct = 0
              num_objs = 0

              for batch in valid_loader:
                  labels = batch[1].to(device)
                  age_classes = batch[2].to(device)
                  mse_pred, ce_pred = model(batch[0].to(device))
                  valid_loss += mse(mse_pred, labels)
                  correct += (age_classes == ce_pred.argmax(-1)).float().sum()
                  num_objs += len(labels)
                  num_iter += 1
          
          valid_acc = correct/num_objs
          print(f"Valid Loss: {valid_loss / num_iter}, accuracy: {valid_acc}")

        if e % patience == 0 and best_loss > valid_loss.item():
            best_loss = valid_loss.item()
            cnt = 0
        else:
            cnt += 1
        
        if cnt > patience:
          break

    return train_acc, valid_acc

In [206]:
weights = [0] * 3
for t in tqdm(train):
  for i in range(3):
    weights[i] += 1 if t[2] == i else 0
print(weights)
weights = sum(weights) / torch.tensor(weights, dtype=torch.float32).to(device)
weights = weights / weights.min()

weights

  0%|          | 0/3500 [00:00<?, ?it/s]

[2160, 1230, 110]


tensor([ 1.0000,  1.7561, 19.6364])

In [207]:
model = Model(num_classes=3)
model.to(device)

optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], betas=(0.9, 0.999), eps=1e-5, lr=1e-3
)

mse = nn.MSELoss()
cross_entropy = nn.CrossEntropyLoss(weight=weights)

sheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

patience = 10
num_epochs = 10

In [208]:
enable_reproducibility(raise_if_no_deterministic=False)
training(model, (mse, cross_entropy), optimizer, num_epochs, train_loader, valid_loader, sheduler, patience=patience)
enable_reproducibility()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 0
Train Loss: 229.03732299804688, accuracy: 0.3541666567325592
Valid Loss: 99.61124420166016, accuracy: 0.3794642984867096


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 1
Train Loss: 84.92003631591797, accuracy: 0.3715277910232544
Valid Loss: 93.63968658447266, accuracy: 0.5491071343421936


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 2
Train Loss: 64.20616912841797, accuracy: 0.46325230598449707
Valid Loss: 102.46958923339844, accuracy: 0.3816964328289032


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 3
Train Loss: 46.45378494262695, accuracy: 0.5581597089767456
Valid Loss: 94.18233489990234, accuracy: 0.5357142686843872


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 4
Train Loss: 33.43952941894531, accuracy: 0.5868055820465088
Valid Loss: 102.24708557128906, accuracy: 0.5446428656578064


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 5
Train Loss: 20.83363151550293, accuracy: 0.6724537014961243
Valid Loss: 90.51612854003906, accuracy: 0.4888392984867096


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 6
Train Loss: 13.609148979187012, accuracy: 0.7141203880310059
Valid Loss: 85.26918029785156, accuracy: 0.4375


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 7
Train Loss: 9.848132133483887, accuracy: 0.7216435074806213
Valid Loss: 90.99998474121094, accuracy: 0.5133928656578064


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 8
Train Loss: 7.67041540145874, accuracy: 0.7850115895271301
Valid Loss: 89.64395904541016, accuracy: 0.5111607313156128


  0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 9
Train Loss: 7.435110569000244, accuracy: 0.7826967835426331
Valid Loss: 90.5306396484375, accuracy: 0.5625


In [209]:
test_loss = 0
num_iter = 0
model.eval()
with torch.no_grad():
   correct = 0
   num_objs = 0

   for batch in tqdm(test_loader):
       labels = batch[1].to(device)
       age_classes = batch[2].to(device)
       mse_pred, ce_pred = model(batch[0].to(device))
       test_loss += mse(mse_pred, labels)
       correct += (age_classes == ce_pred.argmax(-1)).float().sum()
       num_objs += len(labels)
       num_iter += 1
          
test_acc = correct/num_objs
print(f"Test Loss: {test_loss / num_iter}, accuracy: {test_acc}")

  0%|          | 0/24 [00:00<?, ?it/s]

Test Loss: 115.09252166748047, accuracy: 0.48533332347869873


In [210]:
def preprocess_sample_inference(amplitudes, sr=16000, max_length=150, device='cpu'):

    spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=128, sample_rate=sr, f_min=1, f_max=8192)(amplitudes)[:, :max_length]
    spectrogram = np.pad(spectrogram, [[0, 0], [0, max(0, max_length - spectrogram.shape[1])]], mode='constant')

    return torch.tensor(spec_to_image(np.float32(spectrogram)), dtype=torch.float)

In [214]:
model.eval()

def predict(wavfile):
    waveform, _ = torchaudio.load(wavfile)

    input = preprocess_sample_inference(waveform.squeeze(0))
    with torch.no_grad():
        mse_pred, ce_pred = model(input.unsqueeze(0).to(device))
        mse_pred = mse_pred.cpu().detach().numpy()

    return ce_pred.cpu().detach().numpy().argmax(-1)

In [215]:
# Code for recording audio from the browser
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import IPython
import uuid
from google.colab import output


class InvokeButton(object):
  def __init__(self, title, callback):
    self._title = title
    self._callback = callback

  def _repr_html_(self):
    from google.colab import output
    callback_id = 'button-' + str(uuid.uuid4())
    output.register_callback(callback_id, self._callback)

    template = """<button id="{callback_id}" style="cursor:pointer;background-color:#EEEEEE;border-color:#E0E0E0;padding:5px 15px;font-size:14px">{title}</button>
        <script>
          document.querySelector("#{callback_id}").onclick = (e) => {{
            google.colab.kernel.invokeFunction('{callback_id}', [], {{}})
            e.preventDefault();
          }};
        </script>"""
    html = template.format(title=self._title, callback_id=callback_id)
    return html

RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3):
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open('audio.wav','wb+') as f:
    f.write(b)
  return 'audio.wav'

In [216]:
def classify():
  print("Now recording for 3 seconds, say what you will...")
  record()
  os.system('ffmpeg -i audio.wav -ar 16000 -y audio.wav')
  print(f"Audio recording complete, guess it is {predict('audio.wav')}")

InvokeButton('Start recording', classify)

Now recording for 3 seconds, say what you will...


<IPython.core.display.Javascript object>

Audio recording complete, guess it is [0]


  "At least one mel filterbank has all zero values. "


In [None]:
IPython.display.Audio('audio.wav')

#Задание 2
1. Обучите свой классификатор категории возраста
2. Попробуйте улучшить результат. Можно попробовать усложнить сетку, подвигать границы категорий, поискать новые данные, что угодно, кроме учиться на тесте :)
3. Какой подход оказался самым эффективным? Как думаете, почему?
4. Как считаете, где можно было бы применить такой классификатор в качестве вспомогательной задачи?


Удалось подобрать самое оптимальное разбиение классов:

      age <= 30: 0
      30 < age <= 50: 1
      age > 50: 2

* Модель пробовал усложнять с помощью BatchNorm, добавления дополнительных сверток, увеличения числа mels, углубления модели. Это было бесполезно, самым полезным оказалось обрабатывать 2 pooling'а - максимальный и средний. Также полезным оказалось добавление mse-лосса, думаю он помогает модели в спорных ситуациях, когда объект на границе 2 категорий. Взвешенный CrossEntropyLoss также дал прирост в accuracy.

* Аугментация не дала прироста. 

* Думаю, такой классификатор может быть полезен при определении предпочтений пользователя, при идентификации человека и определения некоторых заболеваний.

* Меня и моих друзей модель распознала верно :)

