# DeepSpeech with Kaldi features and an embedding vector

In [1]:
# Restart from here
# Run configuration for the whole notebook:
#   DEV and EPOCHS are forwarded to parameters.get_parameters() below;
#   DEBUG is handed to the DeepSpeech model and also read as a global by train().
DEV = True
EPOCHS = 3

DEBUG = True

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from warpctc_pytorch import CTCLoss
#torch.multiprocessing.set_start_method("spawn")

In [3]:
# autoreloads
%reload_ext autoreload
%autoreload 1
%aimport parameters

# Allows to load modules from parent directory
from time import time
import inspect, sys, os, json
from os.path import dirname, abspath
sys.path.append(dirname(dirname(abspath(inspect.getfile(inspect.currentframe())))))

from pathlib import Path
from os import makedirs
from collections import OrderedDict

from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data.data_loader import create_binarizer, get_accents_counts
from utils import count_parameters
from models.modules import MaskConv, SequenceWise, BatchRNN, InferenceBatchSoftmax, Lookahead, \
                    supported_rnns, supported_rnns_inv

from tensorboardX import SummaryWriter

import math

from torch.utils.data import DataLoader, Dataset
from decoder import GreedyDecoder, BeamCTCDecoder

In [4]:
# Experiment parameters; `param` is indexed with string keys (paths, labels, lr, ...) throughout the notebook.
param = parameters.get_parameters(dev=DEV, epochs=EPOCHS, us_en=False)

## Utilities

In [5]:
def val_cnts(list_):
    """Count the occurrences of each distinct value in ``list_``.

    Returns the pandas Series produced by ``Series.value_counts()``.
    """
    values = pd.Series(list_)
    return values.value_counts()

def extract_num(s):
    """Return only the digit characters of ``s``, in their original order."""
    return ''.join(filter(str.isdigit, s))

def ids_list(manifest):
    """Build one sample id per line of the manifest file.

    Each line is split on '/'. The id is the 4th component, a dash, and the
    6th component cut at its first '.'.
    """
    with open(manifest) as manifest_file:
        return [
            f'{parts[3]}-{parts[5].split(".")[0]}'
            for parts in (line.split('/') for line in manifest_file)
        ]

def make_accent_dict(manifest_path):
    """Read a manifest and assign an integer class to every accent.

    Every line must split on ',' into exactly three fields: wav path,
    transcript path and accent name. Class ids are handed out in order of
    first appearance (0, 1, 2, ...).

    Returns a pair ``(sample_to_class, class_to_accent)``: the first maps the
    digits of the wav path to the accent class id, the second maps each class
    id back to its accent name.
    """
    sample_to_class = {}
    accent_to_class = {}
    with open(manifest_path) as manifest_file:
        for line in manifest_file:
            wav_path, _txt_path, raw_accent = line.split(',')
            sample_key = extract_num(wav_path)
            # len() is evaluated before insertion, so a new accent gets the next free id.
            class_id = accent_to_class.setdefault(raw_accent.strip(), len(accent_to_class))
            sample_to_class[sample_key] = class_id
    class_to_accent = {class_id: accent for accent, class_id in accent_to_class.items()}
    return sample_to_class, class_to_accent

def tile(a, dim, n_tile):
    """Repeat each slice of ``a`` along ``dim`` ``n_tile`` times, keeping the copies adjacent.

    For example rows ``[r0, r1]`` tiled twice along dim 0 become
    ``[r0, r0, r1, r1]``.
    """
    length = a.size(dim)
    reps = [1] * a.dim()
    reps[dim] = n_tile
    repeated = a.repeat(*reps)
    # After repeat() the copies of slice k sit at k, k + length, k + 2 * length, ...;
    # gather them so that all copies of one slice end up next to each other.
    gather = np.concatenate([np.arange(n_tile) * length + offset for offset in range(length)])
    gather = torch.LongTensor(gather)
    if repeated.is_cuda:
        gather = gather.cuda()
    return torch.index_select(repeated, dim, gather)

## Data Loading

In [6]:
class KaldiDeepspeechDataset(Dataset):
    """Dataset of JSON feature files with accent ids, embeddings and transcripts.

    Intended to be used with a PyTorch DataLoader. Each item is a tuple
    ``(sample, accent_id, embedding, transcript)``; when ``ivectors_path`` is
    set, the i-vector tensor is appended as a fifth element. The field order is
    the one ``collate_fn`` unpacks.
    """

    def __init__(self, data_path, labels, sample_ids, transcripts_path,
                 accent_id_dict, embeddings_path, ivectors_path=None):
        """Store the data locations and build the character -> index map.

        :param data_path: directory holding one JSON feature file per sample id
        :param labels: sequence of characters; a character's position is its index
        :param sample_ids: list of sample ids, or path to a file with one id per line
        :param transcripts_path: prefix prepended to ``sample-<id>.txt`` (plain string
            concatenation, so it must end with a path separator)
        :param accent_id_dict: maps the digits of a sample id to an accent class id
        :param embeddings_path: prefix prepended to the sample number to locate the
            embedding saved with ``torch.save``
        :param ivectors_path: optional directory of JSON i-vector files
        """
        self.data_path = data_path
        self.ivectors_path = ivectors_path
        self.transcripts_path = transcripts_path
        self.embeddings_path = embeddings_path
        self.accent_id_dict = accent_id_dict
        self.labels_map = {label: index for index, label in enumerate(labels)}
        if isinstance(sample_ids, list):
            self._datafiles = sample_ids
        else:
            with open(sample_ids) as f:
                self._datafiles = [line.strip() for line in f]

    def __getitem__(self, index):
        """Load features, accent id, embedding, transcript (and i-vector) of one sample.

        :raises RuntimeError: if the embedding file cannot be loaded
        """
        file_idx = self._datafiles[index]
        with open(os.path.join(self.data_path, file_idx)) as f:
            sample = torch.FloatTensor(json.load(f))

        target = self.accent_id_dict[extract_num(file_idx)]

        # The part after the last '-' names both the transcript and the embedding file.
        s_id = file_idx.split('-')[-1]
        transcript = self.parse_transcript(f'{self.transcripts_path}sample-{s_id}.txt')

        embedding_file = f'{self.embeddings_path}{s_id}'
        try:
            # map_location keeps the tensor on CPU regardless of where it was saved.
            embedding = torch.load(embedding_file, map_location=lambda storage, loc: storage)
        except Exception as e:
            # Fail here with the offending file instead of an unbound `embedding` further down.
            raise RuntimeError(
                f'Could not load embedding {embedding_file!r} for sample {file_idx!r}') from e

        if self.ivectors_path is None:
            return sample, target, embedding, transcript

        with open(os.path.join(self.ivectors_path, file_idx)) as f:
            ivect = torch.FloatTensor(json.load(f))
        # Same field order as the 4-tuple above, with the i-vector appended.
        return sample, target, embedding, transcript, ivect

    def parse_transcript(self, transcript_path):
        """Read a transcript file and return its characters as a list of label indices."""
        with open(transcript_path, 'r', encoding='utf8') as transcript_file:
            transcript = transcript_file.read().replace('\n', '')
        # NOTE(review): filter(None, ...) drops every falsy entry, so characters missing
        # from labels_map (None) are removed, and so is label index 0 -- confirm that
        # discarding index 0 is intended.
        transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)]))
        return transcript

    def __len__(self):
        """Number of samples."""
        return len(self._datafiles)

In [7]:
def collate_fn(batch_tot):
    """Assemble a list of dataset samples into one padded batch, longest input first.

    It is intended to be used in a PyTorch DataLoader. Every sample is
    ``(input, accent, embedding, transcript)`` or the same with a trailing
    i-vector. Inputs are zero-padded to the longest one in the batch. The
    optional i-vectors and the per-sample embedding are concatenated to every
    frame along the feature dimension.

    :param batch_tot: non-empty list of sample tuples
    :return: ``(inputs, input_lens, targets, target_lens, accents)`` where
        ``targets`` is the flat concatenation of all transcripts in batch order
    :raises ValueError: if the batch is empty or the samples have neither 4 nor 5 fields
    """
    if len(batch_tot) == 0:
        raise ValueError('collate_fn received an empty batch')

    fields = list(zip(*batch_tot))
    ivect = None
    if len(fields) == 4:
        input_, acc, emb, trs = fields
    elif len(fields) == 5:
        input_, acc, emb, trs, ivect = fields
    else:
        raise ValueError(f'Expected samples with 4 or 5 fields, got {len(fields)}')

    input_lens = torch.tensor([len(r) for r in input_])
    target_lens = torch.tensor([len(t) for t in trs])
    acc = torch.tensor(acc)

    input_ = nn.utils.rnn.pad_sequence(input_, batch_first=True)

    if ivect is not None:
        ivect = nn.utils.rnn.pad_sequence(ivect, batch_first=True)
        # Each i-vector is repeated over 10 consecutive frames, then cut to the input length.
        ivect = tile(ivect, 1, 10)
        ivect = ivect[:, :input_.size(1), :]
        input_ = torch.cat([input_, ivect], dim=2)

    # Reorder everything by decreasing input length.
    __, idx = input_lens.sort(descending=True)

    # Transcripts have different lengths, so they are reordered as plain Python
    # lists; building a NumPy array from ragged lists raises on recent NumPy.
    targets = torch.tensor([t for i in idx.tolist() for t in trs[i]]).int()

    input_ = input_[idx]
    input_lens = input_lens[idx].int()
    target_lens = target_lens[idx].int()
    acc = acc[idx].int()

    # One embedding row per sample, broadcast over all frames (padded ones included).
    emb = torch.cat(emb)
    emb = emb[idx]
    emb = emb.view(emb.size(0), 1, emb.size(1))
    emb = emb.expand(-1, input_.size(1), -1)

    input_ = torch.cat([input_, emb], dim=2)

    return input_, input_lens, targets, target_lens, acc

class KaldiDeepspeechDataLoader(DataLoader):
    """DataLoader that always batches samples with this notebook's ``collate_fn``."""

    def __init__(self, *args, **kwargs):
        """Forward every argument to ``DataLoader``, then install ``collate_fn``."""
        super().__init__(*args, **kwargs)
        # Assigned after construction, so it replaces any collate function passed in.
        setattr(self, 'collate_fn', collate_fn)

In [8]:
# accent_id_dict: sample number -> accent class id; accent_dict: class id -> accent name
# (both built from the training manifest).
accent_id_dict, accent_dict = make_accent_dict(param['train_manifest'])

# Training set: features, transcripts and 256-d embeddings paths come from `param`;
# i-vectors are disabled (ivectors_path=None).
train_dataset = KaldiDeepspeechDataset(data_path=param['train_kaldi'],
                              labels=param['labels'],
                              sample_ids=ids_list(param['train_manifest']), 
                              transcripts_path=param['train_transcripts'],
                              embeddings_path=param['train_embeddings_256'],
                              accent_id_dict=accent_id_dict,
                              ivectors_path=None)

# Batches are loaded in the main process (num_workers=0) and reshuffled every epoch.
train_loader = KaldiDeepspeechDataLoader(train_dataset, 
                                shuffle=True, 
                                num_workers=0,#param['num_worker'],
                                batch_size=param['batch_size'])

# for data in train_loader:    
#     split_targets = []
#     offset = 0
#     for size in data[3]:
#         split_targets.append(data[2][offset:offset + size])
#         offset += size
#     target_strings = decoder.convert_to_strings(split_targets)
#     print('TARGETS', target_strings)
#     break

# for data in tqdm(train_loader):
#     pass

In [9]:
# NOTE(review): make_accent_dict numbers accents by first appearance in the manifest it
# reads, so the class ids in test_dict may not match those of the training manifest
# (accent_id_dict / accent_dict) -- confirm before comparing accent ids across the two sets.
test_dict, __ = make_accent_dict(param['test_manifest'])

# Test set, built like the training set but from the test_* entries of `param`.
test_dataset = KaldiDeepspeechDataset(data_path=param['test_kaldi'],
                              labels=param['labels'],
                              sample_ids=ids_list(param['test_manifest']), 
                              transcripts_path=param['test_transcripts'],
                              embeddings_path=param['test_embeddings_256'],
                              accent_id_dict=test_dict,
                              ivectors_path=None)

# Unlike the training loader, this one uses param['num_worker'] worker processes.
test_loader = KaldiDeepspeechDataLoader(test_dataset, 
                                shuffle=True, 
                                num_workers=param['num_worker'],
                                batch_size=param['batch_size'])

## Model definition

In [10]:
class DeepSpeech(nn.Module):
    """Two masked 2-D convolutions, a stack of BatchRNN layers and a per-frame linear classifier.

    ``forward`` returns per-frame class scores together with the sequence
    lengths after the convolutions.
    """

    def __init__(self,
                 rnn_type=nn.LSTM,
                 labels="abc",
                 rnn_hidden_size=768,
                 nb_layers=5,
                 audio_conf=None,
                 bidirectional=True,
                 DEBUG=False):
        """Build the network.

        :param rnn_type: recurrent layer class handed to every BatchRNN
        :param labels: output alphabet; its length is the number of output classes
        :param rnn_hidden_size: hidden size of every recurrent layer
        :param nb_layers: total number of recurrent layers
        :param audio_conf: optional dict stored as model metadata
        :param bidirectional: whether the recurrent layers are bidirectional
        :param DEBUG: if true, tensor sizes are printed during the first forward pass
        """
        super().__init__()

        # model metadata needed for serialization/deserialization
        self._DEBUG = DEBUG
        self._version = '0.0.1'
        self._hidden_size = rnn_hidden_size
        self._nb_layers = nb_layers
        self._rnn_type = rnn_type
        self._audio_conf = audio_conf or {}
        self._labels = labels
        self._bidirectional = bidirectional

        num_classes = len(self._labels)

        self.conv = MaskConv(nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        ))

        # 32 channels x 35 rows: what the two convolutions leave of a 140-wide input
        # (140 -> 70 -> 35). A different input width needs a different value here.
        rnn_input_size = 1120

        rnns = []
        # Only the first recurrent layer is built without batch norm.
        rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
                       bidirectional=bidirectional, batch_norm=False)
        rnns.append(('0', rnn))
        for x in range(nb_layers - 1):
            rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
                           bidirectional=bidirectional)
            rnns.append(('%d' % (x + 1), rnn))
        self.rnns = nn.Sequential(OrderedDict(rnns))

        fully_connected = nn.Sequential(
            nn.BatchNorm1d(rnn_hidden_size),
            nn.Linear(rnn_hidden_size, num_classes, bias=False)
        )
        self.fc = nn.Sequential(
            SequenceWise(fully_connected),
        )
        self.inference_softmax = InferenceBatchSoftmax()

    def forward(self, x, lengths):
        """Run the network on a padded batch.

        :param x: 3-D batch; the recorded run fed (batch, frames, features) = (20, 802, 140)
        :param lengths: 1-D tensor with the unpadded length of every sequence
        :return: ``(scores, output_lengths)``; in the recorded run the scores have
            shape (401, 20, 29), i.e. (frames after convolution, batch, classes)
        """
        if self._DEBUG:
            print('input', x.size())

        lengths = lengths.cpu().int()
        output_lengths = self.get_seq_lens(lengths)

        # Add a channel dimension and swap the last two axes before the convolutions.
        x = x.view(x.size(0), 1, x.size(1), x.size(2))
        x = x.transpose(2, 3)
        if self._DEBUG:
            print('after view transpose', x.size())

        x, _ = self.conv(x, output_lengths)
        if self._DEBUG:
            print('after conv', x.size())

        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH
        if self._DEBUG:
            print('after view transpose', x.size())

        for rnn in self.rnns:
            x = rnn(x, output_lengths)
        if self._DEBUG:
            print('after rnn', x.size())

        x = self.fc(x)
        if self._DEBUG:
            print('after fc', x.size())

        x = x.transpose(0, 1)
        if self._DEBUG:
            print('after transpose', x.size())
        # identity in training mode, softmax in eval mode
        x = self.inference_softmax(x)
        if self._DEBUG:
            print('after softmax', x.size())

        x = x.transpose(0, 1)
        if self._DEBUG:
            print('after transpose', x.size())

        # Debug sizes are printed for the first forward pass only.
        self._DEBUG = False
        return x, output_lengths

    def get_seq_lens(self, input_length):
        """
        Given a 1D Tensor or Variable containing integer sequence lengths, return a 1D tensor or variable
        containing the size sequences that will be output by the network.
        :param input_length: 1D Tensor
        :return: 1D Tensor scaled by model
        """
        seq_len = input_length
        for m in self.conv.modules():
            if isinstance(m, nn.Conv2d):
                # Convolution output-length formula along the second kernel axis; floor
                # division keeps the lengths integral (plain `/` is true division on
                # integer tensors in current PyTorch).
                seq_len = (seq_len + 2 * m.padding[1] - m.dilation[1] * (m.kernel_size[1] - 1) - 1) // m.stride[1] + 1
        return seq_len.int()

    @staticmethod
    def is_parallel(model):
        """Return True when ``model`` is wrapped in ``nn.DataParallel``."""
        return isinstance(model, nn.DataParallel)

    @staticmethod
    def get_labels(model):
        """Return the label alphabet of ``model``, unwrapping ``nn.DataParallel`` if needed."""
        return model.module._labels if DeepSpeech.is_parallel(model) else model._labels

    @staticmethod
    def get_param_size(model):
        """Return the total number of scalar parameters of ``model``."""
        return sum(p.numel() for p in model.parameters())

    @staticmethod
    def get_audio_conf(model):
        """Return the audio configuration of ``model``, unwrapping ``nn.DataParallel`` if needed."""
        return model.module._audio_conf if DeepSpeech.is_parallel(model) else model._audio_conf

## Optimizer

In [11]:
# Build the model from the values in `param`; DEBUG makes it print tensor sizes once.
model = DeepSpeech(rnn_type=param['rnn_type'], 
                labels=param['labels'], 
                rnn_hidden_size=param['rnn_hidden_size'], 
                nb_layers=param['num_layers'], #audio_conf=audio_conf,
                bidirectional=True,
                DEBUG=DEBUG,)

if param['cuda']:
    model.cuda()

# `criterion`, `decoder` and `target_decoder` are read as globals by train() and check_wer().
criterion = CTCLoss()

# Uses the first entry of param['lr'] as the learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'][0])

# Beam-search decoder (with the language model at param['lm_path']) for model output;
# the greedy decoder is only used to turn reference label indices back into strings.
decoder = BeamCTCDecoder(param['labels'], lm_path=param['lm_path'],
                        alpha=0.8, beta=1.,
                        cutoff_top_n=40, cutoff_prob=1.0,
                        beam_width=100, num_processes=param['num_worker'])
target_decoder = GreedyDecoder(param['labels'])

print(model)
print('Model parameters counts:', count_parameters(model))

DeepSpeech(
  (conv): MaskConv(
    (seq_module): Sequential(
      (0): Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Hardtanh(min_val=0, max_val=20, inplace)
      (3): Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5))
      (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): Hardtanh(min_val=0, max_val=20, inplace)
    )
  )
  (rnns): Sequential(
    (0): BatchRNN(
      (rnn): GRU(1120, 800, bidirectional=True)
    )
    (1): BatchRNN(
      (batch_norm): SequenceWise (
      BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
      (rnn): GRU(800, 800, bidirectional=True)
    )
    (2): BatchRNN(
      (batch_norm): SequenceWise (
      BatchNorm1d(800, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
      (rnn): GRU(800, 800, bidirectional=True)
    

## Training

In [12]:
def check_wer(targets, targets_len, out, output_len):
    """Return the mean per-utterance word error rate of a batch, in percent.

    Uses the module-level ``decoder`` for the model output and
    ``target_decoder`` to turn the reference label indices into strings.

    :param targets: flat 1-D sequence holding all reference label indices of the batch
    :param targets_len: number of labels of each utterance, in batch order
    :param out: model output; it is transposed (0, 1) before decoding
    :param output_len: output length of each utterance
    :return: average over the batch of (word errors / reference word count) * 100
    """
    # Cut the flat target vector back into one slice per utterance.
    split_targets = []
    offset = 0
    for size in targets_len:
        split_targets.append(targets[offset:offset + size])
        offset += size

    decoded_output, _ = decoder.decode(out.data.transpose(0, 1), output_len)
    target_strings = target_decoder.convert_to_strings(split_targets)

    wer = 0
    for x, references in enumerate(target_strings):
        # Only the first hypothesis of each utterance is scored.
        transcript, reference = decoded_output[x][0], references[0]
        # A reference without words would divide by zero; count its errors against one word.
        n_words = max(len(reference.split()), 1)
        wer += decoder.wer(transcript, reference) / float(n_words)
    wer /= len(target_strings)
    return wer * 100

In [13]:
def train(epochs, 
          model, 
          train_loader, 
          test_loader, 
          optimizer, 
          silent=True,
          cnt=0,
          exp_name='__tmp__'):
    """Train ``model`` with the global CTC ``criterion`` and validate after every epoch.

    Relies on the notebook globals ``param`` (TensorBoard directory),
    ``criterion``, ``DEBUG`` and ``check_wer``.

    :param epochs: number of epochs, or a parameter mapping with an ``'epochs'``
        entry (the form this notebook has been passing)
    :param model: network returning ``(output, output_lengths)``
    :param train_loader: loader yielding ``(inputs, input_lens, targets, target_lens, accents)``
    :param test_loader: loader with the same batch layout, used for validation
    :param optimizer: optimizer stepping the parameters of ``model``
    :param silent: if false, print the loss of every training iteration
    :param cnt: suffix of the checkpoint file names
    :param exp_name: sub-directory of ``param['tensorboard_dir']`` for the logs
    :return: ``(model, val_loss, wer)`` where ``wer`` is the lowest validation WER
        seen and ``val_loss`` the validation loss of that same epoch
    """
    import gc

    n_epochs = epochs if isinstance(epochs, int) else epochs['epochs']

    # Tensorboard
    tb_path = Path(param['tensorboard_dir']) / exp_name
    makedirs(tb_path, exist_ok=True)
    tb_writer = SummaryWriter(tb_path)

    # Checkpoints are written below this directory.
    makedirs('saved', exist_ok=True)

    # Feed the inputs to whatever device the model lives on.
    device = next(model.parameters()).device

    prev_epoch_val_loss = math.inf
    prev_epoch_wer = math.inf

    try:
        ## Train
        for epoch in range(1, n_epochs + 1):
            gc.collect()
            print('')
            print(f'## EPOCH {epoch} ##')
            print(f'Training:')
            model.train()

            # train
            epoch_losses = []
            for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
                inputs, inputs_len, targets, targets_len, _target_accents = data

                # Forward pass; lengths and targets stay on the CPU, where the loss is computed.
                out, output_len = model(inputs.to(device), inputs_len)
                out = out.cpu()

                if DEBUG:
                    print('## Outputs train')
                    print('out', out.size())
                    print('targets', targets.size())
                    print('output_len', output_len.size())
                    print('targets_len', targets_len.size())

                loss = criterion(out, targets, output_len, targets_len)
                # Keep a plain float so the autograd graph of every batch can be freed.
                epoch_losses.append(loss.item())

                if not silent:
                    print(f'Iteration {i+1}/{len(train_loader):<4}loss: {loss.item():0.3f}')

                # Gradient
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            epoch_loss = sum(epoch_losses) / len(train_loader)
            tb_writer.add_scalar('stats/train_loss', epoch_loss, epoch)
            print(f'Epoch {epoch} average loss: {epoch_loss:0.3f}')

            # validate
            print(f'Testing:')
            model.eval()
            epoch_val_losses = []
            epoch_wers = []
            with torch.no_grad():
                for data in tqdm(test_loader, total=len(test_loader)):
                    inputs, inputs_len, targets, targets_len, _target_accents = data

                    out, output_len = model(inputs.to(device), inputs_len)
                    out = out.cpu()

                    val_loss = criterion(out, targets, output_len, targets_len)

                    if DEBUG:
                        print('val loss', val_loss)

                    epoch_val_losses.append(val_loss.item())
                    epoch_wers.append(check_wer(targets, targets_len, out, output_len))

            epoch_val_loss = sum(epoch_val_losses) / len(epoch_val_losses)
            epoch_wer = sum(epoch_wers) / len(epoch_wers)

            tb_writer.add_scalar('stats/val_loss', epoch_val_loss, epoch)
            print(f'Average validation loss: {epoch_val_loss:0.3f}')

            tb_writer.add_scalar('stats/wer', epoch_wer, epoch)
            print(f'Average wer: {epoch_wer:0.3f}%')

            if epoch_wer < prev_epoch_wer:
                print('New best model found.')
                prev_epoch_wer = epoch_wer
                prev_epoch_val_loss = epoch_val_loss

                # state_dict() must be called: saving the bound method pickles the whole model.
                torch.save(model.state_dict(), f'saved/vac05-ntbk_sd_{cnt}.pt')
                torch.save(model, f'saved/vac05-ntbk_fm_{cnt}.pt')
    finally:
        tb_writer.close()

    return model, prev_epoch_val_loss, prev_epoch_wer

In [14]:
# Grid of configurations to train; one experiment per (rnn_type, rnn_hidden_size) pair.
settings = {'rnn_type': [nn.GRU],
            'rnn_hidden_size': [800],}
i = 0
for _rnn_type in settings['rnn_type']:
    for _rnn_hidden_size in settings['rnn_hidden_size']:
        # Use the class name: formatting the class itself puts "<class '...'>" into the
        # name, which is then used as a TensorBoard directory.
        exp_name = f'NEW-Embeddings_{_rnn_type.__name__}_hidden-{_rnn_hidden_size}_{time()}'
        i += 1
        model = DeepSpeech(rnn_type=_rnn_type, 
                        labels=param['labels'], 
                        rnn_hidden_size=_rnn_hidden_size, 
                        nb_layers=param['num_layers'], #audio_conf=audio_conf,
                        bidirectional=True,
                        DEBUG=DEBUG,)

        if param['cuda']:
            model.cuda()

        optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'][0])

        print()
        print(f'{"":#<13}')
        print(exp_name)
        print(f'{"":#<13}')

        # train() takes the number of epochs as its first argument; `i` names the checkpoint files.
        model, val_loss, wer = train(param['epochs'], 
                                model,
                                train_loader, 
                                test_loader, optimizer, 
                                cnt=i,
                                exp_name=exp_name)
        



#############
NEW-Embeddings_<class 'torch.nn.modules.rnn.GRU'>_hidden-800_1550754461.4420073
#############

## EPOCH 1 ##
Training:


HBox(children=(IntProgress(value=0, max=58), HTML(value='')))

input torch.Size([20, 802, 140])
after view transpose torch.Size([20, 1, 140, 802])
after conv torch.Size([20, 32, 35, 401])
after view transpose torch.Size([401, 20, 1120])
after rnn torch.Size([401, 20, 800])
after fc torch.Size([401, 20, 29])
after transpose torch.Size([20, 401, 29])
after softmax torch.Size([20, 401, 29])
after transpose torch.Size([401, 20, 29])
## Outputs train
out torch.Size([401, 20, 29])
targets torch.Size([1145])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([359, 20, 29])
targets torch.Size([893])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([323, 20, 29])
targets torch.Size([818])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([342, 20, 29])
targets torch.Size([1078])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([316, 20, 29])
targets torch.Size([973])
output_len torch.Size([20])
targets

HBox(children=(IntProgress(value=0, max=57), HTML(value='')))

val loss tensor([9251.6553])
val loss tensor([10398.5449])
val loss tensor([7921.9023])
val loss tensor([8488.9023])
val loss tensor([8875.7080])
val loss tensor([9150.7471])
val loss tensor([8625.9453])
val loss tensor([8629.1025])
val loss tensor([7481.4575])
val loss tensor([9340.7461])
val loss tensor([10260.9707])
val loss tensor([8796.2314])
val loss tensor([9223.7275])
val loss tensor([8891.9434])
val loss tensor([8810.9385])
val loss tensor([9466.3682])
val loss tensor([7946.8330])
val loss tensor([8669.7842])
val loss tensor([9182.7432])
val loss tensor([10589.4014])
val loss tensor([9414.7119])
val loss tensor([9544.4336])
val loss tensor([7565.1987])
val loss tensor([8215.7236])
val loss tensor([7410.2422])
val loss tensor([8056.8535])
val loss tensor([10706.9346])
val loss tensor([8810.3184])
val loss tensor([9367.6289])
val loss tensor([9232.0752])
val loss tensor([10191.5840])
val loss tensor([10104.2773])
val loss tensor([8242.9902])
val loss tensor([8539.8525])
val loss

  "type " + obj.__name__ + ". It won't be checked "



## EPOCH 2 ##
Training:


HBox(children=(IntProgress(value=0, max=58), HTML(value='')))

## Outputs train
out torch.Size([476, 20, 29])
targets torch.Size([1130])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([366, 20, 29])
targets torch.Size([1089])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([353, 20, 29])
targets torch.Size([1024])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([327, 20, 29])
targets torch.Size([1059])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([350, 20, 29])
targets torch.Size([909])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([419, 20, 29])
targets torch.Size([957])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([298, 20, 29])
targets torch.Size([882])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([333, 20, 29])
targets torch.Size([980])
output_len to

HBox(children=(IntProgress(value=0, max=57), HTML(value='')))

val loss tensor([8244.7842])
val loss tensor([10126.4688])
val loss tensor([8612.9512])
val loss tensor([10285.3887])
val loss tensor([9904.1318])
val loss tensor([9013.9658])
val loss tensor([8580.0664])
val loss tensor([9090.2520])
val loss tensor([7033.3867])
val loss tensor([9030.8906])
val loss tensor([9058.4219])
val loss tensor([8893.6377])
val loss tensor([8632.1523])
val loss tensor([9035.9697])
val loss tensor([8973.3574])
val loss tensor([10336.5283])
val loss tensor([8635.1924])
val loss tensor([9545.7158])
val loss tensor([8772.7812])
val loss tensor([9507.0801])
val loss tensor([9028.8330])
val loss tensor([8685.9385])
val loss tensor([9972.3682])
val loss tensor([8371.1143])
val loss tensor([8472.6318])
val loss tensor([9561.3438])
val loss tensor([8300.1953])
val loss tensor([8756.1543])
val loss tensor([8799.3271])
val loss tensor([9480.7646])
val loss tensor([10936.5439])
val loss tensor([8879.8428])
val loss tensor([8709.4971])
val loss tensor([8887.0879])
val loss t

HBox(children=(IntProgress(value=0, max=58), HTML(value='')))

## Outputs train
out torch.Size([369, 20, 29])
targets torch.Size([909])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([286, 20, 29])
targets torch.Size([920])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([316, 20, 29])
targets torch.Size([1061])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([357, 20, 29])
targets torch.Size([1054])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([387, 20, 29])
targets torch.Size([1126])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([328, 20, 29])
targets torch.Size([823])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([387, 20, 29])
targets torch.Size([985])
output_len torch.Size([20])
targets_len torch.Size([20])
## Outputs train
out torch.Size([279, 20, 29])
targets torch.Size([652])
output_len tor

HBox(children=(IntProgress(value=0, max=57), HTML(value='')))

val loss tensor([10427.1611])
val loss tensor([7871.2642])
val loss tensor([8618.6748])
val loss tensor([8771.4727])
val loss tensor([9047.2969])
val loss tensor([9007.6074])
val loss tensor([8250.8740])
val loss tensor([8449.5234])
val loss tensor([10042.9766])
val loss tensor([8764.8389])
val loss tensor([7537.1787])
val loss tensor([8023.8892])
val loss tensor([9108.5947])
val loss tensor([8168.7554])
val loss tensor([10554.2861])
val loss tensor([9269.7617])
val loss tensor([9446.2246])
val loss tensor([7896.7500])
val loss tensor([9291.1094])
val loss tensor([8663.5605])
val loss tensor([8988.1650])
val loss tensor([9210.6064])
val loss tensor([8521.4316])
val loss tensor([9507.9697])
val loss tensor([9170.6670])
val loss tensor([8330.6826])
val loss tensor([8386.0508])
val loss tensor([8735.6631])
val loss tensor([10634.6016])
val loss tensor([10114.1318])
val loss tensor([9038.6113])
val loss tensor([8662.1260])
val loss tensor([8626.0928])
val loss tensor([8911.4561])
val loss 

## Tests

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix with one annotated cell per (true, predicted) pair.

    :param cm: square confusion matrix, rows = true labels, columns = predicted labels
    :param classes: class names used as tick labels, in matrix order
    :param normalize: if true, every row is divided by its sum before plotting, so the
        cells show the fraction of each true class; rows that sum to zero stay zero
    :param title: figure title
    :param cmap: matplotlib colormap of the cells
    """
    cm = np.asarray(cm)
    classes = list(classes)
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # `where` leaves empty rows at the zeros of `out` instead of dividing by zero.
        cm = np.divide(cm, row_sums, out=np.zeros(cm.shape, dtype=float), where=row_sums != 0)

    np.set_printoptions(precision=2)
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

In [None]:
# Collect true and predicted accent labels over the test set and build a confusion matrix.
# NOTE(review): `out` is the first value returned by DeepSpeech.forward, i.e. per-frame
# scores over the character labels, so the argmax below does not obviously yield accent
# ids -- this cell looks carried over from an accent-classification notebook; confirm
# before relying on it. It also calls np.argmax on the model output directly.
loader = test_loader
best_model= model
best_model.eval()

y_true, y_pred = [], []
with torch.no_grad():
    for data in tqdm(loader, total=len(loader)):
        inputs, lens, targets, target_lens, target_accents = data
        inputs = inputs.cuda()
        target_accents = target_accents.cuda()
        
        out, __ = best_model(inputs, lens)
        
        y_true.extend(target_accents)
        y_pred.append(np.argmax(out, axis=1))
        
    y_pred = torch.cat(y_pred)
            
    # accent_dict maps class id -> accent name (built from the training manifest).
    y_true_labels = [accent_dict[int(i)] for i in y_true]
    y_pred_labels = [accent_dict[int(i)] for i in y_pred]

cnf_mat = confusion_matrix(y_true_labels, y_pred_labels, labels=list(accent_dict.values()))

In [None]:
# Plot the confusion matrix `cnf_mat`, labelling the axes with the accent names.
plot_confusion_matrix(cnf_mat, classes=accent_dict.values(), normalize=False)

In [None]:
from sklearn import metrics
# Text report comparing the true and predicted accent labels collected earlier.
print(metrics.classification_report(y_true_labels, y_pred_labels))

## Clustering

In [None]:
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.manifold import TSNE

def plot_pca(X, y, _dict, projection='PCA', graph_title=''):
    """Scatter-plot a 2-component projection of ``X``, one colour per label.

    ``projection`` selects the reducer: 'PCA' or 'TSNE'; any other value raises
    ValueError. A separate scatter series is drawn for every value of ``_dict``,
    using the rows of the projection where ``y`` equals that value.
    """
    if projection not in ('PCA', 'TSNE'):
        raise ValueError(f'Projection {projection} unkown.')

    if projection == 'PCA':
        reducer = sklearnPCA(n_components=2)
    else:
        reducer = TSNE(n_components=2)
    Y = reducer.fit_transform(X)

    plt.figure(figsize=(6, 4))
    for lab in _dict.values():
        mask = y == lab
        plt.scatter(Y[mask, 0], Y[mask, 1], label=lab)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.title(f'{projection}: {graph_title}')
    plt.show()

In [None]:
# Project model outputs of the test set to 2-D and plot them per accent.
# NOTE(review): this cell does not match the rest of the notebook as visible here:
#   - `best_models` is not defined anywhere in the visible cells;
#   - each batch is unpacked into 3 values, while collate_fn returns 5;
#   - the loop puts `model` in eval mode but then calls `best_model`.
# Confirm and adapt before running.
loader = test_loader
projection = 'PCA'

for model_name, (model, val_loss) in best_models.items():
    datapoints = []
    targets = []
    
    model.eval()
    with torch.no_grad():
        for data in tqdm(loader, total=len(loader)):
            
            inputs, target_accents, lens = data
            inputs = inputs.cuda()
            __, bn = best_model(inputs, lens)

            datapoints.append(bn)
            targets.append(target_accents)
            
    datapoints = torch.cat(datapoints)
    targets = torch.cat(targets)
    
    X = np.asarray(datapoints)
    # Map class ids to accent names so they can be compared with the values of accent_dict.
    y = np.asarray([accent_dict[t.item()] for t in targets])
    
    plot_pca(X, y, accent_dict, projection=projection, graph_title=model_name)