# Leveraging native language information

This notebook tries to implement a variation of the following paper: [Leveraging Native Language Information for Improved Accented Speech Recognition](https://www.researchgate.net/publication/327388866_Leveraging_Native_Language_Information_for_Improved_Accented_Speech_Recognition).

We use a slightly different architecture based on [Sean Naren's Deepspeech implementation](https://github.com/SeanNaren/deepspeech.pytorch).

The data splits used for testing are the same as in this paper: [Improved Accented Speech Recognition
Using Accent Embeddings and Multi-task Learning](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1864.pdf).

The precise splits can be found [here](https://sites.google.com/view/accentsunearthed-dhvani/home).

## Imports

In [108]:
# Allows loading modules from the parent directory
import inspect, sys
from os.path import dirname, abspath
sys.path.append(dirname(dirname(abspath(inspect.getfile(inspect.currentframe())))))

# Imports
import argparse
import json
import os
import time

from utils import Timer, AverageMeter, restricted_float
from itertools import product as cross_iter

import numpy as np

import torch
import torch.distributed as dist
from torch.autograd import Variable
from tqdm import tqdm
from torch.nn.modules import CrossEntropyLoss
from warpctc_pytorch import CTCLoss

from data.data_loader import create_binarizer

from data.data_loader import AudioDataLoader, SpectrogramAccentDataset, BucketingSampler, DistributedBucketingSampler
from data.utils import reduce_tensor
from decoder import GreedyDecoder
from model import DeepSpeech, supported_rnns
from multitask_model import MtAccent
from multitask_loss import MtLoss

# autoreloads
%load_ext autoreload
%autoreload 1
%aimport parameters

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Utilities

In [137]:
# Configure experiments by directly changing the values in parameters.py
# `parameters` is the project module registered for autoreload with
# `%aimport parameters` in the imports cell.
# NOTE(review): the cells below read their settings from `args`, not from
# `param`/`config` -- confirm how these two objects are meant to be wired in.
param = parameters.get_parameters()
config = parameters.get_config()

def to_np(x):
    """Return the values held by ``x`` as a NumPy array.

    ``x`` is expected to expose ``.data`` (presumably a torch tensor or
    Variable); its data is copied to the CPU before the conversion.
    """
    host_data = x.data.cpu()
    return host_data.numpy()

## Data loading

## Model initialization

In [138]:
# Placeholder: the model is not built yet, so any later use of `model`
# (e.g. `model.parameters()`) operates on None until this TODO is resolved.
# The commented-out constructor call below depends on `accent_binarizer`,
# `labels`, `audio_conf`, `rnn_type` and `side_rnn_type`, which are only
# assigned in the following cell, and on `args`.
model = None # TODO
# MtAccent(accents_size=len(accent_binarizer.classes_),
#                 bottleneck_size=args.bottleneck_size,
#                 rnn_hidden_size=args.hidden_size,
#                 nb_layers=args.hidden_layers,
#                 labels=labels,
#                 rnn_type=supported_rnns[rnn_type],
#                 audio_conf=audio_conf,
#                 bidirectional=args.bidirectional,
#                 side_nb_layers=args.side_hidden_layers,
#                 side_rnn_hidden_size=args.side_hidden_size,
#                 side_rnn_type=supported_rnns[side_rnn_type],
#                 nb_shared_layers=args.shared_layers)

In [114]:




# Accent label encoder built from the training manifest (presumably exposes
# `classes_`, as the model constructor expects -- TODO confirm).
accent_binarizer = create_binarizer(args.train_manifest)

# Running loss totals and the epoch / iteration the training loop starts from.
avg_loss, start_epoch, start_iter = 0, 0, 0
avg_main_loss, avg_side_loss = 0, 0

# The labels file is a JSON list that is joined into a single string.
with open(args.labels_path) as label_file:
    labels = str(''.join(json.load(label_file)))

# Audio front-end settings handed to the datasets.
audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size,
                  window_stride=args.window_stride,
                  window=args.window,
                  noise_dir=args.noise_dir,
                  noise_prob=args.noise_prob,
                  noise_levels=(args.noise_min, args.noise_max))

rnn_type = args.rnn_type.lower()
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

# `side_rnn_type` is only defined when a side RNN type was requested.
if args.side_rnn_type is not None:
    side_rnn_type = args.side_rnn_type.lower()
    assert side_rnn_type in supported_rnns, "side_rnn_type should be either lstm, rnn or gru"

# Deliberately NOT named `parameters`: that name is the autoreloaded project
# module (`%aimport parameters`) and rebinding it would break the
# `parameters.get_parameters()` / `parameters.get_config()` calls on re-run.
model_params = model.parameters()
if args.optimizer == "adam":
    optimizer = torch.optim.Adam(model_params, lr=args.lr)
elif args.optimizer == "sgd":
    optimizer = torch.optim.SGD(model_params, lr=args.lr,
                                momentum=args.momentum, nesterov=True)
else:
    # Fail here rather than with a NameError on `optimizer` during training.
    raise ValueError("optimizer should be either adam or sgd, got %r" % (args.optimizer,))

if args.model == 'deepspeech':
    criterion = CTCLoss()
elif args.model == 'mtaccent':
    # Multi-task loss combining CTC (transcription) and cross-entropy (accent).
    criterion = MtLoss(CTCLoss(), CrossEntropyLoss(), mixing_coef=args.mixing_coef)
else:
    # Fail here rather than with a NameError on `criterion` during training.
    raise ValueError("model should be either deepspeech or mtaccent, got %r" % (args.model,))

decoder = GreedyDecoder(labels)


use_kaldi_features = False

# Keyword arguments that the training and validation datasets have in common.
shared_dataset_kwargs = dict(audio_conf=audio_conf,
                             labels=labels,
                             normalize=True,
                             accent_binarizer=accent_binarizer,
                             kaldi=use_kaldi_features)

# Training set: augmentation follows the experiment settings.
train_dataset = SpectrogramAccentDataset(manifest_filepath=args.train_manifest,
                                         augment=args.augment,
                                         **shared_dataset_kwargs)

# Validation set: never augmented.
test_dataset = SpectrogramAccentDataset(manifest_filepath=args.val_manifest,
                                        augment=False,
                                        **shared_dataset_kwargs)

# Pick the batch sampler matching the (non-)distributed setup.
if args.distributed:
    train_sampler = DistributedBucketingSampler(train_dataset,
                                                batch_size=args.batch_size,
                                                num_replicas=args.world_size,
                                                rank=args.rank)
else:
    train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)

# The training loader draws its batches from the sampler; the validation
# loader is given a plain batch size instead.
train_loader = AudioDataLoader(train_dataset,
                               batch_sampler=train_sampler,
                               num_workers=args.num_workers)
test_loader = AudioDataLoader(test_dataset,
                              num_workers=args.num_workers,
                              batch_size=args.batch_size)


# Shuffle the training batches up front when resuming past epoch 0 with
# shuffling enabled, or whenever `no_sorta_grad` is set.
if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
    print("Shuffling batches for the following epochs")
    train_sampler.shuffle(start_epoch)

# Move the model to the GPU and, for distributed runs, wrap it for
# data-parallel training.
if args.cuda:
    model.cuda()
    if args.distributed:
        # NOTE(review): `device_ids` is None when `args.rank` is falsy
        # (including rank 0) -- confirm this is intended.
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=(int(args.gpu_rank),) if args.rank else None)

print(model)
print("Number of parameters: %d" % type(model).get_param_size(model))

# Best-effort export of the model graph to tensorboard; any failure is only
# printed. `main_proc` and `tensorboard_writer` are not defined in the
# visible cells and must come from elsewhere.
if args.tensorboard and main_proc: # TODO empty scope name problem
    try:
        dummy_inputs = torch.rand(20, 1, 161, 10) # TODO dynamically change size
        if args.cuda:
             dummy_inputs = dummy_inputs.cuda()
        dummy_size = torch.rand(20)
        tensorboard_writer.add_graph(model, (dummy_inputs, dummy_size), verbose=True)
    except Exception as e:
        print("Exception while creating tensorboard graph:")
        print(e)

# Running meters used by the per-batch progress line of the training loop.
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()

#t.print_report()
## TRAIN ##
# One iteration of the outer loop = one epoch: train on `train_loader`,
# validate on `test_loader`, record the metrics, optionally checkpoint, then
# reset the running loss totals and reshuffle the batches.
# `t`, `main_proc`, `save_folder`, `best_wer` and the `*_results` containers
# are not defined in the visible cells and must come from elsewhere.
t.add('starts epochs')
# Scale factor for the logged losses; fixed after the first finished epoch so
# that this epoch's average loss is recorded as 100.
display_scaling_coef = None
for epoch in range(start_epoch, args.epochs):
    t.add(f'begin epoch {epoch}')
    model.train()
    end = time.time()
    start_epoch_time = time.time()
    for i, (data) in enumerate(train_loader, start=start_iter):
        if i == len(train_sampler):
            break
        inputs, targets, input_percentages, target_sizes, target_accents = data
        # Turn the per-utterance length fractions into absolute sizes along
        # dimension 3 of `inputs` (presumably the time axis).
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        # measure data loading time
        data_time.update(time.time() - end)

        if args.cuda:
            inputs = inputs.cuda()
        t.add(f'epoch {epoch}, batch {i} forward pass')

        if args.model == 'deepspeech':
            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            loss = criterion(out, targets, output_sizes, target_sizes)
            # No sub-losses for the single-task model; use zero placeholders.
            main_loss, side_loss = torch.tensor(0), torch.tensor(0)
        elif args.model == 'mtaccent':
            # From the second epoch on, tell the criterion to stop updating
            # its coefficients.
            if epoch != 0:
                criterion.toggle_update_coefs(new_value=False)

            out, output_sizes, side_out = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            # Index of the largest entry of each accent row (presumably
            # one-hot targets turned into class indices).
            target_accents = np.argmax(target_accents, axis=1) # TODO check if this could be done elsewhere…
            loss = criterion((out, targets, output_sizes, target_sizes), (side_out.cpu(), target_accents))
            main_loss, side_loss = criterion.get_sublosses()


        loss = loss / inputs.size(0)  # average the loss by minibatch
        main_loss = main_loss / inputs.size(0)
        side_loss = side_loss / inputs.size(0)

        if args.distributed:
            loss_value = reduce_tensor(loss, args.world_size)[0]
            main_loss_value = reduce_tensor(main_loss, args.world_size)[0]
            side_loss_value = reduce_tensor(side_loss, args.world_size)[0]
        else:
            loss_value = loss.item()
            main_loss_value = main_loss.item()
            side_loss_value = side_loss.item()

        # Infinite losses are excluded from the running statistics only; the
        # backward pass below still uses the original `loss` tensor.
        inf = float("inf")
        if loss_value == inf or loss_value == -inf:
            print("WARNING: received an inf loss, setting loss value to 0")
            loss_value = 0
        if main_loss_value == inf or main_loss_value == -inf:
            print("WARNING: received an inf main_loss, setting loss value to 0")
            main_loss_value = 0
        if side_loss_value == inf or side_loss_value == -inf:
            print("WARNING: received an inf side_loss, setting side_loss value to 0")
            side_loss_value = 0
        t.add(f'epoch {epoch} backward pass')

        avg_loss += loss_value
        avg_main_loss += main_loss_value
        avg_side_loss += side_loss_value
        losses.update(loss_value, inputs.size(0))

        # compute gradient
        optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        # Optimizer step
        optimizer.step()
        t.add('starts computing stuff to print')

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if not args.silent:
            sub_losses = criterion.print_sublosses() if args.model == 'mtaccent' else 'n/a'
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  '(Sub-losses: {sub_losses})\t'.format(
                (epoch + 1), (i + 1), len(train_sampler),
                batch_time=batch_time, data_time=data_time,
                loss=losses, sub_losses=sub_losses))
        # Optional mid-epoch checkpoint, written by the main process only.
        if args.checkpoint_per_batch > 0 and i > 0 and (i + 1) % args.checkpoint_per_batch == 0 and main_proc:
            file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1)
            print("Saving checkpoint model to %s" % file_path)
            torch.save(type(model).serialize(model, optimizer=optimizer, epoch=epoch, iteration=i,
                                            loss_results=loss_results,
                                            main_loss_results=main_loss_results,
                                            side_loss_results=side_loss_results,
                                            wer_results=wer_results,
                                            cer_results=cer_results,
                                            mca_results=mca_results,
                                            avg_loss=avg_loss,
                                            avg_main_loss=avg_main_loss,
                                            avg_side_loss=avg_side_loss),
                       file_path)
        # Drop the references to this batch's loss and output tensors.
        del loss
        del out

    # Turn the running totals into per-batch averages for this epoch.
    avg_loss /= len(train_sampler)
    avg_main_loss /= len(train_sampler)
    avg_side_loss /= len(train_sampler)

    if display_scaling_coef is None:
        display_scaling_coef = 100. / avg_loss
    display_avg_loss = avg_loss * display_scaling_coef
    display_avg_main_loss = avg_main_loss * display_scaling_coef
    display_avg_side_loss = avg_side_loss * display_scaling_coef

    epoch_time = time.time() - start_epoch_time
    print('Training Summary Epoch: [{0}]\t'
          'Time taken (s): {epoch_time:.0f}\t'
          'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss))

    start_iter = 0  # Reset start iteration for next epoch

    ## VALIDATION ##
    #t.print_report()
    total_cer, total_wer, total_mca = 0, 0, 0
    model.eval()
    with torch.no_grad():
        for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)):
            inputs, targets, input_percentages, target_sizes, target_accents = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            if args.model == 'deepspeech':
                out, output_sizes = model(inputs, input_sizes)
            elif args.model == 'mtaccent':
                out, output_sizes, side_out = model(inputs, input_sizes)
                # Bring the accent outputs to the CPU, as the training branch
                # does, so np.argmax can read them when the model runs on GPU.
                side_out = side_out.cpu()
                mca = 0

                # Count the utterances whose predicted accent is wrong.
                for x in range(len(target_accents)):
                    accent_out = np.argmax(torch.exp(side_out[x])) # take exp because we do logsoftmax
                    accent_target = np.argmax(target_accents[x])

                    if accent_out != accent_target:
                        mca += 1
                total_mca += mca

            decoded_output, _ = decoder.decode(out.data, output_sizes)
            target_strings = decoder.convert_to_strings(split_targets)
            wer, cer = 0, 0
            # Error counts are normalised by the reference length (in words
            # for WER, in characters for CER) before being summed.
            for x in range(len(target_strings)):
                transcript, reference = decoded_output[x][0], target_strings[x][0]
                wer += decoder.wer(transcript, reference) / float(len(reference.split()))
                cer += decoder.cer(transcript, reference) / float(len(reference))
            total_cer += cer
            total_wer += wer
            del out

        # Average over the validation set and express as percentages.
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)

        wer *= 100
        cer *= 100

        loss_results[epoch] = display_avg_loss
        main_loss_results[epoch] = display_avg_main_loss
        side_loss_results[epoch] = display_avg_side_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer

        if args.model == 'mtaccent':
            mca = total_mca / len(test_loader.dataset)
            mca *= 100
        else:
            mca = -1 # if the model doesn't use accent, mca doesn't make sense.
        mca_results[epoch] = mca

        mca_print = f'{mca:.3f}' if mca != -1 else 'n/a'
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'
              'Accent missclassification {mca}\t'.format(epoch + 1, wer=wer, cer=cer, mca=mca_print))

        # Per-epoch checkpoint, written by the main process only.
        if args.checkpoint and main_proc:
            file_path = '%s/deepspeech_%d.pth' % (save_folder, epoch + 1)
            torch.save(type(model).serialize(model, optimizer=optimizer,
                                            epoch=epoch,
                                            loss_results=loss_results,
                                            main_loss_results=main_loss_results,
                                            side_loss_results=side_loss_results,
                                            wer_results=wer_results, cer_results=cer_results,
                                            mca_results=mca_results),
                       file_path)
            # anneal lr
            # NOTE(review): annealing only runs when checkpointing is enabled
            # and only on the main process, and only touches the first
            # parameter group -- confirm this is intended.
            optim_state = optimizer.state_dict()
            optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
            optimizer.load_state_dict(optim_state)
            print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr']))

        # Keep the model with the lowest validation WER seen so far.
        if (best_wer is None or best_wer > wer) and main_proc:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(type(model).serialize(model, optimizer=optimizer,
                                            epoch=epoch,
                                            loss_results=loss_results,
                                            main_loss_results=main_loss_results,
                                            side_loss_results=side_loss_results,
                                            wer_results=wer_results,
                                            cer_results=cer_results,
                                            mca_results=mca_results),
                       args.model_path)
            best_wer = wer

        # Reset ALL running totals after EVERY epoch. Resetting only when a
        # better model was found (and only `avg_loss`) let the previous
        # epochs' averages leak into the next epoch's statistics.
        avg_loss = 0
        avg_main_loss = 0
        avg_side_loss = 0

        if not args.no_shuffle:
            print("Shuffling batches...")
            train_sampler.shuffle(epoch)


NameError: name 'args' is not defined