In [None]:
# Install jiwer, which supplies the edit-distance measures (compute_measures) used to score validation PER.
!pip install jiwer --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.9/3.1 MB[0m [31m55.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch, torchaudio, json, os, time
import numpy as np

from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from jiwer import compute_measures, cer

In [None]:
# Mount Google Drive at /content/drive; the dataset, vocabulary and log paths
# used by this notebook all live under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# --- Dataset manifests (JSON) on Google Drive ---
TRAIN_DATASET_NAME = "synthetic_train_x3_noise=0.0.json"
TRAIN_DATASET_JSON_DIR = "drive/MyDrive/MLMI2/proc_data/augmented_data/json/" + TRAIN_DATASET_NAME
VALID_DATASET_JSON_DIR = "drive/MyDrive/MLMI2/proc_data/json/valid.json"

# --- Zipped MFCC feature archives on Google Drive ---
TRAIN_MFCC_DATA_DIR = "drive/MyDrive/MLMI2/proc_data/augmented_data/synthetic_mfcc.zip"
VALID_MFCC_DATA_DIR = "drive/MyDrive/MLMI2/proc_data/mfcc_features/mfcc_features.zip"

# --- Everything that defines one training run; written verbatim to the log file ---
MODEL_PARAMETERS = {
    # data / architecture
    "batch_size": 8,
    "nb_hidden": 3,
    "hidden_size": 512,
    "model_arch": "BI-LSTM",
    # optimisation
    "optimizer": "SGD_SCHEDULED",
    "learning_rate": 0.02,
    "momentum": 0.9,
    "dropout_rate": 0.2,
    "max_norm_clipping": 4.0,
    # early stopping / LR scheduling
    "patience": 7,
    "min_reduce": 0.03,
    "max_epoch": 55,
    "scheduler_patience": 2,
}

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch's default generator for this run.
SEED = 123
torch.manual_seed(SEED)

<torch._C.Generator at 0x7dfd65b4dcf0>

### Model

In [None]:
class MyLSTM(nn.Module):
    """Stacked (optionally bidirectional) LSTM followed by a linear output layer.

    Args:
        input_size: size of each input feature vector.
        hidden_size: hidden units per LSTM direction.
        nb_hidden: number of stacked LSTM layers.
        output_size: number of output classes produced by the linear layer.
        is_bidirectional: whether the LSTM runs in both directions.
        dropout_rate: dropout passed to ``nn.LSTM`` (applied between stacked layers).
    """

    def __init__(self, input_size, hidden_size, nb_hidden, output_size, is_bidirectional, dropout_rate):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            nb_hidden,
            dropout=dropout_rate,
            bidirectional=is_bidirectional,
        )
        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        directions = 2 if is_bidirectional else 1
        self.fc = nn.Linear(hidden_size * directions, output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and project every step to ``output_size`` logits."""
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out)

In [None]:
# Model initialisation: architecture comes from MODEL_PARAMETERS, then the
# network is moved to the selected device.
lstm_kwargs = dict(
    input_size=23,
    hidden_size=MODEL_PARAMETERS['hidden_size'],
    nb_hidden=MODEL_PARAMETERS['nb_hidden'],
    output_size=40,
    is_bidirectional=MODEL_PARAMETERS['model_arch'] == 'BI-LSTM',
    dropout_rate=MODEL_PARAMETERS['dropout_rate'],
)
model = MyLSTM(**lstm_kwargs).to(DEVICE)

### Dataset

In [None]:
# Get token to phone vocabulary (dictionary): line number in the vocab file -> phone string
with open('drive/MyDrive/MLMI2/proc_data/vocab_39.txt') as vocab_file:
  vocab = vocab_file.read().splitlines()
vocab_idx_to_phn = dict(enumerate(vocab))

In [None]:
class TDataset(Dataset):
    """Dataset backed by a JSON manifest of utterances.

    The manifest is indexed by position; each entry provides an ``"mfcc_path"``
    (loaded with ``torch.load``) and a ``"tokens"`` target sequence.
    """

    def __init__(self, path, specaug=False):
        # Stored for callers; not consulted anywhere in this class.
        self.specaug = specaug
        with open(path) as manifest:
            self.data = json.load(manifest)

    def __len__(self):
        """Number of entries in the manifest."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(mfcc, tokens)`` for the entry at ``index``."""
        entry = self.data[index]
        return torch.load(entry["mfcc_path"]), entry["tokens"]

def collate_wrapper(batch):
    """Collate ``(features, tokens)`` samples into one batch.

    Returns:
        A 3-tuple ``(fbank, input_lengths, targets)``: the features zero-padded
        to a common length and stacked time-first by ``pad_sequence``, a long
        tensor holding each sample's unpadded length, and the untouched list of
        token sequences.
    """
    features = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]
    fbank = pad_sequence(features)
    input_lengths = torch.tensor([len(feat) for feat in features], dtype=torch.long)
    return fbank, input_lengths, targets

def get_dataloader(path, bs, shuffle, specaug=False):
    """Build a ``DataLoader`` over the JSON manifest at ``path``.

    Args:
        path: manifest file handed to ``TDataset``.
        bs: batch size.
        shuffle: whether the loader reshuffles the data.
        specaug: forwarded to ``TDataset``.
    """
    loader_options = dict(
        batch_size=bs,
        shuffle=shuffle,
        collate_fn=collate_wrapper,
        pin_memory=True,
    )
    return DataLoader(TDataset(path, specaug), **loader_options)

In [None]:
# Copy the train and validation JSON manifests from Drive into the working directory,
# so they can be opened by bare file name afterwards.
!cp -f {TRAIN_DATASET_JSON_DIR} .
!cp -f {VALID_DATASET_JSON_DIR} .

# Unpack both MFCC archives into the working directory (-q: quiet, -o: overwrite without prompting).
!unzip -q -o {TRAIN_MFCC_DATA_DIR} -d .
!unzip -q -o {VALID_MFCC_DATA_DIR} -d .

In [None]:
# Setup SYNTHETIC batched data: shuffled training batches, validation in fixed order.
train_loader = get_dataloader(
    path=TRAIN_DATASET_NAME,
    bs=MODEL_PARAMETERS['batch_size'],
    shuffle=True,
)
valid_loader = get_dataloader(
    path='valid.json',
    bs=MODEL_PARAMETERS['batch_size'],
    shuffle=False,
)

### Training

In [None]:
# TRIAL HYPERPARAMETERS
# All values are read from MODEL_PARAMETERS so that the configuration written
# to the log file is exactly the one the optimizer was built with.
OPTIMIZER = MODEL_PARAMETERS['optimizer']
LEARNING_RATE = MODEL_PARAMETERS['learning_rate']
MAX_NORM_CLIPPING = MODEL_PARAMETERS['max_norm_clipping']
# Previously hard-coded to 0.9, which silently ignored MODEL_PARAMETERS['momentum'].
MOMENTUM = MODEL_PARAMETERS['momentum']

In [None]:
# EARLY STOPPING
all_valid_loss = []                          # mean validation loss per epoch, appended during training
PATIENCE = MODEL_PARAMETERS['patience']      # size of the trailing window of epochs that is inspected
MIN_REDUCE = MODEL_PARAMETERS['min_reduce']  # minimum loss reduction that counts as progress
MAX_EPOCH = MODEL_PARAMETERS['max_epoch']    # hard cap on the number of epochs

In [None]:
# Optimizer initialisation
# CTC criterion with label 0 as the blank; infinite losses are zeroed out.
criterion = torch.nn.CTCLoss(blank=0, zero_infinity=True)

if OPTIMIZER in ('SGD', 'SGD_SCHEDULED'):
  optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)
  if OPTIMIZER == 'SGD_SCHEDULED':
    # Halve the learning rate when the monitored value stops decreasing.
    # NOTE(review): threshold_mode is left at its default - confirm whether
    # MIN_REDUCE is meant as a relative or an absolute threshold here.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=MODEL_PARAMETERS['scheduler_patience'],
        threshold=MIN_REDUCE,
    )
else:
  # Any other optimizer name falls through to Adam.
  optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# --- TRAINING ---
# Each epoch: one pass over train_loader with CTC loss and gradient-norm
# clipping, then one pass over valid_loader to get the validation loss and the
# phone error rate (PER) from greedy CTC decoding. Stops early when the
# validation loss has not improved enough over the last PATIENCE epochs.

def _prepare_batch(batch):
  """Move a collated batch to DEVICE and build zero-padded CTC targets.

  Returns (features, input_lengths, padded_targets, target_lengths);
  target_lengths stays on the CPU, the other tensors are moved to DEVICE.
  """
  features, input_lengths, token_lists = batch
  features = features.to(device=DEVICE)
  input_lengths = input_lengths.to(device=DEVICE)
  target_tensors = [torch.tensor(tokens) for tokens in token_lists]
  target_lengths = torch.tensor([len(t) for t in target_tensors], dtype=torch.long)
  padded_targets = pad_sequence(target_tensors, batch_first=True).to(DEVICE)
  return features, input_lengths, padded_targets, target_lengths


def _greedy_ctc_decode(frame_ids, blank=0):
  """Collapse consecutive duplicate labels, then drop blanks (empty input -> [])."""
  collapsed = [
      tok for pos, tok in enumerate(frame_ids)
      if pos == 0 or tok != frame_ids[pos - 1]
  ]
  return [tok for tok in collapsed if tok != blank]


start_time = time.time()
metrics = {'PER': [], 'sub': [], 'dele': [], 'ins': [], 'cor': [], 'TL': []}

for epoch in range(MAX_EPOCH):
  train_loss_history = []

  model.train()
  for batch in train_loader:
    features, input_lengths, targets, target_lengths = _prepare_batch(batch)

    optimizer.zero_grad()

    y_pred = model(features).log_softmax(2)
    loss = criterion(y_pred, targets, input_lengths, target_lengths)
    loss.backward()

    # Clip the global gradient norm before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_NORM_CLIPPING)

    optimizer.step()
    train_loss_history.append(loss.item())

  train_loss = np.mean(train_loss_history)
  print('TL:', train_loss)
  metrics['TL'].append(train_loss)

  # --- VALIDATION ---
  model.eval()
  valid_loss_history = []
  decoding_stats = [0., 0., 0., 0.]  # substitutions, deletions, insertions, hits

  with torch.no_grad():

    for batch in valid_loader:
      features, input_lengths, targets, target_lengths = _prepare_batch(batch)

      y_pred = model(features).log_softmax(2)
      loss = criterion(y_pred, targets, input_lengths, target_lengths)
      valid_loss_history.append(loss.item())

      # Get PER, sub/del/ins/cor metrics.
      # One argmax for the whole batch, reordered so each row is one utterance,
      # then converted to plain Python lists in a single transfer.
      best_ids = y_pred.argmax(dim=-1).transpose(0, 1).tolist()
      frame_counts = input_lengths.tolist()
      target_rows = targets.tolist()
      target_counts = target_lengths.tolist()

      # Iterate over every utterance in the batch. The previous
      # range(len(batch)) used the length of the (features, lengths, targets)
      # tuple, i.e. always 3, so most utterances were never scored and a final
      # batch with fewer than 3 utterances raised IndexError.
      for utt in range(len(target_counts)):
        # Get formatted predictions: drop padded frames, collapse repeats,
        # remove blanks, map token ids to phones.
        pred_tokens = _greedy_ctc_decode(best_ids[utt][:frame_counts[utt]])
        pred = ' '.join(vocab_idx_to_phn[tok] for tok in pred_tokens)
        # Get formatted targets: drop the padding, map token ids to phones.
        exp_tokens = target_rows[utt][:target_counts[utt]]
        exp = ' '.join(vocab_idx_to_phn[tok] for tok in exp_tokens)
        # Record metrics
        cur_stats = compute_measures(exp, pred)
        decoding_stats[0] += cur_stats["substitutions"]
        decoding_stats[1] += cur_stats["deletions"]
        decoding_stats[2] += cur_stats["insertions"]
        decoding_stats[3] += cur_stats["hits"]

  # Reference length = substitutions + deletions + hits.
  total_words = decoding_stats[0] + decoding_stats[1] + decoding_stats[3]
  sub = decoding_stats[0] / total_words * 100
  dele = decoding_stats[1] / total_words * 100
  ins = decoding_stats[2] / total_words * 100
  cor = decoding_stats[3] / total_words * 100
  PER = (decoding_stats[0] + decoding_stats[1] + decoding_stats[2]) / total_words * 100

  # Save additional metrics
  metrics['PER'].append(PER)
  metrics['sub'].append(sub)
  metrics['dele'].append(dele)
  metrics['ins'].append(ins)
  metrics['cor'].append(cor)

  valid_loss = np.mean(valid_loss_history)
  print('VL:', valid_loss)
  print('Val-PER:', PER)
  all_valid_loss.append(valid_loss)

  # Step scheduler on validation loss
  if OPTIMIZER == 'SGD_SCHEDULED':
    scheduler.step(valid_loss)

  # CHECK: Early Stopping - stop when the best loss of the last PATIENCE epochs
  # improves on the best earlier loss by less than MIN_REDUCE.
  if len(all_valid_loss) > PATIENCE:
    if min(all_valid_loss[:-PATIENCE]) - min(all_valid_loss[-PATIENCE:]) < MIN_REDUCE:
      break

metrics['VL'] = all_valid_loss
metrics['run_time'] = time.time() - start_time
metrics['best_epoch'] = int(np.argmin(all_valid_loss))

### Save logs

In [None]:
# Append one record per run: dataset name and metrics on the first line, the
# run's parameters on the second, followed by a blank separator line.
with open('drive/MyDrive/MLMI2/training/synthetic_logs/logs.txt', 'a+') as log_file:
    log_file.write(f'{TRAIN_DATASET_NAME} :=>: ')
    json.dump(metrics, log_file)

    log_file.write('\nParameters: ')
    json.dump(MODEL_PARAMETERS, log_file)

    log_file.write('\n\n')