# Mounting drive

In [2]:
# Mount Google Drive at /content/drive; the dataset CSVs and the model
# checkpoints configured further down live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installing the requirements

In [3]:
!pip install torch==2.1.0
!pip install torchaudio
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.6.1


# Imports

In [16]:
import os
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np
import pandas as pd
from jiwer import wer, cer

In [67]:
# Global configuration for the whole notebook. Every tunable value lives here.

# Number of epochs handed to main() in the final cell.
MAX_EPOCHS = 100
# NOTE(review): ON_MAC is not referenced anywhere in the visible notebook.
ON_MAC = False


# PATHS
# Tab-separated CSV files read by the Data dataset (columns "file" and "text").
TRAIN_PATH = "/content/drive/MyDrive/speech_recognition/data/cv-corpus/datasets-csv-colab/train.csv"
# NOTE(review): the evaluation split is named VALIDATE_PATH but points at test.csv.
VALIDATE_PATH = "/content/drive/MyDrive/speech_recognition/data/cv-corpus/datasets-csv-colab/test.csv"
# Directory where main() reads and writes the per-epoch "model-<epoch>" checkpoints.
SAVE_MODEL_PATH = "/content/drive/MyDrive/speech_recognition/model"

# DATA
# Samples per batch for the training and evaluation DataLoaders.
BATCH_SIZE = 64
# 1000 // 64 == 15. NOTE(review): not referenced anywhere in the visible notebook.
VALID_EVERY = 1000 // BATCH_SIZE

# DATALOADER
# Intended number of DataLoader worker processes.
NUM_WORKERS = 1

# AUDIO
# Sample rate given to the mel-spectrogram transform. Loaded audio is NOT
# resampled to this rate, so the files are presumably already 32 kHz - TODO confirm.
SAMPLE_RATE = 32000

# MEL LOG SPECTROGRAM
# Parameters forwarded to torchaudio.transforms.MelSpectrogram via LogMelSpec.
N_MELS = 128
N_FFT = 1024
WIN_LENGTH = 1024
HOP_LENGTH = 512
# NOTE(review): not referenced anywhere in the visible notebook.
MAX_SPECTROGRAM_SIZE = 1650

# SPECTROGRAM AUGMENTATION
# Parameters forwarded to SpecAugment: application rate, policy id (1, 2 or 3)
# and the maximum widths of the time / frequency masks.
SPECAUG_RATE = 0.5
SPECAUG_POLICY = 3
TIME_MASK = 60
FREQUENCY_MASK = 20

# TEXT
NUMBER_OF_CLASSES = 29 # number of label classes (28 characters + the CTC blank)
# Index of the blank symbol ("_" in the TextProcessing maps).
BLANK_CHARACTER_INDEX = 28

# MODEL
# Dropout probability used by the CNN, dense and final blocks of the model.
DROPOUT = 0.1
# NOTE(review): not referenced anywhere in the visible notebook.
MAIN_SIZE = 128

# CNN
# Kernel size and stride of the single Conv1d layer that runs along the time axis.
KERNEL_SIZE = 10
STRIDE = 2

# LSTM
LSTM_HIDDEN_SIZE = 512
LSTM_NUMBER_OF_LAYERS = 1
LSTM_DROPOUT = 0.0
LSTM_BIDIRECTIONAL = False

# GPU runtime

In [10]:
# Show which GPU (if any) is attached to this runtime before training.
!nvidia-smi

Sun Feb  4 20:15:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Text processing

In [6]:
class TextProcessing():
    """Character-level codec between transcript text and integer label sequences.

    The vocabulary is the apostrophe, the space, the letters a-z and "_",
    which occupies index 28 and stands for the blank symbol.

    All helpers are static: they can be called on the class
    (``TextProcessing.text_to_int_sequence(...)``) or on an instance.
    """

    index_to_char_map = {
        0: "'", 1: " ", 2: "a", 3: "b", 4: "c", 5: "d", 6: "e", 7: "f", 8: "g", 9: "h", 10: "i",
        11: "j", 12: "k", 13: "l", 14: "m", 15: "n", 16: "o", 17: "p", 18: "q", 19: "r", 20: "s",
        21: "t", 22: "u", 23: "v", 24: "w", 25: "x", 26: "y", 27: "z", 28: "_", # blank character
    }
    # Reverse lookup, derived from the forward map so the two tables cannot
    # drift apart when the vocabulary is edited.
    char_to_index_map = {char: index for index, char in index_to_char_map.items()}

    @staticmethod
    def text_to_int_sequence(text):
        """Encode ``text`` as a list of vocabulary indices.

        The text is lower-cased first; characters outside the vocabulary are
        silently dropped.

        NOTE(review): a literal "_" in the text is encoded as 28, the index
        reserved for the blank symbol - confirm transcripts never contain it.
        """
        lookup = TextProcessing.char_to_index_map
        return [lookup[char] for char in text.lower() if char in lookup]

    @staticmethod
    def int_sequence_to_text(int_sequence):
        """Decode an iterable of indices into a string, skipping unknown indices."""
        lookup = TextProcessing.index_to_char_map
        return "".join(lookup[index] for index in int_sequence if index in lookup)

    @staticmethod
    def text_with_only_allowed_characters(text):
        """Return lower-cased ``text`` with every out-of-vocabulary character removed."""
        allowed = TextProcessing.char_to_index_map
        return "".join(char for char in text.lower() if char in allowed)

    @staticmethod
    def get_char_list():
        """Return the vocabulary characters ordered by index."""
        return list(TextProcessing.char_to_index_map.keys())

    @staticmethod
    def get_index_list():
        """Return the vocabulary indices in ascending order."""
        return list(TextProcessing.index_to_char_map.keys())


def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    """Greedy (best-path) decoding of a batch of network outputs.

    Args:
        output: per-step class scores; argmax is taken over dim 2, so the
            layout is assumed to be (batch, time, classes).
        labels: padded reference label tensor, one row per batch element.
        label_lengths: true (unpadded) length of every row of ``labels``.
        blank_label: index of the blank symbol, removed from the prediction.
        collapse_repeated: when True, a step equal to the step right before
            it is dropped.

    Returns:
        (decodes, targets): two lists of strings - the decoded predictions and
        the reference transcripts, in batch order.
    """
    best_paths = torch.argmax(output, dim=2)
    decodes, targets = [], []

    for sample_idx, path in enumerate(best_paths):
        # Reference text: cut the padding off before mapping back to characters.
        reference = labels[sample_idx][:label_lengths[sample_idx]].tolist()
        targets.append(TextProcessing.int_sequence_to_text(reference))

        kept = []
        previous = None  # class chosen at the previous time step, blank included
        for index in path.tolist():
            is_repeat = collapse_repeated and previous is not None and index == previous
            if index != blank_label and not is_repeat:
                kept.append(index)
            previous = index
        decodes.append(TextProcessing.int_sequence_to_text(kept))

    return decodes, targets


# Audio processing

In [79]:
class LogMelSpec(nn.Module):
    """Waveform -> log-scaled mel spectrogram.

    Wraps ``torchaudio.transforms.MelSpectrogram`` and takes the natural
    logarithm of its output.

    Args:
        sample_rate: sample rate of the input audio, forwarded to MelSpectrogram.
        n_fft: FFT size.
        win_length: analysis window length.
        hop_length: hop between successive windows.
        n_mels: number of mel filter banks.
    """

    def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
        super().__init__()
        self.mel_spectogram_function = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, n_fft=n_fft,
            win_length=win_length, hop_length=hop_length,
            n_mels=n_mels,
        )

    def forward(self, x):
        """Return ``log(mel_spectrogram(x) + 1e-14)`` as a torch tensor."""
        x = self.mel_spectogram_function(x)  # mel spectrogram
        # torch.log (not np.log) keeps the computation in torch, so it also
        # works for CUDA tensors and tensors that require grad.
        # The small offset avoids log(0) = -inf on silent bins.
        return torch.log(x + 1e-14)

class SpecAugment(nn.Module):
    """Randomly applied frequency/time masking of a spectrogram.

    Args:
        rate: threshold compared with a uniform random draw; the masks are
            applied when ``rate`` is greater than the draw.
        policy: which strategy ``forward`` uses -
            1: one frequency mask + one time mask,
            2: two frequency masks + two time masks (interleaved),
            3: coin flip between policy 1 and policy 2 on every call.
        freq_mask: ``freq_mask_param`` for ``FrequencyMasking``.
        time_mask: ``time_mask_param`` for ``TimeMasking``.
    """

    def __init__(self, rate, policy=3, freq_mask=15, time_mask=35):
        super().__init__()

        self.rate = rate

        def mask_pair():
            # One (frequency mask, time mask) stage; fresh modules on every call.
            return [
                torchaudio.transforms.FrequencyMasking(freq_mask_param=freq_mask),
                torchaudio.transforms.TimeMasking(time_mask_param=time_mask),
            ]

        self.specaug = nn.Sequential(*mask_pair())
        self.specaug2 = nn.Sequential(*mask_pair(), *mask_pair())

        # Resolve the strategy once; an unknown policy id raises KeyError here.
        self._forward = {
            1: self.policy1,
            2: self.policy2,
            3: self.policy3,
        }[policy]

    def forward(self, x):
        """Run the strategy selected at construction time on ``x``."""
        return self._forward(x)

    def _apply_with_rate(self, augmenter, x):
        """Apply ``augmenter`` to ``x`` when ``rate`` beats one random draw."""
        if self.rate > torch.rand(1, 1).item():
            return augmenter(x)
        return x

    def policy1(self, x):
        """Single masking stage, applied subject to ``rate``."""
        return self._apply_with_rate(self.specaug, x)

    def policy2(self, x):
        """Double masking stage, applied subject to ``rate``."""
        return self._apply_with_rate(self.specaug2, x)

    def policy3(self, x):
        """Pick policy 1 or policy 2 with a fresh random draw, then delegate."""
        if torch.rand(1, 1).item() > 0.5:
            return self.policy1(x)
        return self.policy2(x)

class Data(torch.utils.data.Dataset):
    """Dataset of (log-mel spectrogram, character labels) pairs read from a CSV.

    The CSV is tab-separated and must have a "file" column (path of the audio
    file) and a "text" column (its transcript).

    Args:
        csv_path: path of the tab-separated CSV file.
        print_errors: when True, every sample that fails to load is reported
            with ``print`` before a replacement sample is tried.
        augment: when True (default) SpecAugment is applied after the log-mel
            transform; pass False to get un-augmented features.
    """

    def __init__(self, csv_path, print_errors=False, augment=True):
        print(f"Loading data CSV file from: {csv_path}\n")
        self.data = pd.read_csv(csv_path, sep='\t')

        self.print_errors = print_errors

        transforms = [
            LogMelSpec(sample_rate=SAMPLE_RATE,
                       n_fft=N_FFT,
                       n_mels=N_MELS,
                       win_length=WIN_LENGTH,
                       hop_length=HOP_LENGTH),
        ]
        if augment:
            transforms.append(SpecAugment(rate=SPECAUG_RATE,
                                          policy=SPECAUG_POLICY,
                                          freq_mask=FREQUENCY_MASK,
                                          time_mask=TIME_MASK))
        self.audio_transforms = torch.nn.Sequential(*transforms)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(spectrogram, label, spec_len, label_len)`` for row ``index``.

        If the requested row cannot be used (unreadable file, bad transcript,
        spectrogram shorter than the label, ...) the previous rows are tried
        one after another, wrapping around the dataset, and the first usable
        one is returned instead. The substitution happens whether or not
        ``print_errors`` is set; the flag only controls the logging.

        Raises:
            IndexError: ``index`` is outside the dataset.
            RuntimeError: no row of the dataset could be loaded.
        """
        total = len(self.data)
        if not -total <= index < total:
            raise IndexError(f"index {index} is out of range for a dataset of size {total}")

        candidate = index % total
        # Bounded walk over the rows: unlike a recursive "try the neighbour"
        # fallback this cannot bounce forever between two broken rows.
        for _ in range(total):
            file_path = None
            text = None
            try:
                row = self.data.iloc[candidate]
                text = row.text  # Column is named "text"
                label = TextProcessing.text_to_int_sequence(text)  # Text as sequence of ints
                label_len = len(label)

                file_path = row.file  # Column is named "file"
                # NOTE(review): the file's own sample rate is ignored; audio is
                # not resampled to SAMPLE_RATE - confirm the files match it.
                waveform, _samplerate = torchaudio.load(file_path)

                spectrogram = self.audio_transforms(waveform)  # (channel, feature, time)
                # Length reported to the CTC loss: half the number of frames.
                # Presumably mirrors the stride-2 convolution of the model - TODO confirm.
                spec_len = spectrogram.shape[-1] // 2

                # The input must be at least as long as its label.
                if spec_len < label_len:
                    raise Exception(f'Spectrogram length ({spec_len}) is smaller than label length ({label_len})')

                return spectrogram, label, spec_len, label_len
            except Exception as e:
                if self.print_errors:
                    print(str(e), file_path, text)
            candidate = (candidate - 1) % total

        raise RuntimeError("No usable sample could be loaded from the dataset")


def custom_collate_fn(batch):
    """Collate dataset items into padded batch tensors.

    Args:
        batch: iterable of ``(spectrogram, label, spectrogram_length,
            label_length)`` tuples. ``spectrogram`` is expected to be a
            single-channel tensor shaped (1, n_mels, time) and ``label`` a list
            of class indices. Items that are ``None`` or whose spectrogram is
            ``None`` are skipped.

    Returns:
        spectrograms: float tensor (batch, 1, n_mels, max_time), zero-padded
            along the time axis.
        labels: int64 tensor (batch, max_label_len), zero-padded.
        spectrogram_lengths: list with the per-item spectrogram lengths.
        label_lengths: list with the per-item label lengths.
    """
    spectrograms = []
    labels = []
    spectrogram_lengths = []
    label_lengths = []
    for item in batch:
        if item is None:
            continue
        spectrogram, label, spectrogram_length, label_length = item
        if spectrogram is None:
            continue

        # (1, n_mels, time) -> (n_mels, time) -> (time, n_mels):
        # pad_sequence pads along the first dimension, so time has to come first.
        spectrograms.append(spectrogram.squeeze(0).transpose(0, 1))

        # Labels are class indices, so build an integer tensor rather than the
        # float32 tensor the legacy torch.Tensor(...) constructor produces.
        labels.append(torch.tensor(label, dtype=torch.long))
        spectrogram_lengths.append(spectrogram_length)
        label_lengths.append(label_length)

    # Pad to the longest item: (batch, max_time, n_mels). Then restore the
    # channel dimension and swap back to (batch, 1, n_mels, max_time).
    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)

    # Padding value 0 is a real class index; the true label lengths are
    # returned next to the tensor so consumers can ignore the padding.
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, spectrogram_lengths, label_lengths

# Model

In [68]:
class TransposeLayer(nn.Module):
    """Module wrapper around ``Tensor.transpose`` so it can sit inside ``nn.Sequential``.

    Args:
        dim0: first dimension to swap.
        dim1: second dimension to swap.
    """

    def __init__(self, dim0, dim1):
        super().__init__()
        self.dim0, self.dim1 = dim0, dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` exchanged."""
        return torch.transpose(x, self.dim0, self.dim1)


class SpeechRecognitionModel(nn.Module):
    """Conv1d -> dense -> LSTM -> linear acoustic model that emits per-frame class scores.

    The layer sizes come from the notebook-level configuration constants
    (N_MELS, KERNEL_SIZE, STRIDE, DROPOUT, LSTM_*, NUMBER_OF_CLASSES).

    Args:
        device: kept for backward compatibility and stored on ``self.device``.
            The forward pass no longer needs it: the LSTM state is created on
            the device of the input.
    """

    def __init__(self, device="cpu"):
        super().__init__()

        self.device = device

        # Attributes kept so existing code that reads them keeps working; the
        # training loop in this notebook passes its own criterion.
        self.criterion = nn.CTCLoss(blank=BLANK_CHARACTER_INDEX, zero_infinity=True)
        self.learning_rate = 1e-3
        self.validation_step_outputs = []

        dense_size = 128
        # A bidirectional LSTM concatenates both directions, doubling its output width.
        lstm_output_size = LSTM_HIDDEN_SIZE * (2 if LSTM_BIDIRECTIONAL else 1)

        # Convolution along time over the mel features; with stride 2 it roughly
        # halves the number of time steps. The transpose moves features last so
        # LayerNorm and the following layers see (batch, time, feature).
        self.cnn = nn.Sequential(
            nn.Conv1d(N_MELS, N_MELS, kernel_size=KERNEL_SIZE, stride=STRIDE, padding=KERNEL_SIZE//STRIDE),
            TransposeLayer(1, 2),
            nn.LayerNorm(N_MELS),
            nn.GELU(),
            nn.Dropout(DROPOUT),
        )
        self.dense = nn.Sequential(
            nn.Linear(N_MELS, dense_size),
            nn.LayerNorm(dense_size),
            nn.GELU(),
            nn.Dropout(DROPOUT),
            nn.Linear(dense_size, dense_size),
            nn.LayerNorm(dense_size),
            nn.GELU(),
            nn.Dropout(DROPOUT),
        )
        self.lstm = nn.LSTM(input_size=dense_size, hidden_size=LSTM_HIDDEN_SIZE,
                            num_layers=LSTM_NUMBER_OF_LAYERS, dropout=LSTM_DROPOUT,
                            bidirectional=LSTM_BIDIRECTIONAL, batch_first=True)
        self.final_transformations = nn.Sequential(
            nn.LayerNorm(lstm_output_size),
            nn.GELU(),
            nn.Dropout(DROPOUT),
        )

        self.final_fc = nn.Linear(lstm_output_size, NUMBER_OF_CLASSES) # final fully connected

    def forward(self, x):
        """Map a batch of spectrograms to unnormalised class scores.

        Args:
            x: tensor shaped (batch, 1, n_mels, time).

        Returns:
            Tensor shaped (batch, time', NUMBER_OF_CLASSES) where time' is the
            length after the strided convolution.
        """
        x = x.squeeze(1)  # (batch, feature, time) - drop the single audio channel
        x = self.cnn(x)  # (batch, time, feature)
        x = self.dense(x)  # (batch, time, feature)

        # With no explicit state, nn.LSTM starts from zeros of the right shape
        # (num_layers * num_directions) on the input's device. This matches the
        # previous hand-built zero state and also works for multi-layer or
        # bidirectional configurations.
        x, _ = self.lstm(x)  # (batch, time, lstm_output_size)

        x = self.final_transformations(x)  # (batch, time, lstm_output_size)
        x = self.final_fc(x)  # (batch, time, n_class)
        return x


# Training and Validation

In [83]:
class IterationsCounter():
    """Mutable step counter that can be shared between the training helpers."""

    def __init__(self):
        # Number of times step() has been called so far.
        self.value = 0

    def step(self):
        """Advance the counter by one."""
        self.value = self.value + 1

    def get(self):
        """Return the current count."""
        return self.value


def train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iterations_counter, experiment):
    """Run one training epoch.

    Args:
        model: network mapping spectrograms to (batch, time, n_class) scores.
        device: device the batch tensors are moved to.
        train_loader: DataLoader yielding ``(spectrograms, labels,
            input_lengths, label_lengths)`` batches.
        criterion: CTC loss, called with (time, batch, n_class) log-probabilities.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        epoch: epoch number, used only in the progress output.
        iterations_counter: object whose ``step()`` is called once per batch.
        experiment: unused; kept so existing callers keep working.
    """
    model.train()
    data_len = len(train_loader.dataset)
    # Index of the final batch. The last-batch check must compare against the
    # number of batches, not the number of samples.
    last_batch_idx = len(train_loader) - 1
    samples_seen = 0  # samples processed before the current batch

    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        output = model(spectrograms)  # (batch, time, n_class)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1) # (time, batch, n_class)

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()

        optimizer.step()
        scheduler.step()
        iterations_counter.step()

        is_last_batch = batch_idx == last_batch_idx
        if batch_idx % 100 == 0 or is_last_batch:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, samples_seen, data_len,
                100. * batch_idx / len(train_loader), loss.item()))
            # Every 1000 batches (and on the last one) also show what the model
            # currently predicts next to the reference transcripts.
            if batch_idx % 1000 == 0 or is_last_batch:
                decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
                print(decoded_preds)
                print(decoded_targets)

        samples_seen += len(spectrograms)


def test(model, device, test_loader, criterion, epoch, iterations_counter, experiment):
    """Evaluate the model and print the average loss, CER and WER.

    Args:
        model: network mapping spectrograms to (batch, time, n_class) scores.
        device: device the batch tensors are moved to.
        test_loader: DataLoader yielding ``(spectrograms, labels,
            input_lengths, label_lengths)`` batches.
        criterion: CTC loss, called with (time, batch, n_class) log-probabilities.
        epoch: unused; kept so existing callers keep working.
        iterations_counter: unused; kept so existing callers keep working.
        experiment: unused; kept so existing callers keep working.
    """
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        for i, _data in enumerate(test_loader):
            spectrograms, labels, input_lengths, label_lengths = _data
            spectrograms, labels = spectrograms.to(device), labels.to(device)

            output = model(spectrograms)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1) # (time, batch, n_class)

            loss = criterion(output, labels, input_lengths, label_lengths)
            # Accumulate the mean of the per-batch losses.
            test_loss += loss.item() / len(test_loader)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
            for target_text, predicted_text in zip(decoded_targets, decoded_preds):
                test_cer.append(cer(target_text, predicted_text))
                test_wer.append(wer(target_text, predicted_text))

    # An empty loader yields no scores; report NaN instead of dividing by zero.
    avg_cer = sum(test_cer) / len(test_cer) if test_cer else float('nan')
    avg_wer = sum(test_wer) / len(test_wer) if test_wer else float('nan')

    print(f'Test set: Average loss: {test_loss:.4f}, Average CER: {avg_cer:.4f} Average WER: {avg_wer:.4f}\n')


def main(learning_rate=1e-3, batch_size=20, epochs=10, loaded_epoch=40):
    """Build the data pipeline and the model, optionally resume, then train.

    After every epoch the model is evaluated and its weights are saved as
    ``{SAVE_MODEL_PATH}/model-{epoch + loaded_epoch}``.

    Args:
        learning_rate: learning rate for AdamW and peak rate of the one-cycle schedule.
        batch_size: batch size of the training and evaluation DataLoaders.
        epochs: number of epochs to train in this run.
        loaded_epoch: epoch number of the checkpoint to resume from. The
            default keeps the previous hard-coded behaviour (resume from
            "model-40"). Pass 0 or None to train from scratch.

    Raises:
        FileNotFoundError: ``loaded_epoch`` is set but the checkpoint file is missing.
    """
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")
    print(f"Device in use: {device}\n")

    os.makedirs("./data", exist_ok=True)

    # NOTE(review): the evaluation split is built with the same Data pipeline as
    # the training split, so SpecAugment is applied to it as well - confirm
    # that this is intended.
    train_dataset = Data(csv_path=TRAIN_PATH)
    test_dataset = Data(csv_path=VALIDATE_PATH)

    kwargs = {'num_workers': NUM_WORKERS, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   collate_fn=custom_collate_fn,
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  collate_fn=custom_collate_fn,
                                  **kwargs)

    model = SpeechRecognitionModel(device=device).to(device)

    if loaded_epoch:
        load_model_filepath = f"{SAVE_MODEL_PATH}/model-{loaded_epoch}"
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only
        # runtime; weights_only restricts unpickling to tensor data.
        state_dict = torch.load(load_model_filepath, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
        print(f"Model loaded from {load_model_filepath}\n")
    else:
        loaded_epoch = 0

    optimizer = optim.AdamW(model.parameters(), learning_rate)
    # zero_infinity zeroes the loss (and gradients) of samples whose input is
    # too short to be aligned with the target, so one such sample cannot
    # poison the optimizer state with inf/NaN.
    criterion = nn.CTCLoss(blank=BLANK_CHARACTER_INDEX, zero_infinity=True).to(device)
    # NOTE(review): the one-cycle schedule restarts from its beginning on every
    # run, including runs that resume from a checkpoint.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate,
                                              steps_per_epoch=int(len(train_loader)),
                                              epochs=epochs,
                                              anneal_strategy='linear')

    os.makedirs(SAVE_MODEL_PATH, exist_ok=True)

    iterations_counter = IterationsCounter()
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iterations_counter, None)
        test(model, device, test_loader, criterion, epoch, iterations_counter, None)

        torch.save(model.state_dict(), f"{SAVE_MODEL_PATH}/model-{epoch + loaded_epoch}")


# Run Training

In [None]:
# Start training with the batch size and epoch count from the configuration cell.
learning_rate = 1e-3

main(learning_rate, BATCH_SIZE, MAX_EPOCHS)