In [1]:
import re
import os
import wandb
import torch
import torchaudio
import numpy as np
import pandas as pd
import torch.nn as nn
from datetime import datetime
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


import jiwer
import matplotlib.pyplot as plt

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
#wandb.init(project="dla-asr-hw-2", resume=True)

## Preprocessing

In [3]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [4]:
set_seed()

In [5]:
dir_path = "/home/veroslovets/dla/HW2/datasets/LJSpeech-1.1/"

In [6]:
def preprocessing_LJ(path):
    latters = "qwertyuiopasdfghjklzxcvbnm "
    id_latter = {}
    latter_id = {}
    for i, char in enumerate(latters):
        id_latter[i] = char
        latter_id[char] = i
    
    id_chars = []
    values = {"id": [], "text": []}
    with open(path + "metadata.csv", "r") as rdr:
        for line in rdr:
            line = line.split("|")
            sub_text = re.sub(r'[^a-z ]+', '', line[2].lower()[:-1])
            values["text"].append(sub_text)
            values["id"].append(line[0])
            id_chars.append([latter_id[char] for char in sub_text])
    
    pd_values = pd.DataFrame(values)
    pd_values["id_chars"] = id_chars
    
    return pd_values, id_latter, latter_id

In [7]:
df, id_latter, latter_id = preprocessing_LJ(dir_path)

In [8]:
def filter_df(df, quantile=0.05):
    lens = []
    for i in df["text"]:
        lens.append(len(i))
    l_ = np.quantile(lens, quantile)
    r_ = np.quantile(lens, 1 - quantile)
    mask = (df["text"].str.len() > l_) & (df["text"].str.len() < r_)
    df = df[mask]
    
    return df, int(r_)

In [9]:
df, max_seq_char = filter_df(df)

In [10]:
train, valid_data = train_test_split(df, test_size=0.1)

## DataLoader

In [11]:
class LoadDataset(Dataset):
    def __init__(self, data, path, transform, pading_mel, padding_text):
        super().__init__()
        self.data = data.values
        self.path = path + "wavs"
        self.transform = transform
        self.pading_mel = pading_mel
        self.padding_text = padding_text
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,index):
        audio_name, _, tokens = self.data[index]
        audio_path = os.path.join(self.path, audio_name + ".wav")
        wav, sr = torchaudio.load(audio_path)
        wav = wav.squeeze()
        mel_spectrogram = self.transform.forward(wav)
        log_mel = torch.log(mel_spectrogram + 1e-9)
        
        if log_mel.shape[2 - 1] < self.pading_mel:
            res = torch.cat((log_mel, torch.zeros((log_mel.shape[1 - 1],self.pading_mel-log_mel.shape[2- 1]))), dim=1)
        else:
            res = log_mel[:,:self.pading_mel]
        
        target = torch.tensor(tokens + [27] * (self.padding_text - len(tokens)))
        target_len = torch.tensor(len(tokens))
        input_len = torch.tensor(435)
        
        
        return res.squeeze(0).transpose(0, 1), target, target_len, input_len

In [12]:
melspec = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64, n_fft=1024, hop_length=256, f_max=10000)
train_audio_transforms = nn.Sequential(
    melspec,
    torchaudio.transforms.FrequencyMasking(freq_mask_param=10),
    torchaudio.transforms.TimeMasking(time_mask_param=35)
)

In [13]:
padding_text = 148
batch_size   = 64
padding_spec = 870
melspec      = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=64, n_fft=1024, hop_length=256, f_max=10000)
train_loader = DataLoader(dataset=LoadDataset(train, dir_path, train_audio_transforms, padding_spec, padding_text), batch_size = batch_size, shuffle=True, num_workers=3)
valid_loader = DataLoader(dataset=LoadDataset(valid_data, dir_path, melspec, padding_spec, padding_text), batch_size = batch_size, shuffle=False, num_workers=3)

In [14]:
1+1

2

## Model

In [15]:
import torch.nn as nn


class CNNLayerNorm(nn.Module):
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        x = x.transpose(2, 3).contiguous()
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()


class ResidualCNN(nn.Module):
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x


class BidirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU = nn.LSTM(input_size=rnn_dim, hidden_size=hidden_size, num_layers=2, bidirectional=True, batch_first=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class Classifier(nn.Module):
    def __init__(self, dim, n_classes, dropout):
        super(Classifier, self).__init__()
        self.clf1 = nn.Linear(dim * 2, dim)
        self.drop = nn.Dropout(dropout)
        self.clf2 = nn.Linear(dim, n_classes)

    def forward(self, x):
        x = self.clf1(x)
        x = self.drop(x)
        return self.clf2(x)


class SpeechRecognition(nn.Module):
    def __init__(self, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognition, self).__init__()
        n_feats = n_feats//2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)

        self.rescnn_layers1 = ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
        self.rescnn_layers2 = ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)

        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)

        self.birnn_layers = BidirectionalGRU(rnn_dim=rnn_dim, hidden_size=rnn_dim, dropout=dropout)

        self.classifier = Classifier(rnn_dim, n_class, dropout)

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers1(x)
        x = self.rescnn_layers2(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x

In [16]:
model = SpeechRecognition(128, 29, 64, 2, 0.1).to(device)


In [17]:
def int_to_text(labels):
    string = ""
    for i in labels:
        string += id_latter[i]
    return string

def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    targets = []
    
    for i, args in enumerate(arg_maxes):
        decode = []
        targets.append(int_to_text(labels[i][:label_lengths[i]].tolist()))
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j != 0 and index == args[j -1]:
                    continue
                decode.append(index.item())
        decodes.append(int_to_text(decode))
    return decodes, targets

## Fit-Predict

In [18]:
def get_time(start_time):
    duration = datetime.now() - start_time
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    
    return minutes, seconds


def train_epoch(epoch, model, optimizer, criterion, batch_size):
    model.train()
    train_loss = 0.0
    start_time = datetime.now()
    for idx, (melspec, tokens, target_len, padded_len) in enumerate(train_loader):
        melspec, tokens = melspec.to(device), tokens.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(melspec.unsqueeze(1).transpose(2, 3))
        outputs = F.log_softmax(outputs, dim=2)
        
        loss = criterion(outputs.transpose(0, 1), tokens, padded_len, target_len)
        
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
        minutes,seconds = get_time(start_time)
        
        print("\r Train Epoch {} [{}/{} ({:.0f}%)] loss: {:.7f} time: {}m:{}s".format(epoch, 
                                                                                      (idx + 1) * batch_size,
                                                                                      len(train_loader.dataset),
                                                                                      100. * idx / len(train_loader),
                                                                                      loss.item(),
                                                                                      minutes,
                                                                                      seconds), end='')
        if (idx + 1) % 10 == 0:
            pass
            #wandb.log({"Loss": loss})
    print()

def evaluate(model, criterion, calc_wer=False):
    model.eval() 
    loss = 0
    correct = 0
    test_cer = []
    test_wer = []
    
    with torch.no_grad():
        for i, (melspec, tokens, target_len, padded_len) in enumerate(valid_loader):
            melspec, tokens = melspec.to(device), tokens.to(device)

            outputs = model(melspec.unsqueeze(1).transpose(2, 3))
            outputs = F.log_softmax(outputs, dim=2)

            loss += criterion(outputs.transpose(0, 1), tokens, padded_len, target_len).item()
            
            if calc_wer:
                decoded_preds, decoded_targets = GreedyDecoder(outputs, tokens, target_len)

                for j in range(len(decoded_preds)):
                    test_wer.append(jiwer.wer(decoded_targets[j], decoded_preds[j]))
    
    loss /= len(valid_loader)
    #wandb.log({"Validation loss": loss})
    
    if calc_wer:
        avg_wer = sum(test_wer) / len(test_wer)
        #wandb.log({"WER": avg_wer})
        print('Validation: Average loss: {:.4f}, Average WER: {:.4f}\n'.format(loss, avg_wer))
    else:
        print('Validation: Average loss: {:.4f}\n'.format(loss))

In [19]:
learning_rate = 0.001

criterion = nn.CTCLoss(blank=28).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
num_epoch = 60
for i in range(1, num_epoch):
    train_epoch(i, model, optimizer, criterion, batch_size)
    if i % 2 == 0:
        evaluate(model, criterion, calc_wer=True)
    else:
        evaluate(model, criterion)

Validation: Average loss: 2.8824

Validation: Average loss: 2.8705, Average WER: 1.0000

Validation: Average loss: 2.8319

Validation: Average loss: 1.9240, Average WER: 0.9779

Validation: Average loss: 1.0228

Validation: Average loss: 0.7314, Average WER: 0.6883

Validation: Average loss: 0.5919

Validation: Average loss: 0.5127, Average WER: 0.5635

Validation: Average loss: 0.4526

Validation: Average loss: 0.4129, Average WER: 0.4745

Validation: Average loss: 0.3812

Validation: Average loss: 0.3501, Average WER: 0.4196

Validation: Average loss: 0.3261

Validation: Average loss: 0.3120, Average WER: 0.3800

Validation: Average loss: 0.2954

Validation: Average loss: 0.2852, Average WER: 0.3506

Validation: Average loss: 0.2713

Validation: Average loss: 0.2633, Average WER: 0.3253

Validation: Average loss: 0.2530

Validation: Average loss: 0.2476, Average WER: 0.3060

Validation: Average loss: 0.2354

Validation: Average loss: 0.2311, Average WER: 0.2871

Validation: Average l

In [23]:
loss

NameError: name 'loss' is not defined

In [24]:
torch.save({
            'epoch': num_epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': 0.1722,
            }, "model.checkpoint.pth")

In [25]:
!ls

HW2-assemblyai.ipynb	Untitled.ipynb	   model.checkpoint.pth
HW2-clear_.ipynb	data		   test-clean.tar.gz
HW_2_RUSSION_ASR.ipynb	datasets	   train-clean-100.tar.gz
LibriSpeech		homework2_M.ipynb  wandb


In [28]:
model2 = SpeechRecognition(128, 29, 64, 2, 0.1)
#optimizer = TheOptimizerClass(*args, **kwargs)

checkpoint = torch.load("model.checkpoint.pth")
model2.load_state_dict(checkpoint['model_state_dict'])
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model2.eval()


SpeechRecognition(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers1): ResidualCNN(
    (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
    (layer_norm1): CNNLayerNorm(
      (layer_norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
    (layer_norm2): CNNLayerNorm(
      (layer_norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
  )
  (rescnn_layers2): ResidualCNN(
    (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
    (layer_norm1): CNNLayerNorm(
      (layer_norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
    (

In [29]:
model2 = model2.to(device)

In [30]:
evaluate(model2, criterion, calc_wer=True)

Validation: Average loss: 0.1722, Average WER: 0.1920



In [22]:
for i, (melspec, tokens, target_len, padded_len) in enumerate(valid_loader):
        melspec, tokens = melspec.to(device), tokens.to(device)

        outputs = model(melspec.unsqueeze(1).transpose(2, 3))
        outputs = F.log_softmax(outputs, dim=2)

        
        decoded_preds, decoded_targets = GreedyDecoder(outputs, tokens, target_len)

        for j in range(len(decoded_preds)):
            print("TRUE:", decoded_targets[j])
            print("PREDICT", decoded_preds[j])
            print("--------------------------------------------")
        break

TRUE: and we have every reason to believe that it should be in full swing by autumn
PREDICT and we have every reason to believe that it should be infulswiing by auhtoum
--------------------------------------------
TRUE: after inspecting this floor sawyer returned to the street about three minutes after he entered the building
PREDICT after inspecting this floor sowier returned to the street about three minutes after he entered the building
--------------------------------------------
TRUE: great efforts have been made to save his life
PREDICT great efforts have been made to sa his life
--------------------------------------------
TRUE: field labor he urged and with reason was a very suitable employment
PREDICT field labor he urdged and with reason was a very sutable employment
--------------------------------------------
TRUE: the crowding was in consequence of the delay in removing transports
PREDICT the crowding was in consequence of the delay in removing transports
-----------------