In [51]:
import json
import os
import torch
import random
import numpy as np

from datasets import Dataset, DatasetDict, Audio
from typing import Dict, List

import pytorch_lightning as pl

from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import gc
from datasets import load_dataset, DatasetDict, Audio
from tqdm import tqdm

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7d63bcb51fb0>

In [13]:
DATASET_PATH: str = "./dataset/"

# Dataset setup

In [3]:
!gdown 1j9d91QqE7_WnOnmEmidtOG55tpmxQUeJ

Downloading...
From (original): https://drive.google.com/uc?id=1j9d91QqE7_WnOnmEmidtOG55tpmxQUeJ
From (redirected): https://drive.google.com/uc?id=1j9d91QqE7_WnOnmEmidtOG55tpmxQUeJ&confirm=t&uuid=39570873-9cfc-4b37-b7cc-77627910130b
To: /home/artur/files/Audio Processing/Audio_Processing_Labs/Lab4/dataset.zip
100%|██████████████████████████████████████| 9.12G/9.12G [10:30<00:00, 14.5MB/s]


In [None]:
!mkdir -p dataset

In [11]:
!unzip dataset.zip -d toronto_dataset

Archive:  dataset.zip
   creating: toronto_dataset/toronto_0/
   creating: toronto_dataset/toronto_100/
   creating: toronto_dataset/toronto_101/
   creating: toronto_dataset/toronto_11/
   creating: toronto_dataset/toronto_12/
   creating: toronto_dataset/toronto_123/
   creating: toronto_dataset/toronto_127/
   creating: toronto_dataset/toronto_128/
   creating: toronto_dataset/toronto_130/
   creating: toronto_dataset/toronto_133/
   creating: toronto_dataset/toronto_134/
   creating: toronto_dataset/toronto_135/
   creating: toronto_dataset/toronto_136/
   creating: toronto_dataset/toronto_138/
   creating: toronto_dataset/toronto_139/
   creating: toronto_dataset/toronto_14/
   creating: toronto_dataset/toronto_144/
   creating: toronto_dataset/toronto_145/
   creating: toronto_dataset/toronto_148/
   creating: toronto_dataset/toronto_15/
   creating: toronto_dataset/toronto_150/
   creating: toronto_dataset/toronto_153/
   creating: toronto_dataset/toronto_155/
   creating: toron

In [16]:
test_lines = [
    'toronto_27', 'toronto_46', 'toronto_42', 'toronto_37', 'toronto_89',
    'toronto_43', 'toronto_157', 'toronto_9', 'toronto_156', 'toronto_7',
    'toronto_123', 'toronto_54', 'toronto_67', 'toronto_62', 'toronto_81',
    'toronto_134', 'toronto_148', 'toronto_21', 'toronto_135', 'toronto_166',
    'toronto_58'
]


def load_toronto_dataset(json_path: str, test_lines: List[str], val_part: float = 0.1, max_size: int | None = None) -> DatasetDict:
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    dataset_items = []

    for audio_path, transcription in data.items():
        if not os.path.exists(audio_path):
            print(f"Warning: File not found - {audio_path}")
            continue

        dataset_items.append({
            'id': audio_path,
            'path': audio_path,
            'sentence': transcription,
        })

    train_val_items = [item for item in dataset_items
                  if not any(prefix in item['id'] for prefix in test_lines)]
    test_items = [item for item in dataset_items
                 if any(prefix in item['id'] for prefix in test_lines)]

    random.shuffle(train_val_items)
    random.shuffle(test_items)

    if max_size is not None:
        limiter = min(max_size, len(train_val_items))
        train_limiter = int((1.0 - val_part) * limiter)
        print(limiter)
        print(train_limiter)

        train_items = train_val_items[:train_limiter]
        val_items = train_val_items[train_limiter:limiter]
        test_items  = test_items[:min(max_size, len(test_items))]
    else:
        train_limiter = int((1.0 - val_part) * len(train_val_items))
        train_items = train_val_items[:train_limiter]
        val_items = train_val_items[train_limiter:]

    print(f"Train set: {len(train_items)} samples")
    print(f"Val set: {len(val_items)} samples")
    print(f"Test set: {len(test_items)} samples")

    train_dataset = Dataset.from_list(train_items)
    val_dataset = Dataset.from_list(val_items)
    test_dataset = Dataset.from_list(test_items)

    dataset_dict = DatasetDict({
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    })

    dataset_dict = dataset_dict.cast_column('path', Audio(sampling_rate=16000))

    def preprocess_function(examples):
        examples['audio'] = {'array': examples['path']['array'],
                            'sampling_rate': examples['path']['sampling_rate']}
        return examples

    dataset_dict = dataset_dict.map(preprocess_function, num_proc=1)
    return dataset_dict

In [17]:
json_path = DATASET_PATH + "labels.jsonl"
common_voice = load_toronto_dataset(json_path, test_lines, max_size=7500)

7500
6750
Train set: 6750 samples
Val set: 750 samples
Test set: 7213 samples


Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/7213 [00:00<?, ? examples/s]

In [18]:
gc.collect()

72

In [10]:
common_voice['train'][4]

{'id': 'dataset/toronto_139/toronto_139_173.wav',
 'path': {'path': None,
  'array': array([ 0.0027771 ,  0.00323486,  0.00247192, ...,  0.01959229,
          0.00308228, -0.00411987], shape=(84960,)),
  'sampling_rate': 16000},
 'sentence': 'Вона допомагає нужденним, опікується храмом, будує і відновлює пам’ятники.',
 'audio': {'array': [0.0027979814913123846,
   0.003253600327298045,
   0.0024876180104911327,
   0.0019312015501782298,
   0.0018787472508847713,
   0.005803032778203487,
   0.006674429401755333,
   0.014871727675199509,
   0.02464376576244831,
   0.03527560830116272,
   0.036990731954574585,
   0.03672519326210022,
   0.04057428985834122,
   0.04293437674641609,
   0.04475962370634079,
   0.03926539793610573,
   0.0284880381077528,
   0.013175729662179947,
   -0.00211538840085268,
   -0.013919888064265251,
   -0.022229747846722603,
   -0.030223529785871506,
   -0.03430590406060219,
   -0.03041844815015793,
   -0.020363230258226395,
   -0.013715031556785107,
   -0.010473

# Modeling

In [19]:
my_token = "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"

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base", token=my_token)
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Ukrainian", task="transcribe", token=my_token)
processor = WhisperProcessor.from_pretrained("openai/whisper-base", language="Ukrainian", task="transcribe", token=my_token)

In [20]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [21]:
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=common_voice['train'].column_names,
    num_proc=1
)

Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/7213 [00:00<?, ? examples/s]

In [22]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [23]:
import evaluate.loading

try:
    metric = evaluate.loading.load("wer")
except:
    import jiwer

    class WERMetric:
        def compute(self, predictions, references, **kwargs):
              return jiwer.wer(references, predictions)

    metric = WERMetric()

In [24]:
import logging

def compute_metrics(pred_ids, label_ids):
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    filtered_preds = []
    filtered_refs = []
    for p, r in zip(pred_str, label_str):
        if r.strip():
            filtered_preds.append(p)
            filtered_refs.append(r)

    if len(filtered_refs) == 0:
        logging.warning("No non-empty references found in batch.")
        return {"wer": 0.0}

    wer = 100 * metric.compute(predictions=filtered_preds, references=filtered_refs)
    return {"wer": wer}

In [25]:
class WhisperModule(pl.LightningModule):
    def __init__(self, model_name="openai/whisper-base", lr=1e-5):
        super().__init__()
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name, token=my_token)
        self.model.generation_config.language = "ukrainian"
        self.model.generation_config.task = "transcribe"
        self.model.generation_config.forced_decoder_ids = None
        self.learning_rate = lr
        self.save_hyperparameters()

    def forward(self, input_features, labels=None):
        return self.model(input_features=input_features, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(input_features=batch["input_features"], labels=batch["labels"])
        loss = outputs.loss
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(input_features=batch["input_features"], labels=batch["labels"])
        loss = outputs.loss
        self.log("val_loss", loss, prog_bar=True, logger=True)

        with torch.no_grad():
            generated_ids = self.model.generate(
                input_features=batch["input_features"],
                max_new_tokens=225
            )

        metrics = compute_metrics(
            generated_ids.cpu(),
            batch["labels"].cpu()
        )
        self.log("val_wer", metrics["wer"], prog_bar=True, logger=True)
        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer,
            start_factor=1.0,
            end_factor=0.1,
            total_iters=4000
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step"
            }
        }

In [26]:
class WhisperDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, test_dataset, data_collator, batch_size=16):
        super().__init__()
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.data_collator = data_collator
        self.batch_size = batch_size

    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            collate_fn=self.data_collator,
            num_workers=2,
            shuffle=True
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_dataset,
            batch_size=self.batch_size // 2,
            collate_fn=self.data_collator,
            num_workers=2
        )

In [None]:
BATCH_SIZE: int = 8
model = WhisperModule(lr=1e-5)
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.model.config.decoder_start_token_id,
)
data_module = WhisperDataModule(
    train_dataset=common_voice["train"],
    test_dataset=common_voice["val"],
    data_collator=data_collator,
    batch_size=BATCH_SIZE
)

checkpoint_callback = ModelCheckpoint(
    dirpath="./whisper-base-uk-checkpoints",
    filename="whisper-base-uk-{epoch:02d}-{val_wer:.4f}",
    save_top_k=3,
    monitor="val_wer",
    mode="min"
)
logger = TensorBoardLogger("tb_logs", name="whisper-base-uk")

trainer = pl.Trainer(
    max_steps=10000,
    max_epochs=20,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    precision="16-mixed" if torch.cuda.is_available() else "32",
    gradient_clip_val=1.0,
    accumulate_grad_batches=1,
    log_every_n_steps=25,
    val_check_interval=len(common_voice["train"]) // (BATCH_SIZE * 2) - 1,
    callbacks=[checkpoint_callback],
    logger=logger,
)


trainer.fit(model, data_module)

output_dir = "./whisper-base-uk-lightning"
model.model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                            | Params | Mode
-----------------------------------------------------------------
0 | model | WhisperForConditionalGeneration | 72.6 M | eval
-----------------------------------------------------------------
71.8 M    Trainable params
768 K     Non-trainable params
72.6 M    Total params
290.376   Total estimated model params size (MB)
0         Modules in train mode
182       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=4000` reached.


[]

# Model testing

In [64]:
cer_metric = evaluate.loading.load("cer")
wer_metric = evaluate.loading.load("wer")

def evaluate_model(model: WhisperForConditionalGeneration, processor: WhisperProcessor, dataloader: torch.utils.data.DataLoader) -> dict:
    all_preds: List[str] = []
    all_refs:  List[str] = []

    model.eval()
    for data in tqdm(dataloader):
        input_features = data["input_features"].to(model.device)
        labels = data["labels"]
        references = processor.batch_decode(labels, skip_special_tokens=True)

        with torch.no_grad():
            predicted_ids = model.generate(input_features=input_features)

        predictions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        labels[labels == -100] = processor.tokenizer.pad_token_id
        for pred, rfrncs in zip(predictions, references):        
            if rfrncs.strip():
                all_preds.append(pred)
                all_refs.append(rfrncs)

    wer = wer_metric.compute(predictions=all_preds, references=all_refs)
    cer = cer_metric.compute(predictions=all_preds, references=all_refs)
    return {
        "wer": wer,
        "cer": cer
    }

In [33]:
model(np.array(common_voice["test"][0]['input_features']))

TypeError: conv1d() received an invalid combination of arguments - got (numpy.ndarray, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias = None, tuple of ints stride = 1, tuple of ints padding = 0, tuple of ints dilation = 1, int groups = 1)
      didn't match because some of the arguments have invalid types: (!numpy.ndarray!, !Parameter!, !Parameter!, !tuple of (int,)!, !tuple of (int,)!, !tuple of (int,)!, !int!)
 * (Tensor input, Tensor weight, Tensor bias = None, tuple of ints stride = 1, str padding = "valid", tuple of ints dilation = 1, int groups = 1)
      didn't match because some of the arguments have invalid types: (!numpy.ndarray!, !Parameter!, !Parameter!, !tuple of (int,)!, !tuple of (int,)!, !tuple of (int,)!, !int!)


In [60]:
test_loader = torch.utils.data.DataLoader(
    common_voice["test"],
    batch_size=8,
    collate_fn=data_collator,
    num_workers=1
)

In [44]:
for data in test_loader:
    input_features = data["input_features"]
    labels = data["labels"]
    generated_ids = model.model.generate(input_features=input_features)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    reference_text = processor.batch_decode(labels, skip_special_tokens=True)
    print()
    print("Reference:", reference_text)
    print("Predicted:", generated_text)
    metrics = compute_metrics(
            generated_ids.cpu(),
            labels.cpu()
        )
    print(metrics)

    # print(model.model(input_features=input_features, labels=None))
    break


Reference: ['Утомилися від карантину? Дарма! Заплануйте втому на осінь. А зараз все тільки починається!']
Predicted: ['Утомили свід карантину? Дарма! Заплавнуєте втомлено осінь! А зараз це тільки починається!']
{'wer': 53.84615384615385}


In [56]:
our_model = model.model.to("cuda")

In [65]:
metrics = evaluate_model(our_model, processor, test_loader)

100%|██████████| 902/902 [32:19<00:00,  2.15s/it]


In [68]:
print(f"WER: {metrics['wer']:.4f}")
print(f"CER: {metrics['cer']:.4f}")

WER: 0.5282
CER: 0.2274
