# Dependencies

In [1]:
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer
! pip install pytorch-lightning
! pip install evaluate
! pip install mutagen
! pip install transformers
! pip uninstall -y ipywidgets
! pip install ipywidgets==7.7.2

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-ya7vhps6
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-ya7vhps6
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Found existing installation: ipywidgets 7.7.2
Uninstalling ipywidgets-7.7.2:
  Successfully uninstalled ipywidgets-7.7.2
Collecting ipywidgets==7.7.2
  Using cached ipywidgets-7.7.2-py2.py3-none-any.whl.metadata (1.9 kB)
Using cached ipywidgets-7.7.2-py2.py3-none-any.whl (123 kB)
Installing collected packages: ipywidgets
Successfully installed ipywidgets-7.7.2


In [2]:
import os, numpy as np, pathlib
import torch, torch.nn
import pandas as pd
import whisper
import torchaudio, torchaudio.transforms
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt, seaborn as sns
import evaluate
from transformers.optimization import get_linear_schedule_with_warmup
import jiwer
from whisper.normalizers import EnglishTextNormalizer
import warnings
warnings.filterwarnings('ignore')
import mutagen
from transformers import get_linear_schedule_with_warmup, AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from torch.optim import AdamW
from pathlib import Path
import openpyxl

#### Configuration

In [3]:
DATASET_DIR = "dataset"
SAMPLE_RATE = 16000
AUDIO_MAX_LENGTH = 160000  # Maksimal panjang audio (10 detik * 16000)
TEXT_MAX_LENGTH = 200  # Maksimal panjang text transcript

TRAIN_RATE = 0.8  # 80% untuk training
VAL_RATE = 0.1    # 10% untuk validation
TEST_RATE = 0.1   # 10% untuk testing

TRAIN_BATCH_SIZE = 64  # Batch size untuk training
EVAL_BATCH_SIZE = 16    # Batch size untuk evaluation
MAX_TRAIN_STEPS = 150

SEED = 3407
seed_everything(SEED, workers=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class Config:

    sample_rate = 16000

    learning_rate = 1e-5
    weight_decay = 0.01
    adam_epsilon = 1e-8

    warmup_steps = 10  # 10 steps warmup dari 150 total steps
    max_steps = 150           # Minimum 150 Steps Training
    eval_steps = 25           # Evaluate setiap 25 steps
    save_steps = 50           # Save checkpoint setiap 50 steps
    train_batch_size = 8      # Batch size training
    eval_batch_size = 4       # Batch size evaluation
    num_worker = 2            # Number of workers untuk data loading
    gradient_accumulation_steps = 2  # Accumulate gradients untuk simulate larger batch

INFO:lightning_fabric.utilities.seed:Seed set to 3407


#### Utility Functions

In [4]:
def load_wave(wave_path, sample_rate: int = 16000):
    waveform = whisper.load_audio(wave_path, sr=sample_rate)
    return torch.from_numpy(waveform)


def calculate_err(data):
    normalizer = EnglishTextNormalizer()

    data['text_clean'] = data['text'].apply(normalizer)
    data['predict_clean'] = data['predict'].apply(normalizer)

    wer_original = jiwer.wer(list(data['text']), list(data['predict']))
    cer_original = jiwer.cer(list(data['text']), list(data['predict']))

    wer_normalized = jiwer.wer(list(data['text_clean']), list(data['predict_clean']))
    cer_normalized = jiwer.cer(list(data['text_clean']), list(data['predict_clean']))

    return (data, wer_original, wer_normalized, cer_original, cer_normalized)


def extract_aud(audio_path, mymodel):
    result = mymodel.transcribe(audio_path, language="id", without_timestamps=True)
    return result["text"]

#### Dataset Loading

In [6]:
TVRI_DATASET_DIR = "korpus_tvri.csv"
TEST_DATASET_DIR = ".csv"

def load_custom_dataset(dataset_dir):
    audio_transcript_pair_list = []

    df = pd.read_csv(dataset_dir)

    for i in range(len(df)):
        audio_path = os.path.join(dataset_dir, f"{df.loc[i, 'Nama Data']}.wav")
        text = df.loc[i, "Transkrip Suara (Bahasa Indonesia)"]

        print(f"{audio_path}")
        audio = mutagen.File(audio_path)
        audio_length = audio.info.length * 16000
        if len(text) <= TEXT_MAX_LENGTH and audio_length <= AUDIO_MAX_LENGTH:
            audio_transcript_pair_list.append((audio_path, text))

    return audio_transcript_pair_list


def split_dataset(audio_transcript_pair_list, train_rate=0.8, val_rate=0.1, test_rate=0.1):
    np.random.seed(SEED)
    np.random.shuffle(audio_transcript_pair_list)

    total_rate = train_rate + val_rate + test_rate
    if abs(total_rate - 1.0) > 1e-9:
        print(f"Warning: Dataset split rates ({total_rate}) do not sum to 1.0. Adjusting.")
        train_rate /= total_rate
        val_rate /= total_rate
        test_rate /= total_rate

    dataset_size = len(audio_transcript_pair_list)

    train_size = int(train_rate * dataset_size)
    val_size = int(val_rate * dataset_size)
    test_size = dataset_size - train_size - val_size

    print(f"Train size: {train_size}, Val size: {val_size}, Test size: {test_size}")

    train_list = audio_transcript_pair_list[:train_size]
    val_list = audio_transcript_pair_list[train_size:train_size + val_size]
    test_list = audio_transcript_pair_list[train_size + val_size:]

    return train_list, val_list, test_list

#### Dataset Class

In [8]:
class CustomSpeechDataset(torch.utils.data.Dataset):

    def __init__(self, audio_info_list, tokenizer, sample_rate):
        super().__init__()

        self.audio_info_list = audio_info_list
        self.tokenizer = tokenizer
        self.sample_rate = sample_rate

    def __len__(self):
        return len(self.audio_info_list)

    def __getitem__(self, id):
        audio_path, text = self.audio_info_list[id]

        waveform = load_wave(audio_path, sample_rate=self.sample_rate)
        waveform = whisper.pad_or_trim(waveform.flatten())
        mel = whisper.log_mel_spectrogram(waveform)

        text_tokens = self.tokenizer.encode(text)
        text = [*self.tokenizer.sot_sequence_including_notimestamps] + self.tokenizer.encode(text)
        labels = text[1:] + [self.tokenizer.eot]

        return {
            "input_ids": mel,
            "labels": labels,
            "dec_input_ids": text
        }

class WhisperDataCollatorWithPadding:

    def __call__(self, features):

        input_ids, labels, dec_input_ids = [], [], []
        for f in features:
            input_ids.append(f["input_ids"])
            labels.append(f["labels"])
            dec_input_ids.append(f["dec_input_ids"])

        input_ids = torch.concat([input_id[None, :] for input_id in input_ids])

        label_lengths = [len(lab) for lab in labels]
        dec_input_ids_length = [len(e) for e in dec_input_ids]
        max_label_len = max(label_lengths+dec_input_ids_length)

        labels = [np.pad(lab, (0, max_label_len - lab_len), 'constant', constant_values=-100) for lab, lab_len in zip(labels, label_lengths)]
        dec_input_ids = [np.pad(e, (0, max_label_len - e_len), 'constant', constant_values=50257) for e, e_len in zip(dec_input_ids, dec_input_ids_length)] # 50257 is eot token id

        batch = {
            "labels": labels,
            "dec_input_ids": dec_input_ids
        }

        batch = {k: torch.tensor(np.array(v), requires_grad=False) for k, v in batch.items()}
        batch["input_ids"] = input_ids

        return batch

#### Whisper Finetuning Class

In [9]:
class WhisperModelModule(LightningModule):

    def __init__(self, cfg, model_name="tiny", lang="id", train_dataset=[], eval_dataset=[]):
        super().__init__()
        self.options = whisper.DecodingOptions(language="id", without_timestamps=True, task="transcribe")
        self.tokenizer = whisper.tokenizer.get_tokenizer(True, language="id", task=self.options.task)

        self.model = whisper.load_model(model_name)

        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)
        self.metrics_wer = evaluate.load("wer")
        self.metrics_cer = evaluate.load("cer")

        self.cfg = cfg
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_id):
        input_ids = batch["input_ids"]
        labels = batch["labels"].long()
        dec_input_ids = batch["dec_input_ids"].long()

        with torch.no_grad():
            audio_features = self.model.encoder(input_ids)

        out = self.model.decoder(dec_input_ids, audio_features)

        loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1))

        self.log("train/loss", loss, on_step=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_id):
        input_ids = batch["input_ids"]
        labels = batch["labels"].long()
        dec_input_ids = batch["dec_input_ids"].long()
        audio_features = self.model.encoder(input_ids)
        out = self.model.decoder(dec_input_ids, audio_features)

        loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1))

        predicted_ids = torch.argmax(out, dim=2)

        o_list, l_list = [], []
        for pred, ref in zip(predicted_ids, labels):
            o_list.append(self.tokenizer.decode([token.item() for token in pred if token.item() != -100 and token.item() != self.tokenizer.eot]))
            l_list.append(self.tokenizer.decode([token.item() for token in ref if token.item() != -100]))

        cer = self.metrics_cer.compute(references=l_list, predictions=o_list)
        wer = self.metrics_wer.compute(references=l_list, predictions=o_list)


        self.log("val/loss", loss, on_step=True, prog_bar=True, logger=True)
        self.log("val/cer", cer, on_step=True, prog_bar=True, logger=True)
        self.log("val/wer", wer, on_step=True, prog_bar=True, logger=True)

        return {
            "cer": cer,
            "wer": wer,
            "loss": loss
        }


    def configure_optimizers(self):

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters()
                            if not any(nd in n for nd in no_decay)],
                "weight_decay": self.cfg.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters()
                            if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.cfg.learning_rate,
                          eps=self.cfg.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.cfg.warmup_steps,
            num_training_steps=self.cfg.max_steps
        )

        return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]

    def train_dataloader(self):
        dataset = CustomSpeechDataset(self.train_dataset, self.tokenizer, self.cfg.sample_rate)
        return torch.utils.data.DataLoader(dataset,
                          batch_size=self.cfg.train_batch_size,
                          drop_last=True, shuffle=True, num_workers=self.cfg.num_worker,
                          collate_fn=WhisperDataCollatorWithPadding()
                          )

    def val_dataloader(self):
        dataset = CustomSpeechDataset(self.eval_dataset, self.tokenizer, self.cfg.sample_rate)
        return torch.utils.data.DataLoader(dataset,
                          batch_size=self.cfg.eval_batch_size,
                          num_workers=self.cfg.num_worker,
                          collate_fn=WhisperDataCollatorWithPadding()
                          )

#### Split Dataset

In [12]:
audio_transcript_pair_list_tvri = load_custom_dataset(TVRI_DATASET_DIR)
audio_transcript_pair_list_test = load_custom_dataset(TEST_DATASET_DIR)
train_list, val_list, _ = split_dataset(audio_transcript_pair_list_tvri, 0.9, 0.1, 0)
_, _, test_list = split_dataset(audio_transcript_pair_list_test, 0, 0, 1)

total_samples = len(train_list) + len(val_list) + len(test_list)
print(f"Train samples: {len(train_list)}")
print(f"Val samples: {len(val_list)}")
print(f"Test samples: {len(test_list)}")

dataset/TVRI_BS_071119_0001.wav
dataset/TVRI_BS_071119_0002.wav
dataset/TVRI_BS_071119_0003.wav
dataset/TVRI_BS_071119_0004.wav
dataset/TVRI_BS_071119_0005.wav
dataset/TVRI_BS_071119_0006.wav
dataset/TVRI_BS_071119_0007.wav
dataset/TVRI_BS_071119_0010.wav
dataset/TVRI_BS_071119_0013.wav
dataset/TVRI_BS_071119_0019.wav
dataset/TVRI_BS_071119_0020.wav
dataset/TVRI_BS_071119_0022.wav
dataset/TVRI_BS_071119_0025.wav
dataset/TVRI_BS_071119_0028.wav
dataset/TVRI_BS_071119_0029.wav
dataset/TVRI_BS_071119_0030.wav
dataset/TVRI_BS_071119_0032.wav
dataset/TVRI_BS_071119_0036.wav
dataset/TVRI_BS_071119_0050.wav
dataset/TVRI_BS_071119_0056.wav
dataset/TVRI_BS_071119_0058.wav
dataset/TVRI_BS_071119_0062.wav
dataset/TVRI_BS_071119_0070.wav
dataset/TVRI_BS_071119_0076.wav
dataset/TVRI_BS_071119_0077.wav
dataset/TVRI_BS_071119_0078.wav
dataset/TVRI_BS_071119_0079.wav
dataset/TVRI_BS_071119_0084.wav
dataset/TVRI_BS_071119_0086.wav
dataset/TVRI_BS_071119_0088.wav
dataset/TVRI_BS_071119_0089.wav
dataset/

#### Training

In [13]:
log_output_dir = "logs"
check_output_dir = "artifacts"
train_name = "whisper"
model_name = "tiny"
lang = "id"

In [15]:
cfg = Config()

tflogger = TensorBoardLogger(
    save_dir=log_output_dir,
    name=train_name,
)

callback_list = [EarlyStopping(monitor="val_accuracy", min_delta=0.01), LearningRateMonitor(logging_interval="epoch")]
tiny_model = WhisperModelModule(cfg, model_name, lang, train_list, val_list)

trainer = Trainer(
    precision=16,
    accelerator=DEVICE,
    max_epochs=cfg.max_steps,
    accumulate_grad_batches=cfg.gradient_accumulation_steps,
    logger=tflogger,
    callbacks=callback_list
)

trainer.fit(tiny_model)

INFO:pytorch_lightning.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | Whisper          | 37.2 M | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
37.2 M    Trainable params
0         Non-trainable params
37.2 M    Total params
148.739   Total estimated model params si

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/pytorch_lightning/trainer/call.py", line 49, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pytorch_lightning/trainer/trainer.py", line 598, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.12/dist-packages/pytorch_lightning/trainer/trainer.py", line 1011, in _run
    results = self._run_stage()
              ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pytorch_lightning/trainer/trainer.py", line 1053, in _run_stage
    self._run_sanity_check()
  File "/usr/local/lib/python3.12/dist-packages/pytorch_lightning/trainer/trainer.py", line 1082, in _run_sanity_check
    val_loop.run()
  File "/usr/local/lib/python3.12/dist-packages/pytorch_lightning/loops/utilities.py", line 179, in _decorator
    return loop_run(self, *args, **kwargs)
         

TypeError: object of type 'NoneType' has no len()

In [16]:
options = whisper.DecodingOptions(language="id", without_timestamps=True, task="transcribe")
tokenizer = whisper.tokenizer.get_tokenizer(True, language="id", task=options.task)
dataset = CustomSpeechDataset(val_list, tokenizer, SAMPLE_RATE)
loader = torch.utils.data.DataLoader(dataset, batch_size=2, collate_fn=WhisperDataCollatorWithPadding())

refs = []
res = []
for b in tqdm(loader):
    input_ids = b["input_ids"].half()
    labels = b["labels"].long()
    with torch.no_grad():
        results = tiny_model.model.decode(input_ids, options)
        for r in results:
            res.append(r.text)

        for l in labels:
            filtered_l = [token.item() for token in l if token.item() != -100 and token.item() != tokenizer.eot]
            ref = tokenizer.decode(filtered_l)
            refs.append(ref)

  0%|          | 0/2 [00:00<?, ?it/s]

#### Evaluation

In [18]:
test_df = pd.DataFrame(test_list, columns=['audio_path', 'text'])

# EVALUATION 1: ZERO-SHOT (BASELINE)

print("EVALUATING MEDIUM MODEL...")

zero_shot_predictions = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Transcribing Zero-Shot"):

    transcriber = pipeline(
      "automatic-speech-recognition",
      model="cahya/whisper-medium-id"
    )
    transcriber.model.config.forced_decoder_ids = (
      transcriber.tokenizer.get_decoder_prompt_ids(
        language="id",
        task="transcribe"
      )
    )
    prediction = transcriber(row['audio_path'])["text"]
    zero_shot_predictions.append(prediction)

test_df['predict_zero_shot'] = zero_shot_predictions
test_df, wer_zero_shot_orig, wer_zero_shot_norm, cer_zero_shot_orig, cer_zero_shot_norm = calculate_err(test_df.rename(columns={'predict_zero_shot': 'predict'}))

print("EVALUATING TINY MODEL...")

test_df_2 = pd.DataFrame(test_list, columns=['audio_path', 'text'])

if tiny_model:
    tiny_model.to(DEVICE)
    fine_tuned_predictions = []

    for idx, row in tqdm(test_df_2.iterrows(), total=len(test_df_2), desc="Transcribing Fine-Tuned"):
        prediction = extract_aud(row['audio_path'], tiny_model.model)
        fine_tuned_predictions.append(prediction)

    test_df_2['predict_fine_tuned'] = fine_tuned_predictions
    test_df_2, wer_fine_tuned_orig, wer_fine_tuned_norm, cer_fine_tuned_orig, cer_fine_tuned_norm = calculate_err(test_df_2.rename(columns={'predict_fine_tuned': 'predict'}))

else:
    print("Fine-tuned model not found. Skipping evaluation.")

EVALUATING ZERO-SHOT MODEL...


Transcribing Zero-Shot:   0%|          | 0/5 [00:00<?, ?it/s]

dataset/TVRI_BS_071119_0235.wav


Device set to use cpu
`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
`generation_config` default values have been modified to match model-specific defaults: {'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensAtBeginLogitsProce

dataset/TVRI_BS_071119_0273.wav


Device set to use cpu


dataset/TVRI_BS_071119_0013.wav


Device set to use cpu


dataset/TVRI_BS_071119_0191.wav


Device set to use cpu


dataset/TVRI_BS_071119_0247.wav


Device set to use cpu


EVALUATING Fine-TUNED MODEL...


Transcribing Fine-Tuned:   0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
test_df[["text_clean", "predict_clean"]].head()

Unnamed: 0,text_clean,predict_clean
0,keputusannya akan dilakukan setelah partai gel...,keputusannya akan dilakukan setelah partai gel...
1,tidak bisa dihindari permasalahan global menja...,tidak bisa dihindari permasalahan global menja...
2,dari hasil investigasi tersebut penyidik menet...,dari hasil investigasi tersebut penyidik menet...
3,hal itu diungkapkan presiden joko widodo ketik...,hal itu diunggapkan presiden joko widodo ketik...
4,jabatan wakil panglima tni terakhir muncul pad...,jabatan wakil panglima tni terakhir muncul pad...


In [20]:
test_df_2[["text_clean", "predict_clean"]].head()

Unnamed: 0,text_clean,predict_clean
0,keputusannya akan dilakukan setelah partai gel...,keputusannya akan dilakukan setelah partai gel...
1,tidak bisa dihindari permasalahan global menja...,tidak bisa di indari permasaran gobal menjadi ...
2,dari hasil investigasi tersebut penyidik menet...,dari hasil investigation sebut penyindik menet...
3,hal itu diungkapkan presiden joko widodo ketik...,ali tuh diunggapkan prasi dan jokawi dodo keti...
4,jabatan wakil panglima tni terakhir muncul pad...,jepatan wakil panglimatan itahir 12 5 lalu wak...


#### Display Results

In [23]:
results_data = {
    'Model': ['Medium Model', 'Tiny Model'],
    'WER (Original)': [wer_zero_shot_orig, wer_fine_tuned_orig],
    'WER (Normalized)': [wer_zero_shot_norm, wer_fine_tuned_norm],
    'CER (Original)': [cer_zero_shot_orig, cer_fine_tuned_orig],
    'CER (Normalized)': [cer_zero_shot_norm, cer_fine_tuned_norm]
}

results_df = pd.DataFrame(results_data)
print(results_df.to_string(index=False))

       Model  WER (Original)  WER (Normalized)  CER (Original)  CER (Normalized)
Medium Model        0.283582          0.073529         0.04878          0.012397
  Tiny Model        0.850746          0.647059         0.26626          0.210744
