In [None]:

!pip install datasets jiwer



In [None]:
import torchaudio
import os
from transformers import WhisperProcessor
import torch
from sklearn.model_selection import train_test_split
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Dataset, DatasetDict
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
def create_dataset(audio_path, transcript_path):
    audio_files = sorted([os.path.join(audio_path, f) for f in os.listdir(audio_path) if f.endswith('.wav') or f.endswith('.mp3')])
    with open(os.path.abspath(transcript_path), "r") as f:
        transcriptions = [line.strip() for line in f.readlines()]
    print(f"Audio Files: {audio_files}")

    dataset = []
    for audio_file, transcription in zip(audio_files, transcriptions):
        waveform, sample_rate = torchaudio.load(audio_file)
        dataset.append({"audio": {"array": waveform.squeeze(0).numpy(), "sampling_rate": sample_rate}, "sentence": transcription})

    return dataset

In [None]:
def preprocess_dataset(dataset, processor):
    def preprocess(batch):
        audio = batch["audio"]
        input_features = processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features[0]
        input_ids = processor.tokenizer(batch["sentence"]).input_ids

        return {
            "input_features": input_features,
            "labels": torch.tensor(input_ids, dtype=torch.long),
        }

    return dataset.map(preprocess, remove_columns=["audio", "sentence"])

In [None]:
class DataCollatorWhisper:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, batch):
        input_features = torch.stack(
            [torch.tensor(example["input_features"]) if not isinstance(example["input_features"], torch.Tensor) else example["input_features"] for example in batch]
        )

        labels = [
            torch.tensor(example["labels"], dtype=torch.long) if not isinstance(example["labels"], torch.Tensor) else example["labels"]
            for example in batch
        ]

        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id)

        return {
            "input_features": input_features,
            "labels": labels,
        }

In [None]:
!git clone https://github.com/Salomo309/Tugas-Besar-Automatic-Speech-Recognition.git

%cd Tugas-Besar-Automatic-Speech-Recognition/data

!ls -R

Cloning into 'Tugas-Besar-Automatic-Speech-Recognition'...
remote: Enumerating objects: 302, done.[K
remote: Counting objects: 100% (302/302), done.[K
remote: Compressing objects: 100% (292/292), done.[K
remote: Total 302 (delta 13), reused 292 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (302/302), 6.64 MiB | 16.66 MiB/s, done.
Resolving deltas: 100% (13/13), done.
/content/Tugas-Besar-Automatic-Speech-Recognition/data/Tugas-Besar-Automatic-Speech-Recognition/data
.:
audio  audio_test  transcript_test.txt	transcript.txt

./audio:
common_voice_id_19059614.mp3  common_voice_id_20955243.mp3  common_voice_id_23827455.mp3
common_voice_id_19061958.mp3  common_voice_id_20955251.mp3  common_voice_id_23827469.mp3
common_voice_id_19062055.mp3  common_voice_id_20955254.mp3  common_voice_id_24016690.mp3
common_voice_id_19078324.mp3  common_voice_id_20956528.mp3  common_voice_id_24976980.mp3
common_voice_id_19080263.mp3  common_voice_id_20961261.mp3  common_voice_id_24981957.mp3

In [None]:
model_name = "cahya/whisper-small-id"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)

dataset1 = create_dataset("/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio", "/content/Tugas-Besar-Automatic-Speech-Recognition/data/transcript.txt")
dataset1 = Dataset.from_list(dataset1)
dataset1 = preprocess_dataset(dataset1, processor)

dataset2 = create_dataset("/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test", "/content/Tugas-Besar-Automatic-Speech-Recognition/data/transcript_test.txt")
dataset2 = Dataset.from_list(dataset2)
dataset2 = preprocess_dataset(dataset2, processor)

data_collator = DataCollatorWhisper(processor)

train_dataset = dataset1
eval_dataset = dataset2

Audio Files: ['/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19059614.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19061958.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19062055.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19078324.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19080263.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19083105.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19115434.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19192638.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19192707.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio/common_voice_id_19258438.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Audio Files: ['/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_26831395.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_26831401.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27160502.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27180100.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27204052.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27341350.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27370297.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27374100.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27451104.mp3', '/content/Tugas-Besar-Automatic-Speech-Recognition/data/audio_test/common_voice_id_27457826.mp3', '/cont

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [None]:
# training hyperparameters, eksperimen di sini
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

training_args = Seq2SeqTrainingArguments(
    output_dir="../models/checkpoints/",
    run_name="my_training_run",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=2,
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=True,
    logging_strategy="steps",
    logging_steps=10,
)

num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=None,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [None]:
import os
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_DISABLED"] = "true"

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.066,0.211345
2,0.0259,0.209722


TrainOutput(global_step=104, training_loss=0.08931764945960961, metrics={'train_runtime': 344.3015, 'train_samples_per_second': 1.202, 'train_steps_per_second': 0.302, 'total_flos': 1.1947435573248e+17, 'train_loss': 0.08931764945960961, 'epoch': 2.0})

In [None]:
model.save_pretrained("output")

In [None]:
results = trainer.evaluate()
print("Hasil evaluasi:", results)

Hasil evaluasi: {'eval_loss': 0.19525191187858582, 'eval_runtime': 8.9509, 'eval_samples_per_second': 5.698, 'eval_steps_per_second': 0.782, 'epoch': 2.0}


In [None]:
from jiwer import wer
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np

def evaluate_model(model, eval_dataset, processor, data_collator, device='cuda', batch_size=8):

    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=batch_size,
        collate_fn=data_collator
    )

    model.eval()
    model.to(device)

    all_predictions = []
    all_references = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            input_features = batch["input_features"].to(device)

            predicted_ids = model.generate(input_features)

            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

            labels = batch["labels"]
            reference = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

            all_predictions.extend(transcription)
            all_references.extend(reference)

    wer_val = wer(all_predictions, all_references)

    return {
        "wer": wer_val,
        "predictions": all_predictions,
        "references": all_references
    }

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

results = evaluate_model(
    model=model,
    eval_dataset=eval_dataset,
    processor=processor,
    data_collator=data_collator,
    device=device,
    batch_size=8
)

print(f"\nEvaluation Results:")
print(f"Word Error Rate (WER): {results['wer']:.4f}")

print("\nSample Predictions vs References:")
for pred, ref in zip(results['predictions'][:5], results['references'][:5]):
    print(f"\nPrediction: {pred}")
    print(f"Reference:  {ref}")

Using device: cuda


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Evaluating: 100%|██████████| 7/7 [00:20<00:00,  2.88s/it]


Evaluation Results:
Word Error Rate (WER): 0.1523

Sample Predictions vs References:

Prediction: Kita tidak sampai harus menjual rumah kita.
Reference:  Kita tidak sampai harus menjual rumah kita.

Prediction: Biasanya ada banyak penjual kue di sana.
Reference:  Biasanya ada banyak penjual kue di sana.

Prediction: Tiketan tidak harus menjualnya.
Reference:  Kita tidak harus menjualnya.

Prediction: mengapa dia tinggal pembeli?
Reference:  mengapa tidak dijual saja?

Prediction: Aku ingin membuat penawaran.
Reference:  Aku ingin membuat penawaran.



