In [None]:
!pip install datasets
!pip install evaluate
!pip install jiwer
!pip install openai-whisper
!pip install ptflops
!pip install fvcore
!pip install ipython-autotime

In [2]:
%load_ext autotime

time: 114 µs (started: 2024-12-03 04:04:13 +00:00)


In [3]:
import torch
import torchaudio
import whisper
from ptflops import get_model_complexity_info
from datasets import load_dataset, get_dataset_split_names
from evaluate import load
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, WhisperForConditionalGeneration, WhisperProcessor
import re
import numpy as np

time: 19.7 s (started: 2024-12-03 04:04:18 +00:00)


# Prepare dataset

In [4]:
wer = load("wer")
cer = load("cer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

time: 1.16 s (started: 2024-12-03 04:04:38 +00:00)


In [5]:
vi_dataset = load_dataset("mozilla-foundation/common_voice_11_0", "vi", split="test", trust_remote_code=True)

common_voice_11_0.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

n_shards.json:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

vi_train_0.tar:   0%|          | 0.00/76.3M [00:00<?, ?B/s]

vi_dev_0.tar:   0%|          | 0.00/5.54M [00:00<?, ?B/s]

vi_test_0.tar:   0%|          | 0.00/33.9M [00:00<?, ?B/s]

vi_other_0.tar:   0%|          | 0.00/274M [00:00<?, ?B/s]

vi_invalidated_0.tar:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/562k [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/53.3k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/272k [00:00<?, ?B/s]

other.tsv:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

invalidated.tsv:   0%|          | 0.00/74.7k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 2525it [00:00, 133868.66it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 248it [00:00, 94493.77it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 1237it [00:00, 87670.73it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 11476it [00:00, 154233.58it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 337it [00:00, 128943.66it/s]

time: 15.4 s (started: 2024-12-03 04:04:39 +00:00)





In [47]:
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)

    return batch

def reset_memory_stats():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

def print_memory_usage_summary():
    max_allocated = torch.cuda.max_memory_allocated() / (1024**2)
    max_reserved = torch.cuda.max_memory_reserved() / (1024**2)
    print(f"Peak memory allocated: {max_allocated:.2f} MB")
    print(f"Peak memory reserved: {max_reserved:.2f} MB")

time: 4.11 ms (started: 2024-12-03 05:39:44 +00:00)


# Evaluate model performance

## Vietnamese

In [9]:
vi_dataset = vi_dataset.map(speech_file_to_array_fn)

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

time: 7.96 s (started: 2024-12-03 04:05:36 +00:00)


In [10]:
vi_dataset

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'speech'],
    num_rows: 1237
})

time: 3.46 ms (started: 2024-12-03 04:05:47 +00:00)


In [11]:
vi_dataset["sentence"][0]

'hương thu còn thoảng đâu đây bên thềm'

time: 4.54 ms (started: 2024-12-03 04:05:49 +00:00)


### Wav2Vec2

In [12]:
processor = Wav2Vec2Processor.from_pretrained("CuongLD/wav2vec2-large-xlsr-vietnamese")
model = Wav2Vec2ForCTC.from_pretrained("CuongLD/wav2vec2-large-xlsr-vietnamese")
model.to("cuda")

reset_memory_stats()

result = vi_dataset.map(evaluate, batched=True, batch_size=8)

print_memory_usage_summary()

print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/938 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]



Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Peak memory allocated: 2729.54 MB
Peak memory reserved: 7568.00 MB
CER: 29.543150
time: 1min 19s (started: 2024-12-03 04:05:51 +00:00)


### Whisper

In [18]:
def evaluate(batch):
    audio_tensor = torch.tensor(batch["speech"], dtype=torch.float32)
    audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor))

    result = model.transcribe(audio_tensor.numpy(), language="vi")
    batch["pred_strings"] = result["text"]

    return batch

time: 641 µs (started: 2024-12-03 04:13:25 +00:00)


In [19]:
model = whisper.load_model("medium")
model.to("cuda")

reset_memory_stats()

result = vi_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Peak memory allocated: 6912.28 MB
Peak memory reserved: 11026.00 MB
CER: 22.431553
time: 17min 28s (started: 2024-12-03 04:13:27 +00:00)


In [20]:
model = whisper.load_model("small")
model.to("cuda")

reset_memory_stats()

result = vi_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

100%|███████████████████████████████████████| 461M/461M [00:05<00:00, 94.5MiB/s]


Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Peak memory allocated: 1073.89 MB
Peak memory reserved: 7062.00 MB
CER: 28.948071
time: 10min 27s (started: 2024-12-03 04:43:26 +00:00)


In [21]:
model = whisper.load_model("base")
model.to("cuda")

reset_memory_stats()

result = vi_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 88.1MiB/s]


Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Peak memory allocated: 361.58 MB
Peak memory reserved: 1464.00 MB
CER: 50.971340
time: 10min 13s (started: 2024-12-03 04:54:14 +00:00)


In [22]:
model = whisper.load_model("tiny")
model.to("cuda")

reset_memory_stats()

result = vi_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

print("CER: {:2f}".format(100 * cer.compute(predictions=result["pred_strings"], references=result["sentence"])))

100%|█████████████████████████████████████| 72.1M/72.1M [00:02<00:00, 31.0MiB/s]


Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Peak memory allocated: 204.56 MB
Peak memory reserved: 760.00 MB
CER: 67.235950
time: 12min 52s (started: 2024-12-03 05:04:28 +00:00)


## English

In [49]:
import pandas as pd
import os

en_dataset_path = "/kaggle/input/common-voice"
csv_file = os.path.join(en_dataset_path, "cv-valid-test.csv")

data = pd.read_csv(csv_file)

time: 12.6 ms (started: 2024-12-03 05:39:55 +00:00)


In [50]:
data.keys()

Index(['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent',
       'duration'],
      dtype='object')

time: 3.11 ms (started: 2024-12-03 05:39:56 +00:00)


In [51]:
data = data.iloc[:500, :2]
data

Unnamed: 0,filename,text
0,cv-valid-test/sample-000000.mp3,without the dataset the article is useless
1,cv-valid-test/sample-000001.mp3,i've got to go to him
2,cv-valid-test/sample-000002.mp3,and you know it
3,cv-valid-test/sample-000003.mp3,down below in the darkness were hundreds of pe...
4,cv-valid-test/sample-000004.mp3,hold your nose to keep the smell from disablin...
...,...,...
495,cv-valid-test/sample-000495.mp3,since the miner had sacrificed everything to h...
496,cv-valid-test/sample-000496.mp3,i would have won the junior olympics if not fo...
497,cv-valid-test/sample-000497.mp3,i've got indigestion
498,cv-valid-test/sample-000498.mp3,he is going to transform himself into the wind...


time: 7.09 ms (started: 2024-12-03 05:39:57 +00:00)


In [52]:
base_path = "/kaggle/input/common-voice/cv-valid-test/"

data["filename"] = data["filename"].apply(lambda x: os.path.join(base_path, x))

time: 1.89 ms (started: 2024-12-03 05:39:58 +00:00)


In [53]:
data.iloc[0]

filename    /kaggle/input/common-voice/cv-valid-test/cv-va...
text               without the dataset the article is useless
Name: 0, dtype: object

time: 3.31 ms (started: 2024-12-03 05:39:59 +00:00)


In [63]:
def speech_file_to_array_fn(batch):
    # Làm sạch nội dung text, bỏ các ký tự không cần thiết và chuyển về chữ thường
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).upper()
    
    # Load file âm thanh và resample nếu cần
    speech_array, sampling_rate = torchaudio.load(batch["filename"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    
    return batch

time: 628 µs (started: 2024-12-03 05:43:03 +00:00)


In [65]:
from datasets import Dataset

en_dataset = Dataset.from_pandas(data)

en_dataset = en_dataset.map(speech_file_to_array_fn)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

time: 4.54 s (started: 2024-12-03 05:43:07 +00:00)


In [66]:
en_dataset

Dataset({
    features: ['filename', 'text', 'speech'],
    num_rows: 500
})

time: 2.62 ms (started: 2024-12-03 05:43:12 +00:00)


### Wev2Vec2

In [67]:
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
model.to("cuda")

reset_memory_stats()

result = en_dataset.map(evaluate, batched=True, batch_size=8)

print_memory_usage_summary()

print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["text"])))

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Peak memory allocated: 3046.29 MB
Peak memory reserved: 10402.00 MB
WER: 6.505657
time: 43.6 s (started: 2024-12-03 05:43:12 +00:00)


In [68]:
print(result["pred_strings"][0])
print(result["text"][0])

WITHOUT THE DATA SET THE ARTECLE IS USELESS
WITHOUT THE DATASET THE ARTICLE IS USELESS
time: 1.76 ms (started: 2024-12-03 05:43:55 +00:00)


### Whisper

In [81]:
def evaluate(batch):
    audio_tensor = torch.tensor(batch["speech"], dtype=torch.float32)
    audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor))

    result = model.transcribe(audio_tensor.numpy(), language="en")
    batch["pred_strings"] = result["text"]

    return batch

time: 787 µs (started: 2024-12-03 07:05:31 +00:00)


In [85]:
model = whisper.load_model("medium")
model.to("cuda")

reset_memory_stats()

result = en_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

predictions_cleaned = [
    re.sub(chars_to_ignore_regex, '', ''.join(pred).upper()) for pred in result["pred_strings"]
]
print("WER: {:2f}".format(100 * wer.compute(predictions=predictions_cleaned, references=result["text"])))

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Peak memory allocated: 3171.91 MB
Peak memory reserved: 7300.00 MB
WER: 4.503916
time: 5min 52s (started: 2024-12-03 07:34:47 +00:00)


In [88]:
print(predictions_cleaned[0])
print(result["text"][0])

 WITHOUT THE DATA SET THE ARTICLE IS USELESS
WITHOUT THE DATASET THE ARTICLE IS USELESS
time: 1.3 ms (started: 2024-12-03 08:09:03 +00:00)


In [90]:
model = whisper.load_model("small")
model.to("cuda")

reset_memory_stats()

result = en_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

predictions_cleaned = [
    re.sub(chars_to_ignore_regex, '', ''.join(pred).upper()) for pred in result["pred_strings"]
]
print("WER: {:2f}".format(100 * wer.compute(predictions=predictions_cleaned, references=result["text"])))

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Peak memory allocated: 1067.56 MB
Peak memory reserved: 2452.00 MB
WER: 6.201044
time: 2min 58s (started: 2024-12-03 08:16:28 +00:00)


In [91]:
model = whisper.load_model("base")
model.to("cuda")

reset_memory_stats()

result = en_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

predictions_cleaned = [
    re.sub(chars_to_ignore_regex, '', ''.join(pred).upper()) for pred in result["pred_strings"]
]
print("WER: {:2f}".format(100 * wer.compute(predictions=predictions_cleaned, references=result["text"])))

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Peak memory allocated: 366.03 MB
Peak memory reserved: 1408.00 MB
WER: 9.551784
time: 1min 40s (started: 2024-12-03 08:19:26 +00:00)


In [92]:
model = whisper.load_model("tiny")
model.to("cuda")

reset_memory_stats()

result = en_dataset.map(evaluate, batched=False)

print_memory_usage_summary()

predictions_cleaned = [
    re.sub(chars_to_ignore_regex, '', ''.join(pred).upper()) for pred in result["pred_strings"]
]
print("WER: {:2f}".format(100 * wer.compute(predictions=predictions_cleaned, references=result["text"])))

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Peak memory allocated: 209.00 MB
Peak memory reserved: 520.00 MB
WER: 15.752829
time: 1min 21s (started: 2024-12-03 08:21:07 +00:00)
