# Audio Transcription

### I. Data Gathering

The dataset used for this project is the Common Voice dataset. Common Voice is a massive multi-lingual corpus of read speech by Mozilla. This project used Common Voice Corpus 20.0 subset for Indonesian language.  

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
audio_dir = "../data/cv-corpus-6.1-indonesian/clips/"

##### 1. Training Data

In [3]:
train_df = pd.read_csv("../data/cv-corpus-6.1-indonesian/train.tsv", sep = "\t")

In [4]:
train_df.head(5)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,4c81270f49ada076d376a968994e1533674531b0fae896...,common_voice_id_19192526.mp3,Kamar adik laki-laki saya lebih sempit daripad...,2,0,twenties,male,,id,
1,4c81270f49ada076d376a968994e1533674531b0fae896...,common_voice_id_19192527.mp3,Ayah akan membunuhku.,2,0,twenties,male,,id,
2,4c81270f49ada076d376a968994e1533674531b0fae896...,common_voice_id_19192528.mp3,Ini pulpen.,2,0,twenties,male,,id,
3,4c81270f49ada076d376a968994e1533674531b0fae896...,common_voice_id_19192535.mp3,Akira pandai bermain tenis.,2,0,twenties,male,,id,
4,4c81270f49ada076d376a968994e1533674531b0fae896...,common_voice_id_19192536.mp3,Dia keluar dari ruangan tanpa mengatakan sepat...,2,1,twenties,male,,id,


In [5]:
train_df = train_df[train_df["up_votes"] >= train_df["down_votes"]]
train_df["sentence"] = train_df["sentence"].str.lower()
train_df = train_df[["path", "sentence"]]

In [6]:
train_df.duplicated().sum()

0

In [7]:
train_df.isna().sum()

path        0
sentence    0
dtype: int64

##### 2. Testing Data

In [8]:
test_df = pd.read_csv("../data/cv-corpus-6.1-indonesian/test.tsv", sep = "\t")

In [9]:
test_df.head(5)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,057bf45c0c338db897f5717f744bcac8a2ac2eee990a42...,common_voice_id_22888800.mp3,Minggu depan kakak perempuan saya menikah.,2,0,,,,id,
1,0835fbbf1d609a6ed421eef134a48ff06d719121b41f3b...,common_voice_id_24015257.mp3,Berbagai bahasa daerah dan bahasa asing menjad...,2,1,,,,id,
2,0c8ac0307f35c73b09d8fc0d92e4c183e3078adee87212...,common_voice_id_24015280.mp3,apa yang bisa saya berikan kepadamu?,2,0,,,,id,
3,19285f8e012ad31cad237d53bab348ce59a5cc13684754...,common_voice_id_20425643.mp3,Inilah dunia kecil.,2,1,,,,id,
4,3502377c5fb712169a3f2fe5583906e4b3a5ecba27bf2c...,common_voice_id_22185104.mp3,nol,2,0,,,,id,Benchmark


In [10]:
test_df = test_df[test_df["up_votes"] >= test_df["down_votes"]]
train_df["sentence"] = train_df["sentence"].str.lower()
test_df = test_df[["path", "sentence"]]

In [11]:
test_df.duplicated().sum()

0

In [12]:
test_df.isna().sum()

path        0
sentence    0
dtype: int64

##### 3. Split Data

In [13]:
valid_df, test_df = train_test_split(test_df, test_size = 0.5, random_state = 42)

##### 4. Data Transformation

In [14]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
train_df["path"] = train_df["path"].apply(lambda x: audio_dir + x)
valid_df["path"] = valid_df["path"].apply(lambda x: audio_dir + x)
test_df["path"] = test_df["path"].apply(lambda x: audio_dir + x)

In [16]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

In [17]:
train_dataset = train_dataset.rename_column("path", "audio_path")
valid_dataset = valid_dataset.rename_column("path", "audio_path")
test_dataset = test_dataset.rename_column("path", "audio_path")

In [18]:
train_dataset = train_dataset.map(lambda x: {"audio": x["audio_path"]}, remove_columns = ["audio_path"])
valid_dataset = valid_dataset.map(lambda x: {"audio": x["audio_path"]}, remove_columns = ["audio_path"])
test_dataset = test_dataset.map(lambda x: {"audio": x["audio_path"]}, remove_columns = ["audio_path"])

Map: 100%|██████████| 2130/2130 [00:00<00:00, 44281.65 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 38416.78 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 36879.16 examples/s]


### II. Data Preprocessing

The preprocessing technique used for this project are:
1. Removing special characters
2. Creating Wav2Vec2CTCTokenizer

##### 1. Removing Special Characters

In [19]:
import re

In [20]:
def remove_special_characters(batch):
    chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\！]'
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

In [21]:
train_dataset = train_dataset.map(remove_special_characters)
valid_dataset = valid_dataset.map(remove_special_characters)
test_dataset = test_dataset.map(remove_special_characters)

Map: 100%|██████████| 2130/2130 [00:00<00:00, 41766.17 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 34143.09 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 34148.81 examples/s]


##### 2. Creating Wav2Vec2CTCTokenizer

In [22]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [23]:
vocab_train = train_dataset.map(extract_all_chars, batched = True, batch_size = -1, keep_in_memory = True, remove_columns = train_dataset.column_names)
vocab_valid = valid_dataset.map(extract_all_chars, batched = True, batch_size = -1, keep_in_memory = True, remove_columns = valid_dataset.column_names)
vocab_test = test_dataset.map(extract_all_chars, batched = True, batch_size = -1, keep_in_memory = True, remove_columns = test_dataset.column_names)

Map: 100%|██████████| 2130/2130 [00:00<00:00, 532602.09 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 307477.80 examples/s]
Map: 100%|██████████| 922/922 [00:00<00:00, 307697.99 examples/s]


In [24]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_valid["vocab"][0]) | set(vocab_test["vocab"][0]))

In [25]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'s': 0,
 'u': 1,
 'o': 2,
 'a': 3,
 'e': 4,
 't': 5,
 'n': 6,
 'l': 7,
 'h': 8,
 'k': 9,
 'g': 10,
 'p': 11,
 'v': 12,
 'd': 13,
 'r': 14,
 ' ': 15,
 'j': 16,
 'w': 17,
 'b': 18,
 'm': 19,
 'z': 20,
 'y': 21,
 'x': 22,
 'i': 23,
 'f': 24,
 'c': 25}

In [26]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [27]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

28

In [28]:
import json
with open('../data/cv-corpus-6.1-indonesian/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

### III. Feature Extraction

The feature extractor used for this project is the Wav2Vec2 Feature Extractor.

In [29]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from datasets import Audio
import IPython.display as ipd
import numpy as np
import random

In [30]:
tokenizer = Wav2Vec2CTCTokenizer("../data/cv-corpus-6.1-indonesian/vocab.json", unk_token = "[UNK]", pad_token = "[PAD]", word_delimiter_token = "|")

In [31]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size = 1, sampling_rate = 16000, padding_value = 0.0, do_normalize = True, return_attention_mask = True)

In [32]:
processor = Wav2Vec2Processor(feature_extractor = feature_extractor, tokenizer = tokenizer)

In [33]:
train_dataset[0]

{'sentence': 'kamar adik lakilaki saya lebih sempit daripada kamar saya ',
 'audio': '../data/cv-corpus-6.1-indonesian/clips/common_voice_id_19192526.mp3'}

In [34]:
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate = 16000))
valid_dataset = valid_dataset.cast_column("audio", Audio(sampling_rate = 16000))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate = 16000))

In [35]:
train_dataset[0]

{'sentence': 'kamar adik lakilaki saya lebih sempit daripada kamar saya ',
 'audio': {'path': '../data/cv-corpus-6.1-indonesian/clips/common_voice_id_19192526.mp3',
  'array': array([-6.82121026e-12, -9.09494702e-13, -5.91171556e-12, ...,
         -3.77372089e-09, -1.57415094e-08, -1.17970558e-08]),
  'sampling_rate': 16000}}

In [36]:
rand_int = random.randint(0, len(train_dataset) - 1)

ipd.Audio(data = train_dataset[rand_int]["audio"]["array"], autoplay = True, rate = 16000)

In [37]:
rand_int = random.randint(0, len(train_dataset) - 1)

print("Target text:", train_dataset[rand_int]["sentence"])
print("Input array shape:", train_dataset[rand_int]["audio"]["array"].shape)
print("Sampling rate:", train_dataset[rand_int]["audio"]["sampling_rate"])

Target text: tampaknya dia berubah pikiran 
Input array shape: (58368,)
Sampling rate: 16000


### IV. Modeling

The model used for this projects are:
- XLSR-53
- Whisper

In [38]:
import torch

In [39]:
print("PyTorch CUDA Available:", torch.cuda.is_available())
print("PyTorch CUDA Version:", torch.version.cuda)
print("PyTorch cuDNN Enabled:", torch.backends.cudnn.enabled)

PyTorch CUDA Available: True
PyTorch CUDA Version: 11.8
PyTorch cuDNN Enabled: True


In [40]:
device = torch.device("cuda")

In [41]:
device

device(type='cuda')

In [42]:
def prepare_dataset(batch):
    audio = batch["audio"]
    
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [43]:
train_dataset = train_dataset.map(prepare_dataset, remove_columns = train_dataset.column_names, num_proc = 1)
valid_dataset = valid_dataset.map(prepare_dataset, remove_columns = valid_dataset.column_names, num_proc = 1)
test_dataset = test_dataset.map(prepare_dataset, remove_columns = test_dataset.column_names, num_proc = 1)

Map: 100%|██████████| 2130/2130 [00:09<00:00, 216.63 examples/s]
Map: 100%|██████████| 922/922 [00:03<00:00, 255.50 examples/s]
Map: 100%|██████████| 922/922 [00:03<00:00, 256.14 examples/s]


##### 1. XLSR-53

XLSR-53 is a pretrained model built on wav2vec 2.0 thas has been trained in 53 different languages. There are four importance elements, which are Feature Encoder, Quantization Module, Context Network, and Pretraining and Contrasive Loss. 

![XLSR-53 Architecture](../assets/xlsr-53.png)

Fig. 1. XLSR-53 Architecture

In [44]:
import evaluate
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding = self.padding,
            max_length = self.max_length,
            pad_to_multiple_of = self.pad_to_multiple_of,
            return_tensors = "pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding = self.padding,
                max_length = self.max_length_labels,
                pad_to_multiple_of = self.pad_to_multiple_of_labels,
                return_tensors = "pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [46]:
data_collator = DataCollatorCTCWithPadding(processor = processor, padding = True)

In [47]:
wer_metric = evaluate.load("wer")

In [48]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [49]:
xlsr53_model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout = 0.1,
    hidden_dropout = 0.1,
    feat_proj_dropout = 0.0,
    mask_time_prob = 0.05,
    layerdrop = 0.1,
    ctc_loss_reduction = "mean", 
    pad_token_id = processor.tokenizer.pad_token_id,
    vocab_size = len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
xlsr53_model.freeze_feature_extractor()



In [51]:
xlsr53_model.gradient_checkpointing_enable()

In [52]:
xlsr53_training_args = TrainingArguments(
  output_dir = "../models/xlsr-53",
  group_by_length = True,
  per_device_train_batch_size = 4,
  gradient_accumulation_steps = 1,
  evaluation_strategy = "steps",
  num_train_epochs = 30,
  fp16 = True,
  save_steps = 100,
  eval_steps = 100,
  logging_steps = 10,
  learning_rate = 3e-4,
  warmup_steps = 500,
  save_total_limit = 2,
)



In [53]:
xlsr53_trainer = Trainer(
    model = xlsr53_model,
    data_collator = data_collator,
    args = xlsr53_training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    tokenizer = processor.feature_extractor,
)

  xlsr53_trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [54]:
xlsr53_trainer.train()



Step,Training Loss,Validation Loss,Wer
100,5.9183,4.06602,1.0
200,2.9784,2.914909,1.0
300,2.8867,2.903507,1.0
400,2.9244,2.880877,1.0
500,2.825,2.82503,1.0
600,2.7625,2.791624,1.0
700,2.7518,2.756393,1.0
800,2.6902,2.693568,1.0
900,1.6942,1.537278,1.000561
1000,1.2959,1.07551,0.950093


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=15990, training_loss=0.4406403456183804, metrics={'train_runtime': 15242.4575, 'train_samples_per_second': 4.192, 'train_steps_per_second': 1.049, 'total_flos': 7.218536353117701e+18, 'train_loss': 0.4406403456183804, 'epoch': 30.0})

##### 2. Whisper

### V. Model Evaluation

##### 1. XLSR-53

In [55]:
xlsr53_test_result = xlsr53_trainer.evaluate(test_dataset)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [56]:
print(xlsr53_test_result)

{'eval_loss': 0.6989041566848755, 'eval_wer': 0.5340401785714286, 'eval_runtime': 39.6758, 'eval_samples_per_second': 23.238, 'eval_steps_per_second': 2.924, 'epoch': 30.0}


In [57]:
xlsr53_trainer.save_model("../models/xlsr-53")
processor.save_pretrained("../models/xlsr-53") 

[]

In [58]:
xlsr53_model = Wav2Vec2ForCTC.from_pretrained("../models/xlsr-53")
processor = Wav2Vec2Processor.from_pretrained("../models/xlsr-53")

### VI. Conclusion

<table>
    <thead>
        <td> Model </td>
        <td> Loss </td>
        <td> WER </td>
    </thead>
    <tbody>
        <tr>
            <td> XLSR-53 </td>
            <td> 0.6989 </td>
            <td> 0.5340 </td>
        </tr>
    </tbody>
</table>