## Install Library

In [12]:
%%capture
!pip install -U datasets
!pip install transformers==4.4.0
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install pythainlp

## Prepare Data, Tokenizer, Feature Extractor

### Create Wav2Vec2CTCTokenizer

download datasets
*   download datasets from common_voice



In [13]:
from datasets import load_dataset, load_metric

common_voice_train = load_dataset("common_voice", "th", split="train+validation")
common_voice_test = load_dataset("common_voice", "th", split="test")

Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e)
Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e)


remove columns

In [14]:
common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

function show data

In [15]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [16]:
#show_random_elements(common_voice_train.remove_columns(["path"]), num_examples=20)

Let's do some pre-processing step such as:
*   Thai word tokenizing
*   Remove special *character*



In [17]:
import re
from pythainlp.tokenize import word_tokenize

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

## For Thai NLP Library, please feel free to check https://pythainlp.github.io/docs/2.2/api/tokenize.html
def th_tokenize(batch):
    batch["sentence"] = " ".join(word_tokenize(batch["sentence"], engine="newmm"))
    return batch

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

In [18]:
common_voice_train = common_voice_train.map(th_tokenize).map(remove_special_characters)
common_voice_test = common_voice_test.map(th_tokenize).map(remove_special_characters)

Loading cached processed dataset at /root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e/cache-f0a97d7feb2ac2cf.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e/cache-1f951a649e2514d1.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e/cache-7c3e3a9407fc2501.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e/cache-0a618ae67c2d1442.arrow


In [19]:
#show_random_elements(common_voice_train.remove_columns(["path"]))

import transdict

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
#not work is traing 2 language thai and english(transliteration)
#import pandas as pd
#df = pd.read_csv("https://raw.githubusercontent.com/wannaphong/thai-english-transliteration-dictionary/main/dict.tsv", on_bad_lines = "skip" ,sep='\t')

In [22]:
#df["th"]

func for replace thaitranliteration2eng

In [23]:
# def thai2transliteration(batch, df = df):
#   word = batch["sentence"].split(" ")
#   trans = [list(df['th']),list(df['en'])]
#   for i in word:
#     if i in trans[0]:
#       batch["sentence"] = batch["sentence"].replace(i, "".join(df[df["th"] == i]["en"].values))
#   return batch

In [24]:
#common_voice_train = common_voice_train.map(thai2transliteration)
#common_voice_test = common_voice_test.map(thai2transliteration)

extract char for ctc

In [25]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [26]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [27]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [28]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
print("len vocab: {}".format(len(vocab_dict)))
vocab_dict

len vocab: 69


{' ': 8,
 "'": 65,
 'ก': 66,
 'ข': 19,
 'ค': 39,
 'ฆ': 59,
 'ง': 15,
 'จ': 57,
 'ฉ': 20,
 'ช': 21,
 'ซ': 35,
 'ญ': 44,
 'ฎ': 41,
 'ฏ': 3,
 'ฐ': 67,
 'ฑ': 4,
 'ฒ': 0,
 'ณ': 24,
 'ด': 52,
 'ต': 6,
 'ถ': 9,
 'ท': 62,
 'ธ': 32,
 'น': 16,
 'บ': 63,
 'ป': 42,
 'ผ': 26,
 'ฝ': 64,
 'พ': 14,
 'ฟ': 54,
 'ภ': 34,
 'ม': 56,
 'ย': 23,
 'ร': 5,
 'ฤ': 11,
 'ล': 36,
 'ว': 46,
 'ศ': 18,
 'ษ': 17,
 'ส': 43,
 'ห': 58,
 'ฬ': 22,
 'อ': 27,
 'ฮ': 7,
 'ะ': 31,
 'ั': 1,
 'า': 48,
 'ำ': 2,
 'ิ': 40,
 'ี': 51,
 'ึ': 45,
 'ื': 28,
 'ุ': 47,
 'ู': 12,
 'เ': 29,
 'แ': 68,
 'โ': 61,
 'ใ': 37,
 'ไ': 49,
 'ๅ': 33,
 'ๆ': 55,
 '็': 50,
 '่': 10,
 '้': 13,
 '๊': 25,
 '๋': 38,
 '์': 30,
 'ํ': 60,
 '’': 53}

In [29]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [30]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

71

In [31]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [32]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

### Create XLSR-Wav2Vec2 Feature Extractor

In [33]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [34]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [35]:
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [36]:
processor.save_pretrained("/content/gdrive/MyDrive/Colab Notebooks/HuggingFace/thai-keyword-asr")

In [37]:
import torchaudio

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch

In [38]:
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)

  0%|          | 0/4839 [00:00<?, ?ex/s]

  0%|          | 0/2188 [00:00<?, ?ex/s]

In [39]:
import librosa
import numpy as np

def resample(batch):
    batch["speech"] = librosa.resample(np.asarray(batch["speech"]), 48_000, 16_000)
    batch["sampling_rate"] = 16_000
    return batch

In [40]:
common_voice_train = common_voice_train.map(resample, num_proc=4)
common_voice_test = common_voice_test.map(resample, num_proc=4)

        

#0:   0%|          | 0/1210 [00:00<?, ?ex/s]

#1:   0%|          | 0/1210 [00:00<?, ?ex/s]

#3:   0%|          | 0/1209 [00:00<?, ?ex/s]

#2:   0%|          | 0/1210 [00:00<?, ?ex/s]

        

#1:   0%|          | 0/547 [00:00<?, ?ex/s]

#0:   0%|          | 0/547 [00:00<?, ?ex/s]

#2:   0%|          | 0/547 [00:00<?, ?ex/s]

#3:   0%|          | 0/547 [00:00<?, ?ex/s]

In [41]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(common_voice_train))

ipd.Audio(data=np.asarray(common_voice_train[rand_int]["speech"]), autoplay=False, rate=16000)

In [42]:
rand_int = random.randint(0, len(common_voice_train))

print("Target text:", common_voice_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(common_voice_train[rand_int]["speech"]).shape)
print("Sampling rate:", common_voice_train[rand_int]["sampling_rate"])

Target text: คน นี้ อุทิศ ให้ กับ สลัม ที่ ฉัน มา 
Input array shape: (54528,)
Sampling rate: 16000


In [43]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [44]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)

        

#0:   0%|          | 0/152 [00:00<?, ?ba/s]

#1:   0%|          | 0/152 [00:00<?, ?ba/s]

#2:   0%|          | 0/152 [00:00<?, ?ba/s]

#3:   0%|          | 0/152 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


        

#0:   0%|          | 0/69 [00:00<?, ?ba/s]

#1:   0%|          | 0/69 [00:00<?, ?ba/s]

#2:   0%|          | 0/69 [00:00<?, ?ba/s]

#3:   0%|          | 0/69 [00:00<?, ?ba/s]

  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)
  tensor = as_tensor(value)


In [45]:
common_voice_train

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 4839
})

## Training

### Set-up Trainer

In [46]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [47]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [48]:
wer_metric = load_metric("wer")

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [49]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [50]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Downloading:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_q.bias', 'project_hid.weight', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to u

In [51]:
model.freeze_feature_extractor()

In [52]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="/content/gdrive/MyDrive/Colab Notebooks/HuggingFace/thai-keyword-asr",
  #output_dir="./wav2vec2-large-xlsr-thai-demo",
  group_by_length=True,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=20,
  fp16=True,
  save_steps=200,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
  load_best_model_at_end=True
)

In [53]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

### Training

In [None]:
#trainer.train( resume_from_checkpoint = True ) if you want to resume training from checkpoint
trainer.train()

In [None]:
#trainer.save_model("/content/gdrive/MyDrive/Colab Notebooks/HuggingFace/wav2vec2-large-xlsr-thai-demo")

In [None]:
input_dict = processor(common_voice_test["input_values"][100], return_tensors="pt", padding=True)

logits = model(input_dict.input_values.to("cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

It is strongly recommended to pass the ``sampling_rate`` argument to this function.Failing to do so can result in silent errors that might be hard to debug.


In [None]:
common_voice_test_transcription = load_dataset("common_voice", "th", data_dir="./cv-corpus-6.1-2020-12-11", split="test")

Using custom data configuration th-24242e0bc11eddb3


Downloading and preparing dataset common_voice/th (download: 325.49 MiB, generated: 576.90 MiB, post-processed: Unknown size, total: 902.39 MiB) to /root/.cache/huggingface/datasets/common_voice/th-24242e0bc11eddb3/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e...


Generating train split:   0%|          | 0/2917 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2188 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating other split:   0%|          | 0/2671 [00:00<?, ? examples/s]

Generating validated split:   0%|          | 0/7028 [00:00<?, ? examples/s]

Generating invalidated split:   0%|          | 0/467 [00:00<?, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/th-24242e0bc11eddb3/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e. Subsequent calls will reuse this data.


In [None]:
print("Prediction:")
print(processor.decode(pred_ids))

print("\nReference:")
print(common_voice_test_transcription["sentence"][100].lower())


Prediction:
fแชแชแp

Reference:
ไม่จำเป็นต้องมี


In [None]:
input_dict = processor(common_voice_test["input_values"][1], return_tensors="pt", padding=True)

logits = model(input_dict.input_values.to("cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

It is strongly recommended to pass the ``sampling_rate`` argument to this function.Failing to do so can result in silent errors that might be hard to debug.


In [None]:
print("Prediction:")
print(processor.decode(pred_ids))

print("\nReference:")
print(common_voice_test_transcription["sentence"][1].lower())

Prediction:
fแชแชแชแช

Reference:
ส่งให้ตรงนี้แหละ โดโด้บอก
