## Install library

In [2]:
%%capture
!pip install -U datasets
!pip install transformers==4.4.0
!pip install pythainlp
!pip install librosa
!pip install torchaudio
!pip install jiwer
!pip install pandas

# Read dictionary

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
# Load the dictionary file from the mounted Drive.
# on_bad_lines='skip' silently drops rows pandas cannot parse, so the frame may
# hold fewer entries than the file has lines.
df = pd.read_csv("/content/drive/MyDrive/google_colab_file/dict.txt", on_bad_lines='skip')

In [5]:
from datasets import load_dataset

# Load the "th" configuration of the common_voice dataset, concatenating the
# train and validation splits into a single dataset.
common_voice_th = load_dataset("common_voice", "th", split="train+validation")

Downloading builder script:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading and preparing dataset common_voice/th (download: 325.49 MiB, generated: 576.90 MiB, post-processed: Unknown size, total: 902.39 MiB) to /root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e...


Downloading data:   0%|          | 0.00/341M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2917 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2188 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating other split:   0%|          | 0/2671 [00:00<?, ? examples/s]

Generating validated split:   0%|          | 0/7028 [00:00<?, ? examples/s]

Generating invalidated split:   0%|          | 0/467 [00:00<?, ? examples/s]

Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/th/6.1.0/a1dc74461f6c839bfe1e8cf1262fd4cf24297e3fbd4087a711bd090779023a5e. Subsequent calls will reuse this data.


In [6]:
# Drop the listed metadata columns; the remaining columns are left untouched.
common_voice_th = common_voice_th.remove_columns(['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])

In [7]:
import re
from pythainlp.tokenize import word_tokenize

# Character class of punctuation/symbols stripped from every sentence.
# Written as a raw string so the backslashes reach the regex engine unchanged
# and Python does not warn about invalid string escape sequences; the set of
# matched characters is the same as with the previous non-raw literal.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

## For the PyThaiNLP tokenizer API, see https://pythainlp.github.io/docs/2.2/api/tokenize.html
def th_tokenize(batch):
    """Word-segment ``batch["sentence"]`` and store it back space-separated.

    The text is split with ``word_tokenize`` using the "newmm" engine and the
    resulting tokens are joined with a single space between each pair.

    Returns the same ``batch`` object, modified in place.
    """
    tokens = word_tokenize(batch["sentence"], engine="newmm")
    batch["sentence"] = " ".join(tokens)
    return batch

def remove_special_characters(batch):
    """Strip ignored characters from ``batch["sentence"]`` and normalise it.

    Every match of ``chars_to_ignore_regex`` is deleted, the remainder is
    lower-cased, and one trailing space is appended.

    Returns the same ``batch`` object, modified in place.
    """
    stripped = re.sub(chars_to_ignore_regex, '', batch["sentence"])
    lowered = stripped.lower()
    batch["sentence"] = lowered + " "
    return batch

In [8]:
# Strip the ignored punctuation first, then word-segment the cleaned text.
common_voice_th = common_voice_th.map(remove_special_characters).map(th_tokenize)

  0%|          | 0/4839 [00:00<?, ?ex/s]

  0%|          | 0/4839 [00:00<?, ?ex/s]

In [9]:
# Inspect the column names of the dictionary frame.
df.columns

Index(['th', 'en', 'check'], dtype='object')

## thai keyword -> transliteration

In [10]:
# Show the 'en' value(s) stored in the dictionary for one Thai keyword.
print(df.loc[df["th"] == "ทวิตเตอร์", "en"].values)

['twitter']


In [11]:
def thai2transliteration(batch, df = df):
  """Replace Thai keywords in ``batch["sentence"]`` with their transliteration.

  The sentence is split on single spaces and each token that exactly equals a
  value in ``df["th"]`` is swapped for the matching ``df["en"]`` value. Only
  whole tokens are replaced, so a keyword that merely appears inside a longer
  token leaves that token untouched.

  Args:
    batch: mapping with a space-separated string under "sentence".
    df: frame with "th" (keyword) and "en" (replacement) columns.

  Returns:
    The same ``batch`` object with "sentence" rewritten.
  """
  # Build a hash lookup once per call instead of scanning a list per token.
  # Rows whose key or value is not a string (e.g. NaN from the CSV) are
  # skipped, and the first row wins when a keyword is listed more than once.
  lookup = {}
  for th_word, en_word in zip(df["th"], df["en"]):
    if isinstance(th_word, str) and isinstance(en_word, str) and th_word:
      lookup.setdefault(th_word, en_word)

  # split(" ") / " ".join round-trips the original spacing exactly, including
  # runs of several spaces (they become empty tokens, which never match).
  tokens = batch["sentence"].split(" ")
  batch["sentence"] = " ".join(lookup.get(token, token) for token in tokens)
  return batch

In [12]:
# Apply the dictionary-based keyword replacement to every sentence.
common_voice_th = common_voice_th.map(thai2transliteration)

  0%|          | 0/4839 [00:00<?, ?ex/s]

In [13]:
def findWord(batch, word):
  """Return ``(index, sentence)`` pairs for sentences that contain ``word``.

  ``batch["sentence"]`` is scanned in order; a sentence is reported when
  ``word`` occurs anywhere inside it (substring test).
  """
  return [
    (position, sentence)
    for position, sentence in enumerate(batch["sentence"])
    if word in sentence
  ]


In [14]:
# Copy the processed sentences into a one-column frame with an explicit
# 0..n-1 integer index.
transSentence = pd.DataFrame(
    common_voice_th["sentence"],
    index=list(range(len(common_voice_th["sentence"]))),
    columns=['sentence'],
)

In [15]:
def findTrans(batch, df = df):
  """Find which dictionary ``en`` entries occur in the sentences of ``batch``.

  An entry counts as found when it is a substring of a sentence, so short
  entries also match inside longer words and multi-word entries match across
  the spaces between tokens.

  Args:
    batch: object whose ``["sentence"]`` yields the sentences to search.
    df: frame with an "en" column holding the entries to look for.

  Returns:
    A tuple ``(sentences, entries)`` of two sets: the sentences containing at
    least one entry, and the entries found in at least one sentence.
  """
  # Search the data that was passed in rather than a notebook global, and read
  # the column once instead of wrapping it in a throw-away DataFrame.
  sentences = [s for s in batch["sentence"] if isinstance(s, str)]
  result = set()
  trans = set()
  for entry in df["en"]:
    # Non-string cells (e.g. NaN) cannot be used with `in`, and an empty
    # string would trivially match every sentence.
    if not isinstance(entry, str) or not entry:
      continue
    hits = [s for s in sentences if entry in s]
    if hits:
      trans.add(entry)
      result.update(hits)
  return result, trans
    

In [16]:
# findTrans returns two sets; only the second one (the dictionary 'en' entries
# that were found in the sentences) is kept.
_,trans = findTrans(common_voice_th)

In [17]:
# Show the set of dictionary entries found in the sentences.
print(trans)

{'site', 'e-mail', 'dough', 'utopia', 'bill', 'guitar', 'look', 'upload', 'chin', 'pound', 'bass', 'microphone', 'script', 'break', 'plan', 'tape', 'charlie', 'gold', 'whisky', 'ream', 'duchess', 'asean', 'camp', 'autism', 'bad', 'bra', 'chess', 'christmas', 'pingpong', 'phone', 'europe', 'shop', 'load', 'pack', 'alcohol', 'football', 'war room', 'baseball', 'prism', 'support', 'calculus', 'oxide', 'london', 'lock', 'centigrade', 'clip', 'chris', 'plum', 'chop', 'car', 'ball', 'stew', 'hockey', 'mug', 'package', 'sauce', 'boar', 'nylon', 'digital', 'cheque', 'lab', 'file', 'style', 'bar', 'bacon', 'chat', 'ham', 'note', 'tom', 'siren', 'virus', 'chlorine', 'aig', 'unicorn', 'rock', 'skate', 'jelly', 'john', 'mail', 'admin', 'idea', 'hormone', 'roulette', 'market', 'web', 'cheese', 'keyboard', 'iron', 'rap', 'farm', 'campaign', 'cycle', 'client', 'bow', 'arm', 'violin', 'star', 'art', 'bartender', 'lift', 'summer', 'games', 'vietnam', 'caravan', 'atom', 'tv', 'arctic', 'ice cream', 'fir

In [21]:
# An empty search string is a substring of every sentence and would list the
# whole dataset; search for the keyword that the recorded hits contain.
findWord(common_voice_th, "website")

[(1257, 'website นี้ มี แอพ ให้ ใช่ มั้ย  '),
 (1746, 'website ต้อง เคารพ ความเป็นส่วนตัว ของ ผู้ใช้งาน  '),
 (3945, 'ตรวจสอบ คุณสมบัติ หลัก ใน website  '),
 (4100,
  'program ค้นหา ที่อยู่ ของ website ที่ ฉัน ชื่นชอบ คือ   google   และ   บิ ง  ')]

# data to audio

In [None]:
# Inspect the first example to see which fields each row carries.
common_voice_th[0]

{'audio': {'array': array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 8.1002712e-05,
         5.5789948e-05, 3.8623810e-05], dtype=float32),
  'path': '/root/.cache/huggingface/datasets/downloads/extracted/3fa1d036f7dd806dce529a685d419d747ac2330f88abf8f1009d6b0ab683e8e7/cv-corpus-6.1-2020-12-11/th/clips/common_voice_th_23654854.mp3',
  'sampling_rate': 48000},
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/3fa1d036f7dd806dce529a685d419d747ac2330f88abf8f1009d6b0ab683e8e7/cv-corpus-6.1-2020-12-11/th/clips/common_voice_th_23654854.mp3',
 'sentence': 'เงียบ หน่อย   เจ้า หนู '}

In [None]:
import torchaudio

def speech_file_to_array_fn(batch):
  """Load the audio file of one example and attach it to the row.

  Reads ``batch["path"]`` with ``torchaudio.load`` and writes three keys:
  "speech" (element 0 of the loaded tensor as a NumPy array; presumably the
  first channel), "sampling_rate" (the rate reported by the loader) and
  "target_text" (a copy of ``batch["sentence"]``).

  Returns the same ``batch`` object, modified in place.
  """
  waveform, rate = torchaudio.load(batch["path"])
  first_row = waveform[0]
  batch["speech"] = first_row.numpy()
  batch["sampling_rate"] = rate
  batch["target_text"] = batch["sentence"]
  return batch

In [None]:
# Load the audio for every row; remove_columns drops all pre-existing columns.
common_voice_th = common_voice_th.map(speech_file_to_array_fn, remove_columns=common_voice_th.column_names)

  0%|          | 0/7510 [00:00<?, ?ex/s]

In [None]:
import librosa
import numpy as np

def resample(batch, target_sr=16000):
  """Resample ``batch["speech"]`` to ``target_sr`` Hz.

  The source rate is taken from ``batch["sampling_rate"]`` when the row has
  one, falling back to 48000 otherwise, so clips recorded at another rate (or
  rows that were already resampled) are handled correctly.

  Args:
    batch: mapping with the waveform under "speech".
    target_sr: output sampling rate in Hz (default 16000).

  Returns:
    The same ``batch`` object with "speech" and "sampling_rate" updated.
  """
  source_sr = batch["sampling_rate"] if "sampling_rate" in batch else 48000
  # The rates are passed by keyword: recent librosa releases no longer accept
  # them positionally.
  batch["speech"] = librosa.resample(
    np.asarray(batch["speech"]),
    orig_sr=source_sr,
    target_sr=target_sr,
  )
  batch["sampling_rate"] = target_sr
  return batch

In [None]:
# Resample every row, spreading the work over 4 worker processes.
common_voice_th = common_voice_th.map(resample, num_proc=4)

        

#1:   0%|          | 0/1878 [00:00<?, ?ex/s]

#0:   0%|          | 0/1878 [00:00<?, ?ex/s]

#2:   0%|          | 0/1877 [00:00<?, ?ex/s]

#3:   0%|          | 0/1877 [00:00<?, ?ex/s]

In [None]:
import IPython.display as ipd
import numpy as np 

# Play back one processed clip as a sanity check. The index is clamped to the
# last row because the train+validation split loaded earlier has fewer than
# 6666 rows, and the playback rate is read from the row itself.
sample_index = min(6665, len(common_voice_th) - 1)
sample = common_voice_th[sample_index]
ipd.Audio(data=np.asarray(sample['speech']), rate=sample['sampling_rate'])