### Forced Alignment for Multilingual Data
This notebook demonstrates how to perform forced alignment on multilingual data with torchaudio's `MMS_FA` pipeline. Given an audio file and its transcript, it aligns the audio with the text and cuts the recording into one clip per transcript line. The transcript file must be segmented manually beforehand, with one utterance per line.

In [None]:
!pip install uroman tha

Collecting uroman
  Downloading uroman-1.3.1.1-py3-none-any.whl.metadata (18 kB)
Collecting tha
  Downloading tha-0.1.4-py3-none-any.whl.metadata (5.8 kB)
Collecting urlextract (from tha)
  Downloading urlextract-1.9.0-py3-none-any.whl.metadata (5.8 kB)
Collecting phonenumbers (from tha)
  Downloading phonenumbers-9.0.5-py2.py3-none-any.whl.metadata (11 kB)
Collecting ftfy (from tha)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting uritools (from urlextract->tha)
  Downloading uritools-5.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading uroman-1.3.1.1-py3-none-any.whl (930 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m930.7/930.7 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tha-0.1.4-py3-none-any.whl (20 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading phonenumbers-9.0.5-py2.py3-none-any

In [None]:
import torch
import torchaudio
from typing import List, Tuple
from torchaudio.pipelines import MMS_FA as bundle
import IPython
import matplotlib.pyplot as plt
import re
import os
from tha.decimals import processor
import uroman as ur
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Load the MMS forced-alignment model from the torchaudio bundle and move it to `device`.
model = bundle.get_model()
model.to(device)
# Tokenizer and aligner from the same bundle; both are used by `compute_alignments`.
tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()

# Romanizer instance used by `normalize_transcript` to romanize the Khmer transcript lines.
uroman = ur.Uroman()

Downloading: "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt" to /root/.cache/torch/hub/checkpoints/model.pt
100%|██████████| 1.18G/1.18G [00:12<00:00, 97.9MB/s]


In [None]:
import re

def convert_number_to_khmer(text: str):
    '''
    Spell out a numeric string as Khmer words.

    The conversion itself is delegated to `processor` (imported from
    `tha.decimals`); this wrapper only removes the '▁' marker characters
    from its output.

    Examples (input -> output):
        '1231.23' -> មួយពាន់ពីររយសាមសិបមួយចុចម្ភៃបី
        '12323'   -> មួយម៉ឺនពីរពាន់បីរយម្ភៃបី
        '១២៣១'    -> មួយពាន់ពីររយសាមសិបមួយ
        '១២.៣១'   -> ដប់ពីរចុចសាមសិបមួយ
    '''
    spelled_out = processor(text)
    return spelled_out.replace('▁', '')

def normalize_uroman(text):
    '''
    Reduce romanized text to the character set kept for alignment.

    The text is lower-cased, the typographic apostrophe is replaced by a plain
    one, every character other than a-z, apostrophe and space is removed, runs
    of spaces are collapsed to one, and surrounding whitespace is trimmed.
    '''
    lowered = text.lower().replace("’", "'")
    # Drop everything outside a-z, apostrophe and space.
    kept = re.sub("([^a-z' ])", "", lowered)
    # Removing characters can leave several spaces in a row; squeeze them.
    squeezed = re.sub(' +', ' ', kept)
    return squeezed.strip()

def normalize_transcript(text_path: str) -> Tuple[List[str], List[str]]:
    '''
    Read a transcript file and prepare every line for forced alignment.

    Each line is stripped and cleared of zero-width spaces (U+200B). Digit runs
    are spelled out with `convert_number_to_khmer`, the line is romanized with
    `uroman` (lcode='khm'), cleaned with `normalize_uroman`, and its spaces are
    removed.

    Lines whose romanized form is empty (blank lines, lines made only of
    symbols) are dropped from BOTH outputs, so the two lists always have the
    same length and index i refers to the same utterance in each.

    text_path: path to a UTF-8 transcript file with one utterance per line.

    Returns `(kh_texts_in_latins, kh_texts)`: the romanized strings and the
    corresponding cleaned original lines.
    '''
    # Context manager closes the handle; explicit UTF-8 keeps the Khmer text
    # from being decoded with whatever the platform default happens to be.
    with open(text_path, encoding='utf-8') as transcript_file:
        lines = [line.strip().replace('\u200b', '') for line in transcript_file]

    kh_texts_in_latins: List[str] = []
    kh_texts: List[str] = []
    for line in lines:
        # In a str pattern `\d` matches any Unicode decimal digit, so this covers
        # Arabic and Khmer digits alike. A callable replacement substitutes every
        # digit run where it stands: no `\b` is needed (it fails when a number
        # touches a Khmer letter, since both sides are word characters) and the
        # converted text is never interpreted as a replacement template.
        spelled_out = re.sub(
            r'\d+',
            lambda match: convert_number_to_khmer(match.group()),
            line,
        )

        romanized = uroman.romanize_string(spelled_out, lcode='khm')
        latin = normalize_uroman(romanized).replace(' ', '')
        if not latin:
            # Nothing left to align; skip it in both lists to keep them parallel.
            continue

        kh_texts_in_latins.append(latin)
        kh_texts.append(line)

    return kh_texts_in_latins, kh_texts

In [None]:
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    '''
    Run the acoustic model on `waveform` and align `transcript` against its output.

    waveform: audio tensor; it is moved to `device` before the forward pass.
    transcript: list of strings handed to `tokenizer`. Callers in this notebook
        pass the romanized lines produced by `normalize_transcript`.

    Returns `(emission, token_spans)`: the model's emission output and the
    result of `aligner` applied to `emission[0]` and the tokenized transcript.
    '''
    # inference_mode disables autograd bookkeeping for the forward pass and alignment.
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
        token_ids = tokenizer(transcript)
        token_spans = aligner(emission[0], token_ids)
    return emission, token_spans

In [None]:
from  scipy.io import wavfile
import soundfile as sf
import numpy as np
import librosa

def save_utterance(path: str, waveform: torch.Tensor, spans: List[torch.Tensor], num_frames: int, transcript: List[str], sample_rate: int = bundle.sample_rate):
    '''
    Cut one utterance out of `waveform` and write it to `path` with `soundfile`.

    path: destination file for the extracted clip.
    waveform: full recording; dimension 1 is indexed as the sample axis.
    spans: token spans of the utterance; the clip runs from `spans[0].start`
        to `spans[-1].end`, both expressed in emission frames.
    num_frames: number of emission frames for the whole recording, used to
        convert frame indices into sample indices.
    transcript: text of the utterance; accepted for reference only and not
        used when extracting the audio.
    sample_rate: sample rate written to the file (defaults to `bundle.sample_rate`).
    '''
    # Number of waveform samples covered by a single emission frame.
    samples_per_frame = waveform.size(1) / num_frames
    first_sample = int(samples_per_frame * spans[0].start)
    last_sample = int(samples_per_frame * spans[-1].end)

    clip = waveform[:, first_sample:last_sample].numpy()
    # Flatten to a 1-D array before writing.
    sf.write(path, np.ravel(clip), sample_rate)


In [None]:
def algin_audio_text(audio_path: str, transcript_path: str, out_dir: str):
    '''
    Align one recording with its transcript and write one clip per transcript line.

    For every utterance index i returned by the aligner, two files are written
    to `out_dir`: `<audio name>_<i>.wav` with the audio segment and
    `<audio name>_<i>.txt` with the matching original transcript line.

    audio_path: recording to split; loaded with `librosa.load` at `bundle.sample_rate`.
    transcript_path: transcript file, read through `normalize_transcript`.
    out_dir: output directory; created if it does not exist yet.
    '''
    waveform_raw, _ = librosa.load(audio_path, sr=bundle.sample_rate)
    latin_texts, khm_texts = normalize_transcript(transcript_path)

    # Add a leading dimension so the sample axis is dimension 1, as `save_utterance` expects.
    waveform = torch.tensor(waveform_raw).unsqueeze(0)
    emission, token_spans = compute_alignments(waveform, latin_texts)
    num_frames = emission.size(1)

    # splitext removes the extension whatever its length (.mp3, .flac, ...).
    base_name = os.path.splitext(os.path.basename(audio_path))[0]

    # sf.write/open do not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    for idx, spans in enumerate(token_spans):
        stem = '{}_{}'.format(base_name, idx)
        save_utterance(os.path.join(out_dir, stem + '.wav'), waveform, spans, num_frames, latin_texts[idx])
        # Explicit UTF-8 so the Khmer transcript is written the same way on every platform.
        with open(os.path.join(out_dir, stem + '.txt'), 'w', encoding='utf-8') as text_file:
            text_file.write(khm_texts[idx])

In [None]:
# Single-file run: align /content/1.mp3 with /content/1.txt and write the results to /content/res.
algin_audio_text("/content/1.mp3", '/content/1.txt', '/content/res')

In [None]:
import shutil
# Pack the contents of /content/res into /content/res.zip.
shutil.make_archive('/content/res', 'zip', '/content/res')

'/content/res.zip'

In [None]:
%%bash
# Remove the single-file results and their archive, then recreate an empty output directory.
rm -rf /content/res.zip /content/res
mkdir /content/res

### Multiple Files

In [None]:
from natsort import natsorted

In [None]:
# Align every recording under /content/AMS/<folder>/ with its transcript and write
# the clips to /content/res/<folder>/<audio name>/.
folders = os.listdir('/content/AMS')
folders = natsorted(folders)
# Skip macOS metadata and any other non-directory entry (os.listdir on a file raises).
folders = [
    folder for folder in folders
    if folder != '.DS_Store' and os.path.isdir('/content/AMS/{}'.format(folder))
]
for folder in folders:
    # List the folder once and split its entries by extension.
    entries = os.listdir('/content/AMS/{}'.format(folder))
    file_texts = natsorted([f for f in entries if f.endswith('.txt')])
    file_audios = natsorted([f for f in entries if f.endswith('.mp3')])

    # Transcripts and recordings are paired by sorted position. zip() silently
    # drops the surplus, so report a count mismatch instead of hiding it.
    if len(file_texts) != len(file_audios):
        print(f'Warning: {folder} has {len(file_texts)} transcripts but {len(file_audios)} audio files')

    for file_text, file_audio in zip(file_texts, file_audios):
        output_path = os.path.join('/content/res', folder, file_audio[:-4])
        os.makedirs(output_path, exist_ok=True)
        try:
            algin_audio_text(
                '/content/AMS/{}/{}'.format(folder, file_audio),
                '/content/AMS/{}/{}'.format(folder, file_text),
                output_path
            )
        except Exception as err:
            # Best effort: one bad file must not stop the batch, but say why it failed.
            # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
            file_name = '/content/AMS/{}/{}'.format(folder, file_audio)
            print(f'Error: {file_name} ({type(err).__name__}: {err})')
            continue
    t = os.path.join('/content/res', folder)
    print(f'✅ Finish {t}.')

✅ Finish /content/res/1.
✅ Finish /content/res/2.
✅ Finish /content/res/3.
Error: /content/AMS/4/1.mp3
✅ Finish /content/res/4.
✅ Finish /content/res/5.
✅ Finish /content/res/7.
✅ Finish /content/res/8.
✅ Finish /content/res/11.
✅ Finish /content/res/12.
✅ Finish /content/res/13.
✅ Finish /content/res/14.
✅ Finish /content/res/15.
✅ Finish /content/res/16.
✅ Finish /content/res/18.
✅ Finish /content/res/19.
✅ Finish /content/res/20.
✅ Finish /content/res/21.
✅ Finish /content/res/22.
✅ Finish /content/res/23.
✅ Finish /content/res/24.
✅ Finish /content/res/25.
✅ Finish /content/res/26.
✅ Finish /content/res/27.
✅ Finish /content/res/28.
✅ Finish /content/res/29.
✅ Finish /content/res/30.
✅ Finish /content/res/31.
✅ Finish /content/res/32.
✅ Finish /content/res/33.
✅ Finish /content/res/34.
✅ Finish /content/res/35.
✅ Finish /content/res/36.
✅ Finish /content/res/37.
✅ Finish /content/res/38.
✅ Finish /content/res/39.
✅ Finish /content/res/40.
✅ Finish /content/res/41.
✅ Finish /content

### Push to Hugging Face Hub

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
from datasets import Features, Audio, Value, Dataset

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate with the Hugging Face Hub. The token is fetched from Colab's `userdata`
# store under the key 'hg-main', so it never appears in the notebook itself.
login(token=userdata.get('hg-main'))

In [None]:
base_path = '/content/res'

# Parallel columns: data["audio"][i] is the clip whose transcript is data["text"][i].
data = {
    "audio": [],
    "text": [],
}

# Layout walked here: <base_path>/<folder>/<sub_folder>/<name>.wav + <name>.txt
folders = os.listdir(base_path)
folders = natsorted(folders)
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    if not os.path.isdir(folder_path):
        # Stray files at the top level would make os.listdir raise.
        continue

    sub_folders = os.listdir(folder_path)
    sub_folders = [
        sub_folder for sub_folder in sub_folders
        if sub_folder != '.ipynb_checkpoints'
        and os.path.isdir(os.path.join(folder_path, sub_folder))
    ]
    sub_folders = natsorted(sub_folders)
    print(sub_folders)

    for sub_folder in sub_folders:
        clip_dir = os.path.join(folder_path, sub_folder)
        # Sort once; filtering a sorted list keeps its order.
        files = natsorted(os.listdir(clip_dir))
        text_files = [f for f in files if f.endswith('.txt')]
        audio_files = [f for f in files if f.endswith('.wav')]

        for text_file, audio_file in zip(text_files, audio_files):
            audio_path = os.path.join(clip_dir, audio_file)
            text_path = os.path.join(clip_dir, text_file)
            # Explicit UTF-8 so the Khmer transcripts decode the same on every platform.
            with open(text_path, 'r', encoding='utf-8') as f:
                text = f.read()

            data["audio"].append(audio_path)
            data["text"].append(text)

['0', '4', '5', '6', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '4', '5', '6', '7', '8', '9']
['0', '1', '5', '7', '8', '9']
['0', '1']
['3', '4', '5', '6', '7']
['4']
['3', '4', '7', '9']
['0', '4', '5', '6', '9']
['0', '8', '9']
['0', '1', '4', '5', '6', '8', '9']
['0', '1', '2']
['1', '2', '3', '8', '9']
['1', '2', '3', '4', '8', '9']
['1', '2', '3', '4', '5', '6', '7', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['0', '1', '2', '3', '4', '5'

In [None]:
# Dataset schema: an audio column with a 16 kHz sampling rate and a plain-string transcript column.
# NOTE(review): 16000 presumably matches the rate the clips were written at (bundle.sample_rate) — confirm.
feature = Features({
    "audio": Audio(sampling_rate=16000),
    "text": Value("string"),
})

In [None]:
# Build the dataset from the collected clip paths and transcripts, typed by the `feature` schema.
dataset = Dataset.from_dict(
    data,
    features=feature
)
print(dataset)

Dataset({
    features: ['audio', 'text'],
    num_rows: 14543
})


In [None]:
# Inspect one example: the audio entry (path, sample array, sampling rate) and its transcript.
dataset[1]

{'audio': {'path': '/content/res/1/0/0_1.wav',
  'array': array([ 0.00073242,  0.00094604,  0.00109863, ..., -0.00335693,
         -0.00622559, -0.00894165]),
  'sampling_rate': 16000},
 'text': 'ក្រសួងសុខាភិបាលបានឱ្យដឹងថា កាលពីពេលថ្មីៗនេះ បណ្តាញផ្សព្វផ្សាយព័ត៌មានមួយចំនួនបានផ្សាយព័ត៌មានស្តីពីការផ្ទុះឡើងនៃយូមិនមេតាភ្នឺម៉ូវីរុស (human metapneumovirus) បង្កឱ្យមានជំងឺផ្លូវដង្ហើម'}

In [None]:
from IPython.display import Audio as au

In [None]:
# Play the same example inline to check that the clip matches its transcript.
au(dataset[1]['audio']['array'], rate=16000)

In [None]:
# Upload the dataset to the Hugging Face Hub repository 'PhanithLIM/ams-speech-dataset'.
dataset.push_to_hub(repo_id='PhanithLIM/ams-speech-dataset', commit_message='Add AMS speech dataset')

Uploading the dataset shards:   0%|          | 0/11 [00:00<?, ?it/s]

Map:   0%|          | 0/1323 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1322 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/PhanithLIM/ams-speech-dataset/commit/3b7c160ea25e16ba0307ec201e2a972bc97d0a3c', commit_message='Add AMS speech dataset', commit_description='', oid='3b7c160ea25e16ba0307ec201e2a972bc97d0a3c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/PhanithLIM/ams-speech-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='PhanithLIM/ams-speech-dataset'), pr_revision=None, pr_num=None)