In [None]:
import librosa
import soundfile
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import random

In [None]:
# Character table used by convert(): each key is a transcription character and
# each value is the ASCII code that stands for it (the codes look like X-SAMPA
# -- TODO confirm). convert() replaces every value found in a string with its
# key, trying longer codes first so that e.g. 'tS' is matched before 't' and 'S'.
# Entries whose key equals their value are no-ops for convert().
replace_pairs = {
    ' ': ' ',
    'a': 'a',
    'ā': 'a:',
    'b': 'b',
    'c': 'c',
    'č': 'tS',
    'd': 'd',
    'ḏ': 'D',
    'ḍ': 'd_?\\',
    'ḓ': 'D_?\\',
    'e': 'e',
    'ē': 'e:',
    'ə': '@',
    'f': 'f',
    'g': 'g',
    'ġ': 'R',
    'h': 'h',
    'ḥ': 'X\\',
    'i': 'i',
    'ī': 'i:',
    'k': 'k',
    'l': 'l',
    'm': 'm',
    'n': 'n',
    'o': 'o',
    'ō': 'o:',
    'p': 'p',
    'r': 'r',
    's': 's',
    'š': 'S',
    'ṣ': 's_?\\',
    't': 't',
    'ṯ': 'T',
    'ṭ': 't_?\\',
    'u': 'u',
    'ū': 'u:',
    'w': 'w',
    'x': 'x',
    'y': 'j',
    'z': 'z',
    'ž': 'Z',
    'ẓ': 'z_?\\',
    'ʕ': '?\\',
    'ʔ': '?'
 }

In [None]:
def convert(text):
    """Turn the ASCII codes found in ``text`` back into transcription characters.

    Every value of ``replace_pairs`` that occurs in ``text`` is replaced by
    its key. Codes are applied from longest to shortest (ties keep the
    table's own order), so a multi-character code is consumed before any
    shorter code it contains.

    Returns the converted string; ``text`` itself is not modified.
    """
    longest_first = sorted(replace_pairs.items(),
                           key=lambda pair: len(pair[1]),
                           reverse=True)
    for char, code in longest_first:
        text = text.replace(code, char)
    return text

In [None]:
def get_patches(table, min_length=75000):
    """Group the rows of a segmentation table into pause-delimited patches.

    The table is walked row by row. Rows whose ``MAU`` is not ``'<p:>'``
    extend the current patch: ``ORT`` is appended once per new ``TOKEN``
    value, the patch start is the ``BEGIN`` of its first row and the patch
    end is ``BEGIN + DURATION`` of its latest row. A ``'<p:>'`` row closes
    the current patch as follows:

    * if the patch is longer than ``min_length`` it is emitted on its own;
    * otherwise, if a patch was already emitted and the gap back to it is
      smaller than the distance from the current patch end to the end of
      this pause (plus one), the patch is appended to the previous one;
    * otherwise nothing is emitted and the patch keeps growing past the
      pause.

    Parameters
    ----------
    table : pandas.DataFrame
        Must provide the columns ``MAU``, ``TOKEN``, ``ORT``, ``BEGIN`` and
        ``DURATION``.
    min_length : int, optional
        Minimum ``end - start`` (same units as ``BEGIN``/``DURATION``) for
        a patch to be emitted on its own. Defaults to 75000.

    Returns
    -------
    pandas.DataFrame
        One row per patch with the columns ``begin``, ``end`` and ``text``
        (the space-joined words passed through ``convert``). The frame is
        empty when no patch was emitted.

    Notes
    -----
    Words collected after the last emitted or merged patch are dropped if
    no later ``'<p:>'`` row closes them.
    """
    words, start, end, token = [], None, None, -1
    files = []
    for _, row in table.iterrows():
        if row.MAU != '<p:>':
            if row.TOKEN != token:
                token = row.TOKEN
                words.append(row.ORT)
            # Compare against None explicitly: a patch may legitimately
            # start at 0, which a truthiness test would treat as "unset".
            if start is None:
                start = row.BEGIN
            end = row.BEGIN + row.DURATION
            continue

        if not words:
            # Pause with nothing collected yet.
            continue

        if end - start > min_length:
            files.append({
                'begin': start,
                'end': end,
                'text': convert(' '.join(words))
            })
        elif files and (start - files[-1]['end'] < (row.BEGIN + row.DURATION + 1) - end):
            # Short patch that sits closer to the previous patch than to
            # whatever follows this pause: extend the previous patch.
            files[-1].update({'end': end,
                              'text': files[-1]['text'] + ' ' + convert(' '.join(words))})
        else:
            # Short patch that cannot be merged backwards: keep collecting.
            continue
        words, start, end = [], None, None
    return pd.DataFrame(files)

In [None]:
def process_folder(markup_path: Path,
                   audio_path: Path,
                   out_path: Path,
                   label: str):
    """Cut every recording of a folder into clips and write a TSV index of them.

    For each entry of ``markup_path`` the ';'-separated table is read and the
    audio file of the same name with a ``.wav`` suffix is loaded from
    ``audio_path``. Each patch returned by ``get_patches`` is sliced out of
    the audio, resampled to 16 kHz and written to
    ``out_path/audio/<stem>_<idx>.wav``. The clip paths (relative to
    ``out_path``) and their texts are saved to ``out_path/<label>.tsv``.
    """
    clips_dir = out_path.joinpath('audio')
    clips_dir.mkdir(exist_ok=True, parents=True)

    records = []
    markup_files = list(markup_path.iterdir())
    for markup in tqdm(markup_files):
        table = pd.read_csv(markup, sep=';')
        file = audio_path.joinpath(markup.name).with_suffix('.wav')

        audio, sr = librosa.load(file, sr=None)
        length = len(audio) / sr
        # Sample rate implied by the markup: the end of its last row divided
        # by the audio duration in seconds. Reload at that rate if it differs
        # so that BEGIN/DURATION can be used as indices into ``audio``.
        last_row = table.iloc[-1]
        supposed_sr = round((last_row.BEGIN + last_row.DURATION) / length)
        if supposed_sr != sr:
            audio, sr = librosa.load(file, sr=supposed_sr)

        for idx, data in get_patches(table).iterrows():
            clip_name = f'{file.stem}_{idx}.wav'
            clip = librosa.resample(audio[data['begin']:data['end']],
                                    orig_sr=sr,
                                    target_sr=16000)
            soundfile.write(clips_dir.joinpath(clip_name),
                            data=clip,
                            samplerate=16000)
            records.append({'path': 'audio/' + clip_name,
                            'text': data['text']})

    index_file = out_path.joinpath(label+'.tsv')
    pd.DataFrame(records).to_csv(index_file, sep='\t', index=False)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# in the following cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the 'test' split: cut the recordings into clips and write test.tsv.
process_folder(
    markup_path=Path('/content/drive/MyDrive/dataset_raw/test_csv/'),
    audio_path=Path('/content/drive/MyDrive/dataset_raw/test/'),
    out_path=Path('/content/drive/MyDrive/dataset'),
    label='test',
)

100%|██████████| 43/43 [01:34<00:00,  2.19s/it]


In [None]:
# Build the 'train' split: cut the recordings into clips and write train.tsv.
process_folder(
    markup_path=Path('/content/drive/MyDrive/dataset_raw/train_csv/'),
    audio_path=Path('/content/drive/MyDrive/dataset_raw/train/'),
    out_path=Path('/content/drive/MyDrive/dataset'),
    label='train',
)

100%|██████████| 146/146 [04:26<00:00,  1.82s/it]


In [None]:
from IPython.display import Audio, display

def getfile(tsv: Path):
    """Show one randomly chosen entry of a clip index.

    Reads the tab-separated file ``tsv`` (expected to have exactly two
    columns: clip path and text), picks one row at random, and displays the
    text together with an autoplaying audio player for the clip. The clip
    path is resolved relative to the directory that contains ``tsv``.
    """
    rows = pd.read_csv(tsv, sep='\t').values
    rel_path, transcript = random.choice(rows)
    player = Audio(tsv.parent / rel_path, autoplay=True)
    display(transcript, player)

In [None]:
# Spot-check: play a random clip from the test split next to its text.
getfile(Path('/content/drive/MyDrive/dataset/test.tsv'))

'īlun farrōša ʕaḏaġṭəl mōya miščaġlin'