In [None]:
import re
import sys
import glob
import soundfile
import unicodedata

import pandas as pd
import sentencepiece as spm
import pyarabic.araby as araby


from tqdm import tqdm
from pathlib import Path
from datasets import load_from_disk
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Both splits come from the same on-disk dataset, so read it once and index
# into the result instead of loading the whole thing a second time.
MGB2_DATASET_DIR = '/l/users/speech_lab/MGB/HF_Datasets/MGB2HF_all_duration'
mgb2_splits = load_from_disk(MGB2_DATASET_DIR)
train = mgb2_splits['train']
test = mgb2_splits['test']

In [3]:
# Lookup from each Eastern Arabic (Arabic-Indic) digit to its ASCII digit.
map_numbers = {
    '٠': '0',
    '١': '1',
    '٢': '2',
    '٣': '3',
    '٤': '4',
    '٥': '5',
    '٦': '6',
    '٧': '7',
    '٨': '8',
    '٩': '9',
}

# Every code point below sys.maxunicode whose Unicode general category starts
# with 'P' (punctuation), in code-point order ...
punctuations = ''.join(
    chr(code_point)
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point)).startswith('P')
)
# ... followed by a hand-picked list of extra characters (symbols, a few
# letters, the space and the newline).
punctuations += 'ݣ+=|$×⁄<>`åûݘ ڢ̇ پ\n'

def convert_numerals_to_digit(word):
    """Replace every character of ``word`` that is a key of ``map_numbers``.

    Characters without an entry in ``map_numbers`` are copied through
    unchanged.

    Args:
        word: Text to convert, processed one character at a time.

    Returns:
        The converted text as a new string.
    """
    return ''.join(map_numbers.get(char, char) for char in word)

def remove_diacritics(word):
    """Return ``word`` after passing it through ``araby.strip_diacritics``.

    Args:
        word: Text to clean.

    Returns:
        Whatever ``araby.strip_diacritics`` returns for ``word``.
    """
    stripped = araby.strip_diacritics(word)
    return stripped
     

# Holds the translation table for the most recently seen value of
# ``punctuations``, so the regex substitution and ``str.maketrans`` run once
# rather than on every call.
_PUNCT_TABLE_CACHE = {}


def remove_punctuation(word):
    """Delete the characters listed in ``punctuations`` and lowercase the rest.

    '@', '%' and the space character are taken out of the deletion set first,
    so they are kept in the output even though ``punctuations`` contains them.

    Args:
        word: Text to clean.

    Returns:
        The lowercased text with the remaining ``punctuations`` characters
        removed.
    """
    table = _PUNCT_TABLE_CACHE.get(punctuations)
    if table is None:
        # Cache miss: either the first call, or ``punctuations`` was rebound
        # since the table was built. Rebuild and keep only the current entry.
        deletable = re.sub('[@% ]', '', punctuations)
        table = str.maketrans('', '', deletable)
        _PUNCT_TABLE_CACHE.clear()
        _PUNCT_TABLE_CACHE[punctuations] = table
    return word.translate(table).lower()

def preprocess_arabic_text(text):
    """Normalise a transcript by running the three cleaning steps in order.

    The steps are ``remove_diacritics``, then ``convert_numerals_to_digit``,
    then ``remove_punctuation``; each receives the previous step's output.

    Args:
        text: Raw transcript text.

    Returns:
        The text after all three steps have been applied.
    """
    for step in (remove_diacritics, convert_numerals_to_digit, remove_punctuation):
        text = step(text)
    return text

# Directory that every ``audio_path`` value is resolved against.
base = '/l/users/speech_lab/MGB/MGB2/_segmented'


def get_frames(fname):
    """Return the frame count ``soundfile`` reports for an audio file.

    Args:
        fname: Path of the audio file relative to ``base``.

    Returns:
        The ``frames`` attribute of ``soundfile.info`` for ``base/fname``.
    """
    full_path = "/".join((base, fname))
    info = soundfile.info(full_path)
    return info.frames

In [4]:
def _label_frame(split):
    """Copy the transcript, audio path and speaker name of ``split`` into a DataFrame.

    Note that the ``speaker_embedding`` column is filled from the split's
    ``speaker_name`` field.
    """
    frame = pd.DataFrame()
    for column, field in (
        ('arabic_text', 'arabic_text'),
        ('audio_path', 'audio_path'),
        ('speaker_embedding', 'speaker_name'),
    ):
        frame[column] = split[field]
    return frame


# train
tr_df = _label_frame(train)

# test
te_df = _label_frame(test)

In [5]:
# 'duration' holds the frame count returned by get_frames (not seconds).
te_df['duration'] = te_df['audio_path'].apply(get_frames)
# Normalise the test transcripts in place.
te_df['arabic_text'] = te_df['arabic_text'].apply(preprocess_arabic_text)

In [6]:
# Normalise the training transcripts in place.
tr_df['arabic_text'] = tr_df['arabic_text'].apply(preprocess_arabic_text)

In [7]:
# Output paths of the text-label files, keyed by the name of the DataFrame
# each one is written from.
files = dict(
    tr_df='/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/labels/train.txt',
    te_df='/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/labels/valid.txt',
)

In [8]:
# Output paths of the audio manifest files, keyed by the name of the
# DataFrame each one is written from.
audios = dict(
    tr_df='/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/hubert_labels/train.txt',
    te_df='/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/hubert_labels/valid.txt',
)

### Text labels — validation file (written from the `test` split)

In [15]:
# Rows whose frame count exceeds this are left out of the validation labels.
# 16000 * 40 would be 40 s of audio at a 16 kHz sample rate — TODO confirm the rate.
MAX_VALID_FRAMES = 16000 * 40

# One transcript per line. The encoding is pinned to UTF-8 because the
# transcripts are Arabic and open()'s default encoding depends on the locale.
with open(files['te_df'], 'w', encoding='utf-8') as f:
    for _, row in te_df.iterrows():
        if row['duration'] > MAX_VALID_FRAMES:
            continue
        # The column was already normalised when te_df was prepared; the call
        # is kept so this cell's output does not depend on that.
        print(preprocess_arabic_text(row['arabic_text']), file=f)

### Audio manifest — validation file (written from the `test` split)

In [23]:
# Rows whose frame count exceeds this are left out of the validation manifest.
# 16000 * 40 would be 40 s of audio at a 16 kHz sample rate — TODO confirm the rate.
MAX_VALID_FRAMES = 16000 * 40

# Manifest layout: the first line is `base` (the directory get_frames prepends
# to every audio_path), then one tab-separated line per kept row:
#   audio_path <TAB> frame count <TAB> speaker_embeddings/<speaker>.npy
# The encoding is pinned to UTF-8 so the file does not depend on the locale.
with open(audios['te_df'], 'w', encoding='utf-8') as f:
    print(base, file=f)
    # Not needed by the loop below; kept so the side effect of registering
    # tqdm's pandas integration is unchanged for any later cell.
    tqdm.pandas()
    for _, row in te_df.iterrows():
        if row['duration'] > MAX_VALID_FRAMES:
            continue
        print(
            "{}\t{}\tspeaker_embeddings/{}.npy".format(
                row['audio_path'], row['duration'], row['speaker_embedding']
            ),
            file=f,
        )

### Text labels — training file (written from the `train` split)

In [10]:
# Rows whose frame count exceeds this are left out of the training labels.
# 16000 * 20 would be 20 s of audio at a 16 kHz sample rate — TODO confirm the rate.
MAX_TRAIN_FRAMES = 16000 * 20

# One transcript per line. The encoding is pinned to UTF-8 because the
# transcripts are Arabic and open()'s default encoding depends on the locale.
with open(files['tr_df'], 'w', encoding='utf-8') as f:
    for _, row in tqdm(tr_df.iterrows(), total=tr_df.shape[0]):
        # tr_df has no precomputed frame-count column, so the audio file's
        # header is read here for every row.
        if get_frames(row['audio_path']) > MAX_TRAIN_FRAMES:
            continue
        # The column was already normalised when tr_df was prepared; the call
        # is kept so this cell's output does not depend on that.
        print(preprocess_arabic_text(row['arabic_text']), file=f)

  0%|          | 469/376011 [00:00<03:44, 1672.20it/s]

100%|██████████| 376011/376011 [13:10<00:00, 475.85it/s]


### Audio manifest — training file (written from the `train` split)

In [11]:
# Rows whose frame count exceeds this are left out of the training manifest.
# 16000 * 20 would be 20 s of audio at a 16 kHz sample rate — TODO confirm the rate.
MAX_TRAIN_FRAMES = 16000 * 20

# Manifest layout: the first line is `base` (the directory get_frames prepends
# to every audio_path), then one tab-separated line per kept row:
#   audio_path <TAB> frame count <TAB> speaker_embeddings/<speaker>.npy
# The encoding is pinned to UTF-8 so the file does not depend on the locale.
with open(audios['tr_df'], 'w', encoding='utf-8') as f:
    print(base, file=f)
    # Not needed by the loop below; kept so the side effect of registering
    # tqdm's pandas integration is unchanged for any later cell.
    tqdm.pandas()
    for _, row in tqdm(tr_df.iterrows(), total=tr_df.shape[0]):
        # Read the file header once and reuse the value for both the filter
        # and the manifest line.
        n_frames = get_frames(row['audio_path'])
        if n_frames > MAX_TRAIN_FRAMES:
            continue
        print(
            "{}\t{}\tspeaker_embeddings/{}.npy".format(
                row['audio_path'], n_frames, row['speaker_embedding']
            ),
            file=f,
        )

100%|██████████| 376011/376011 [04:57<00:00, 1263.24it/s]
