In [117]:
import re
import sys
import glob
import soundfile
import unicodedata

import pandas as pd
import sentencepiece as spm
import pyarabic.araby as araby


from tqdm import tqdm
from pathlib import Path
from datasets import load_from_disk
from sklearn.model_selection import train_test_split

In [118]:
# Directory handed to `load_from_disk`; it holds both the 'train' and 'validation' splits.
CLARTTS_HF_DIR = '/l/users/speech_lab/ClArTTS[ClArD_dataset]/ClArTTSBuckWalter/ClArD_HF_BuckWalter'
TEXT_COLUMNS = ['audio_path', 'arabic_text']

# Load the dataset once and reuse it for both splits instead of reading it from disk twice.
clartts_hf = load_from_disk(CLARTTS_HF_DIR)

# `.copy()` makes each frame independent of the full `to_pandas()` result, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
train = clartts_hf['train'].to_pandas()[TEXT_COLUMNS].copy()
test = clartts_hf['validation'].to_pandas()[TEXT_COLUMNS].copy()

# Keep only the last '/'-separated component of each audio path (the file name),
# which is what the TSV manifests are merged on later.
train['audio_path'] = train['audio_path'].apply(lambda x: x.split('/')[-1])
test['audio_path'] = test['audio_path'].apply(lambda x: x.split('/')[-1])

In [119]:
# Directory holding the original CLARTTS manifests.
HUBERT_LABEL_DIR = '/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/CLARTTS'
TRAIN_ORIGINAL_TSV = f'{HUBERT_LABEL_DIR}/train-original.tsv'
VALID_ORIGINAL_TSV = f'{HUBERT_LABEL_DIR}/valid-original.tsv'

# The first line of each manifest is skipped here; it is read separately below
# and reused as the header line of the regenerated manifests.
train_tsv = pd.read_csv(TRAIN_ORIGINAL_TSV, sep='\t', header=None, skiprows=1)
test_tsv = pd.read_csv(VALID_ORIGINAL_TSV, sep='\t', header=None, skiprows=1)

# Only the header line is needed, so `nrows=0` avoids parsing the whole file a second time.
clartts_tsv_path = pd.read_csv(TRAIN_ORIGINAL_TSV, sep='\t', nrows=0).columns[0]

train_tsv.columns = ['path', 'duration']
test_tsv.columns = ['path', 'duration']

train_tsv

Unnamed: 0,path,duration
0,ch_04_arabic_tts_dataset_87.wav,68097
1,ch_07_arabic_tts_dataset_16.wav,112002
2,ch_14_arabic_tts_dataset_389.wav,59138
3,ch_09_arabic_tts_dataset_552.wav,52802
4,ch_08_arabic_tts_dataset_59.wav,68225
...,...,...
9305,ch_02_arabic_tts_dataset_195.wav,49409
9306,ch_05_arabic_tts_dataset_73.wav,50434
9307,ch_14_arabic_tts_dataset_596.wav,50850
9308,ch_14_arabic_tts_dataset_231.wav,64225


In [120]:
# Lookup from Arabic-Indic digit characters to their ASCII digit counterparts.
map_numbers = {
    '٠': '0',
    '١': '1',
    '٢': '2',
    '٣': '3',
    '٤': '4',
    '٥': '5',
    '٦': '6',
    '٧': '7',
    '٨': '8',
    '٩': '9',
}

# Every code point below sys.maxunicode whose Unicode general category starts
# with 'P' (punctuation), followed by a hand-picked list of extra characters.
punctuations = ''.join(
    ch for ch in map(chr, range(sys.maxunicode))
    if unicodedata.category(ch).startswith('P')
)
punctuations += 'ݣ+=|$×⁄<>`åûݘ ڢ̇ پ\n'

def convert_numerals_to_digit(word):
    """Replace each character of `word` that is a key of `map_numbers` with its mapped value.

    Characters that are not keys of `map_numbers` are kept unchanged.

    Args:
        word: The string to convert, processed character by character.

    Returns:
        A new string with the substitutions applied.
    """
    return ''.join(map_numbers.get(char, char) for char in word)

def remove_diacritics(word):
    """Return `word` with diacritics removed.

    Thin wrapper that delegates entirely to `araby.strip_diacritics`.
    NOTE(review): exactly which marks are stripped is determined by pyarabic; confirm
    against its documentation if a specific mark must be kept or removed.

    Args:
        word: Text to clean (presumably an Arabic string; passed through as-is).

    Returns:
        Whatever `araby.strip_diacritics` returns for `word`.
    """
    return araby.strip_diacritics(word)
     

def remove_punctuation(word):
    """Delete the characters listed in `punctuations` from `word` and lowercase the result.

    '@', '%' and the space character are taken out of the deletion set first, so
    they are kept in the text even when `punctuations` contains them.

    Args:
        word: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Characters that will actually be deleted: everything in `punctuations`
    # except '@', '%' and ' '.
    chars_to_delete = re.sub('[@% ]', '', punctuations)
    deletion_table = str.maketrans('', '', chars_to_delete)
    cleaned = word.translate(deletion_table)
    return cleaned.lower()

def preprocess_arabic_text(text):
    """Normalize a transcript by running the three cleaning steps in a fixed order.

    The steps are: `remove_diacritics`, then `convert_numerals_to_digit`,
    then `remove_punctuation`.

    Args:
        text: The raw transcript string.

    Returns:
        The transcript after all three steps have been applied.
    """
    pipeline = (remove_diacritics, convert_numerals_to_digit, remove_punctuation)
    for step in pipeline:
        text = step(text)
    return text

In [121]:
# Inner-join the transcripts with the manifest rows on the audio file name.
# Rows present on only one side are dropped by the inner join.
df_train = train.merge(train_tsv, how='inner', left_on='audio_path', right_on='path')
df_test = test.merge(test_tsv, how='inner', left_on='audio_path', right_on='path')
df_train

Unnamed: 0,audio_path,arabic_text,path,duration
0,ch_20_arabic_tts_dataset_48.wav,لِأَنَّهُ لَا يَرَى أَنَّهُ عَلَى السَّفَهِ ثُ...,ch_20_arabic_tts_dataset_48.wav,84669
1,ch_05_arabic_tts_dataset_436.wav,الْعُمْرُ يَنْقُصُ وَالذُّنُوبُ تَزِيدُ,ch_05_arabic_tts_dataset_436.wav,48898
2,ch_20_arabic_tts_dataset_20.wav,قَلِيلَةً بَيْنَ الْمُكْثِرِينَ فَإِنَّ النَّاسَ,ch_20_arabic_tts_dataset_20.wav,53121
3,ch_16_arabic_tts_dataset_112.wav,إلَّا تَفَضُّلَا وَمِنْهَا الْعَقْلُ الَّذِي ي...,ch_16_arabic_tts_dataset_112.wav,81259
4,ch_16_arabic_tts_dataset_554.wav,إلَّا مَثَلٌ مَرْذُولٌ وَتَشْبِيهٌ مَعْلُولٌ,ch_16_arabic_tts_dataset_554.wav,55713
...,...,...,...,...
8565,ch_17_arabic_tts_dataset_580.wav,وَاجِدِهَا وَأَنْشَدَ أَبُو الْعَيْنَاءِ عَنْ ...,ch_17_arabic_tts_dataset_580.wav,58574
8566,ch_15_arabic_tts_dataset_577.wav,وَلَا تَعِبْ أَحَدًا مِنْهُمْ بِمَا فِيكَا الْ...,ch_15_arabic_tts_dataset_577.wav,89972
8567,ch_02_arabic_tts_dataset_384.wav,وَقَدْ قِيلَ فِي مَنْثُورِ الْحِكَمِ كَمْ مِنْ...,ch_02_arabic_tts_dataset_384.wav,69345
8568,ch_05_arabic_tts_dataset_531.wav,الْجَدْبَةِ فَيُفْضِي بِهِ الظَّنُّ إلَى الْهَ...,ch_05_arabic_tts_dataset_531.wav,69282


In [122]:
# Every utterance points at the same speaker-embedding file.
SPEAKER_EMBEDDING_PATH = 'speaker_embedding/CLARTTS_speaker_embedding.npy'

df_train['speaker_embedding'] = SPEAKER_EMBEDDING_PATH
df_test['speaker_embedding'] = SPEAKER_EMBEDDING_PATH

# Build the prefixed path from the untouched `path` column rather than from
# `audio_path` itself. After the inner merge on audio_path == path, the two hold
# the same file name, so the result is unchanged on the first run. Re-running
# this cell no longer stacks prefixes ('train/train/...').
df_train['audio_path'] = df_train['path'].apply(lambda x: f'train/{x}')
df_test['audio_path'] = df_test['path'].apply(lambda x: f'test/{x}')


In [123]:
# Clean every training transcript (diacritics, numerals, punctuation).
df_train['arabic_text'] = df_train['arabic_text'].apply(preprocess_arabic_text)

In [124]:
# Clean every validation transcript with the same pipeline used for the training split.
df_test['arabic_text'] = df_test['arabic_text'].apply(preprocess_arabic_text)

In [125]:
# Output locations: plain-text transcript files and the regenerated TSV manifests.
files = dict(
    tr_df='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/labels/CLARTTS/train.txt',
    te_df='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/labels/CLARTTS/valid.txt',
    clartts_train_tsv_path='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/CLARTTS/train.tsv',
    clartts_test_tsv_path='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/CLARTTS/valid.tsv',
)

### Text

In [115]:
# Write one cleaned transcript per line, with no index column and no header row.
for split_frame, target_key in ((df_train, 'tr_df'), (df_test, 'te_df')):
    split_frame['arabic_text'].to_csv(files[target_key], index=False, header=False)

In [116]:
# Columns written to each manifest row, in order.
manifest_columns = ['audio_path', 'duration', 'speaker_embedding']
# The header row carries the first header field of train-original.tsv in its
# first cell; the two remaining header cells are left empty.
manifest_header = [clartts_tsv_path, '', '']

for split_frame, target_key in (
    (df_train, 'clartts_train_tsv_path'),
    (df_test, 'clartts_test_tsv_path'),
):
    split_frame[manifest_columns].to_csv(
        files[target_key], sep='\t', index=False, header=manifest_header
    )

In [None]:
# Read a generated manifest back for a quick visual check.
# NOTE(review): `path` was never defined anywhere in the notebook, so this cell
# raised NameError on a fresh kernel. Pointing it at the regenerated train
# manifest is an assumption; change the key if another file was intended.
# pandas is already imported as `pd` in the notebook's import cell.
path = files['clartts_train_tsv_path']

pd.read_csv(path, header=None, sep='\t')