In [55]:
import re
import sys
import glob
import soundfile
import unicodedata

import pandas as pd
import sentencepiece as spm
import pyarabic.araby as araby


from tqdm import tqdm
from pathlib import Path
from datasets import load_from_disk
from sklearn.model_selection import train_test_split

In [56]:
# Deserialize the on-disk HF dataset once and reuse it for both splits
# (previously the same directory was loaded twice, once per split).
ASC_DATASET_DIR = '/l/users/speech_lab/ArabicSpeechCorpus/ArabicSpeechCorpusHF'
asc_dataset = load_from_disk(ASC_DATASET_DIR)

train = asc_dataset['train'].to_pandas()
# NOTE: `test` holds the dataset's 'validation' split.
test = asc_dataset['validation'].to_pandas()

# Keep only the last '/'-separated component of each audio path so it can be
# compared against the bare file names used elsewhere in the notebook.
train['audio_path'] = train['audio_path'].apply(lambda x: x.split('/')[-1])
test['audio_path'] = test['audio_path'].apply(lambda x: x.split('/')[-1])

In [57]:
# Display the `train` frame to inspect its columns and a sample of its rows.
train

Unnamed: 0,audio_path,buckwalter_text,phonetic,arabic_text,audio
0,ARA NORM 0002.wav,waraj~aHa Alt~aqoriyru Al~a*iy >aEad~ahu maEoh...,w a r a' jj A H a tt A q r ii0' r u0 ll a * i0...,وَرَجَّحَ التَّقْرِيرُ الَّذِي أَعَدَّهُ مَعْه...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,ARA NORM 0003.wav,mim~aA qado yu&ad~iy <ilaY taraAjuEi masaAHaAt...,m i0' mm aa q A' d y u0 < a' dd ii0 Ah i0 l aa...,مِمَّا قَدْ يُؤَدِّي إِلَى تَرَاجُعِ مَسَاحَات...,"[7.7429485e-07, -2.5241652e-06, 5.307166e-06, ..."
2,ARA NORM 0004.wav,wa*akara Alt~aqoriyru >ana taraAjuEa masaAHapi...,w a * a' k a r a tt A q r ii0' r u0 Ah a n a t...,وَذَكَرَ التَّقْرِيرُ أَنَ تَرَاجُعَ مَسَاحَةِ...,"[-5.8334217e-07, 3.2189453e-06, -7.502435e-06,..."
3,ARA NORM 0005.wav,bayonahaA nahoraA yaluw wayaAnogotsiy - fiy Al...,b a' y n a h aa n a' h r aa y a' l uu0 w a y a...,بَيْنَهَا نَهْرَا يَلُو وَيَانْغْتسِي - فِي ال...,"[2.0966183e-06, -4.012324e-06, 6.8732256e-06, ..."
4,ARA NORM 0006.wav,wafiy Al$~awoTi Al^~aAniy AsotaEaAda baAriysu ...,w a' f i0 $$ a' w T I0 ^^ aa' n i0 s t a E aa'...,وَفِي الشَّوْطِ الثَّانِي اسْتَعَادَ بَارِيسُ ...,"[-4.1914433e-07, 7.0548987e-07, -1.1553973e-06..."
...,...,...,...,...,...
1808,ARA NORM 1810.wav,qay~wmN - waqay~awmN,q A yy uu0' m u1 n w A q A yy a' w m u1 n sil,قَيّومٌ - وَقَيَّومٌ,"[-3.293482e-08, 4.1448864e-08, -4.822517e-08, ..."
1809,ARA NORM 1811.wav,qaw~aAmN - waqaw~wmN,q A ww aa' m u1 n w A q A ww uu0' m u1 n sil,قَوَّامٌ - وَقَوّومٌ,"[2.2569596e-12, -2.698441e-12, -1.0749269e-12,..."
1810,ARA NORM 1812.wav,qaw~ymN - waqaw~aymN,q A ww ii0' m u1 n w A q A ww a' y m u1 n sil,قَوّيمٌ - وَقَوَّيمٌ,"[-1.3809912e-08, 1.649002e-08, -1.8424654e-08,..."
1811,ARA NORM 1813.wav,quy~ymN - waqiy~wmN,q U0 yy ii0' m u1 n w A q II0 y uu0' m u1 n sil,قُيّيمٌ - وَقِيّومٌ,"[4.3427735e-12, 1.3239106e-13, -2.6478316e-12,..."


In [58]:
# Manifest files: the first line is a header, every following line is a
# tab-separated (path, duration) record.
ASC_TRAIN_MANIFEST = '/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/ASC/train-original.tsv'
ASC_VALID_MANIFEST = '/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/ASC/valid-original.tsv'

# Skip the header line and read the records without column names.
train_tsv = pd.read_csv(ASC_TRAIN_MANIFEST, sep='\t', header=None, skiprows=1)
test_tsv = pd.read_csv(ASC_VALID_MANIFEST, sep='\t', header=None, skiprows=1)

# Only the first field of the header line is needed, so parse the header alone
# (nrows=0) instead of reading the whole manifest a second time.
asc_tsv_path = pd.read_csv(ASC_TRAIN_MANIFEST, sep='\t', nrows=0).columns[0]

train_tsv.columns = ['path', 'duration']
test_tsv.columns = ['path', 'duration']

train_tsv

Unnamed: 0,path,duration
0,ARA NORM 0548.wav,92301
1,ARA NORM 1769.wav,94021
2,ARA NORM 0129.wav,185228
3,ARA NORM 0599.wav,338061
4,ARA NORM 0707.wav,82466
...,...,...
1808,ARA NORM 0902.wav,91021
1809,ARA NORM 1751.wav,105907
1810,ARA NORM 1214.wav,74781
1811,ARA NORM 0853.wav,99147


In [59]:
# Lookup table from each Arabic-Indic digit character to its ASCII digit.
map_numbers = {
    arabic_indic: western
    for western, arabic_indic in (
        ('0', '٠'), ('1', '١'), ('2', '٢'), ('3', '٣'), ('4', '٤'),
        ('5', '٥'), ('6', '٦'), ('7', '٧'), ('8', '٨'), ('9', '٩'),
    )
}

# Every code point below sys.maxunicode whose Unicode general category starts
# with 'P' (punctuation), in code point order ...
punctuations = ''.join(
    char
    for char in map(chr, range(sys.maxunicode))
    if unicodedata.category(char).startswith('P')
)
# ... followed by a hand-picked list of extra characters.
punctuations += 'ݣ+=|$×⁄<>`åûݘ ڢ̇ پ\n'

def convert_numerals_to_digit(word):
    """Return `word` with every character that is a key of `map_numbers`
    replaced by its mapped value; all other characters are kept as they are.
    """
    return ''.join(map_numbers.get(char, char) for char in word)

def remove_diacritics(word):
    """Return `word` after passing it through `araby.strip_diacritics`."""
    stripped = araby.strip_diacritics(word)
    return stripped
     

def remove_punctuation(word):
    """Delete the characters listed in `punctuations` from `word` and
    lowercase the result.

    '@', '%' and the space character are taken out of the deletion set first,
    so they survive in the returned string.
    """
    deletable = re.sub('[@% ]', '', punctuations)
    deletion_table = str.maketrans('', '', deletable)
    cleaned = word.translate(deletion_table)
    return cleaned.lower()

def preprocess_arabic_text(text):
    """Normalize `text` by applying, in order: `remove_diacritics`,
    `convert_numerals_to_digit`, then `remove_punctuation`.
    """
    for step in (remove_diacritics, convert_numerals_to_digit, remove_punctuation):
        text = step(text)
    return text

In [60]:
# Attach the manifest columns to each split by matching the audio file name.
# The merge uses pandas' default join type (inner), so rows whose file name is
# missing from either frame are dropped.
merge_keys = dict(left_on='audio_path', right_on='path')
df_train = train.merge(train_tsv, **merge_keys)
df_test = test.merge(test_tsv, **merge_keys)
df_train

Unnamed: 0,audio_path,buckwalter_text,phonetic,arabic_text,audio,path,duration
0,ARA NORM 0002.wav,waraj~aHa Alt~aqoriyru Al~a*iy >aEad~ahu maEoh...,w a r a' jj A H a tt A q r ii0' r u0 ll a * i0...,وَرَجَّحَ التَّقْرِيرُ الَّذِي أَعَدَّهُ مَعْه...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",ARA NORM 0002.wav,244457
1,ARA NORM 0003.wav,mim~aA qado yu&ad~iy <ilaY taraAjuEi masaAHaAt...,m i0' mm aa q A' d y u0 < a' dd ii0 Ah i0 l aa...,مِمَّا قَدْ يُؤَدِّي إِلَى تَرَاجُعِ مَسَاحَات...,"[7.7429485e-07, -2.5241652e-06, 5.307166e-06, ...",ARA NORM 0003.wav,105360
2,ARA NORM 0004.wav,wa*akara Alt~aqoriyru >ana taraAjuEa masaAHapi...,w a * a' k a r a tt A q r ii0' r u0 Ah a n a t...,وَذَكَرَ التَّقْرِيرُ أَنَ تَرَاجُعَ مَسَاحَةِ...,"[-5.8334217e-07, 3.2189453e-06, -7.502435e-06,...",ARA NORM 0004.wav,223287
3,ARA NORM 0005.wav,bayonahaA nahoraA yaluw wayaAnogotsiy - fiy Al...,b a' y n a h aa n a' h r aa y a' l uu0 w a y a...,بَيْنَهَا نَهْرَا يَلُو وَيَانْغْتسِي - فِي ال...,"[2.0966183e-06, -4.012324e-06, 6.8732256e-06, ...",ARA NORM 0005.wav,58082
4,ARA NORM 0006.wav,wafiy Al$~awoTi Al^~aAniy AsotaEaAda baAriysu ...,w a' f i0 $$ a' w T I0 ^^ aa' n i0 s t a E aa'...,وَفِي الشَّوْطِ الثَّانِي اسْتَعَادَ بَارِيسُ ...,"[-4.1914433e-07, 7.0548987e-07, -1.1553973e-06...",ARA NORM 0006.wav,162449
...,...,...,...,...,...,...,...
1808,ARA NORM 1810.wav,qay~wmN - waqay~awmN,q A yy uu0' m u1 n w A q A yy a' w m u1 n sil,قَيّومٌ - وَقَيَّومٌ,"[-3.293482e-08, 4.1448864e-08, -4.822517e-08, ...",ARA NORM 1810.wav,52123
1809,ARA NORM 1811.wav,qaw~aAmN - waqaw~wmN,q A ww aa' m u1 n w A q A ww uu0' m u1 n sil,قَوَّامٌ - وَقَوّومٌ,"[2.2569596e-12, -2.698441e-12, -1.0749269e-12,...",ARA NORM 1811.wav,54129
1810,ARA NORM 1812.wav,qaw~ymN - waqaw~aymN,q A ww ii0' m u1 n w A q A ww a' y m u1 n sil,قَوّيمٌ - وَقَوَّيمٌ,"[-1.3809912e-08, 1.649002e-08, -1.8424654e-08,...",ARA NORM 1812.wav,56436
1811,ARA NORM 1813.wav,quy~ymN - waqiy~wmN,q U0 yy ii0' m u1 n w A q II0 y uu0' m u1 n sil,قُيّيمٌ - وَقِيّومٌ,"[4.3427735e-12, 1.3239106e-13, -2.6478316e-12,...",ARA NORM 1813.wav,48112


In [67]:
# Every row of both splits gets the same speaker-embedding path string.
for split_df in (df_train, df_test):
    split_df['speaker_embedding'] = 'speaker_embedding/ASC_speaker_embedding.npy'

In [68]:
# Normalize the training transcripts in place with preprocess_arabic_text.
df_train['arabic_text'] = df_train['arabic_text'].apply(preprocess_arabic_text)

In [69]:
# Normalize the held-out transcripts in place with preprocess_arabic_text.
df_test['arabic_text'] = df_test['arabic_text'].apply(preprocess_arabic_text)

In [70]:
# Output locations: two text label files and two TSV manifests.
# NOTE(review): the 'clartts_*' key names look like leftovers from another
# corpus' notebook; the paths themselves point at the ASC directories.
files = dict(
    tr_df='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/labels/ASC/train.txt',
    te_df='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/labels/ASC/valid.txt',
    clartts_train_tsv_path='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/ASC/train.tsv',
    clartts_test_tsv_path='/l/users/speech_lab/_SpeechT5PretrainDataset/Finetune/TTS/hubert_labels/ASC/valid.tsv',
)

### Text

In [71]:
# Write the transcripts of each split to its label file: one column,
# no index and no header row.
for split_df, label_path in (
    (df_train, files['tr_df']),
    (df_test, files['te_df']),
):
    split_df['arabic_text'].to_csv(label_path, index=False, header=False)

In [72]:
# Write one tab-separated manifest per split. The header row carries
# `asc_tsv_path` in the first column and empty names for the other two.
manifest_columns = ['audio_path', 'duration', 'speaker_embedding']
manifest_header = [asc_tsv_path, '', '']
for split_df, manifest_path in (
    (df_train, files['clartts_train_tsv_path']),
    (df_test, files['clartts_test_tsv_path']),
):
    split_df[manifest_columns].to_csv(manifest_path, sep='\t', index=False, header=manifest_header)