In [1]:
import re
import sys
import glob
import soundfile
import unicodedata

import pandas as pd
import sentencepiece as spm
import pyarabic.araby as araby


from tqdm import tqdm
from pathlib import Path
from datasets import load_from_disk
from sklearn.model_selection import train_test_split

In [2]:
# Load the saved dataset once and take both splits from the same object
# instead of reading the directory from disk twice.
qasr_tts = load_from_disk('/l/users/speech_lab/QASR_TTS/QASRTTS_HF')
train = qasr_tts['train']
test = qasr_tts['validation']

In [3]:
# Inspect the first training example (every field, including the raw audio samples).
train[0]

{'audio_path': '/l/users/speech_lab/QASR_TTS/Khadija_wav/wavs/1D195228-25FD-4006-8702-AA2059345B26_59.wav',
 'arabic_text': 'وَدَعا مَحْكَمَةْ الْجِناياتْ الدَّوْليَّةْ لِلتَّحْقيقْ مَعَها',
 'buckwalter_text': 'wadaEA maHokamapo AlojinAyAto Ald~awoly~apo lilta~Hoqyqo maEahA',
 'audio': [0.0024402819108217955,
  0.0005709364777430892,
  -0.004275619052350521,
  0.00741900410503149,
  0.010417826473712921,
  -0.0037584856618195772,
  -0.00204633385874331,
  0.0058802105486392975,
  0.00321106961928308,
  -0.0033372046891599894,
  0.003296030219644308,
  0.008304969407618046,
  0.0030475130770355463,
  0.004886271432042122,
  0.009371147491037846,
  0.003869204316288233,
  0.00015810040349606425,
  0.00046365620801225305,
  -0.002480176044628024,
  0.006234270520508289,
  0.0012137008598074317,
  0.00021446257596835494,
  0.0075608729384839535,
  0.013512585312128067,
  0.006424752529710531,
  0.0014989384217187762,
  0.007430405355989933,
  0.006314303260296583,
  0.008778889663517475,


In [5]:
# Convert the training split to a pandas DataFrame and preview it.
# NOTE(review): `train` is rebound to the converted object, so re-running this
# cell presumably fails (a DataFrame has no `to_pandas`) — consider a separate
# name such as `train_df`.
train = train.to_pandas()
train.head()

Unnamed: 0,audio_path,arabic_text,buckwalter_text,audio
0,/l/users/speech_lab/QASR_TTS/Khadija_wav/wavs/...,وَدَعا مَحْكَمَةْ الْجِناياتْ الدَّوْليَّةْ لِ...,wadaEA maHokamapo AlojinAyAto Ald~awoly~apo li...,"[0.002440282, 0.0005709365, -0.004275619, 0.00..."
1,/l/users/speech_lab/QASR_TTS/Khadija_wav/wavs/...,بَعْضْ التَّقاريرْ تَتَحَدَّثْ عَنْ سِتّْ مِلْ...,baEoDo Alt~aqAryro tataHad~a^o Eano sit~o milo...,"[0.0028782238, 0.0019992762, 0.0038911556, 0.0..."
2,/l/users/speech_lab/QASR_TTS/Khadija_wav/wavs/...,شُكْرًا جَزيلًا لِلدُّكْتورْ خَليلْ الْعَناني,$ukorFA jazylFA lild~ukotwro xalylo AloEanAny,"[0.00015331607, 6.593444e-05, 0.00016579649, 7..."
3,/l/users/speech_lab/QASR_TTS/Khadija_wav/wavs/...,الْحالْ الَّذي وَصَلَتْ إِلَيْهِ الْبِلادْ,AloHAlo Al~a*y waSalato <ilayohi AlobilAdo,"[-0.0015696047, -0.0026458022, -0.0015093002, ..."
4,/l/users/speech_lab/QASR_TTS/Khadija_wav/wavs/...,يَعْني سَتُطيلْ عُمْرْ الْمَرْحَلَةْ الْاِنْتِ...,yaEony satuTylo Eumoro AlomaroHalapo AloAinoti...,"[-0.001962228, -0.0041532167, 0.0038702546, 0...."


In [8]:
# pd.concat only accepts pandas objects, so the validation split has to be
# converted as well. The conversion goes to a separate name (and is skipped if
# `test` already is a DataFrame) so that this cell can be re-run safely and
# `test` itself is left untouched for later cells.
test_df = test if isinstance(test, pd.DataFrame) else test.to_pandas()
df = pd.concat([train, test_df])
df.shape

(2431, 4)

In [9]:
# Preview the last rows of the combined frame.
df.tail()

Unnamed: 0,audio_path,arabic_text,buckwalter_text,audio
190,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,هَلْ تَعْتَقِدْ أَنَّ هَذَا أَحْدَثَ رَدَّةْ ف...,halo taEotaqido >an~a ha*aA >aHoda^a rad~apo f...,"[0.016486067, 0.026456939, 0.027729986, 0.0304..."
191,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,سَيِّدْ الفْطِيسِي مَا حَقِيقَةْ الْوَضْعْ عِن...,say~ido AlfoTiysiy maA Haqiyqapo AlowaDoEo Ein...,"[-0.0004141984, -0.00037751562, 5.692558e-05, ..."
192,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,جَرَّاءَ حَمْلَةِ الِاعْتِقَالَاتْ اَلَّتِي اس...,jara~A'a Hamolapi AliAEotiqaAlaAto Aal~atiy As...,"[-0.0020030432, -0.002475719, -0.002262379, -0..."
193,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,اَلَّتِي أَحَاطَتْ بِعَمَلِهَا مُنْذُ تَشْكِيل...,Aal~atiy >aHaATato biEamalihaA muno*u ta$okiyl...,"[-0.00032070698, -0.00032377313, -0.0002144109..."
194,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,الْمُسْتَشَارْ فِي الْمَرْكَزْ الْعِرَاقِيّْ ل...,Alomusota$aAro fiy Alomarokazo AloEiraAqiy~o l...,"[-0.008429403, -0.011023204, -0.009712422, -0...."


In [10]:
import numpy as np

In [13]:
# 512 evenly spaced integer cut points from 0 to the number of rows in `df`
# (both endpoints included).
index = np.linspace(start=0, stop=len(df), num=512, dtype=int)
len(index)

512

In [19]:
# Position of the last entry in `index`.
len(index) -1 

511

In [15]:
# Second-to-last cut point, i.e. where the final chunk starts.
index[510]

2426

In [20]:
# Rows of the final chunk, selected by position: from index[510] up to,
# but not including, index[511].
df.iloc[index[510]:index[511]]

Unnamed: 0,audio_path,arabic_text,buckwalter_text,audio
190,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,هَلْ تَعْتَقِدْ أَنَّ هَذَا أَحْدَثَ رَدَّةْ ف...,halo taEotaqido >an~a ha*aA >aHoda^a rad~apo f...,"[0.016486067, 0.026456939, 0.027729986, 0.0304..."
191,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,سَيِّدْ الفْطِيسِي مَا حَقِيقَةْ الْوَضْعْ عِن...,say~ido AlfoTiysiy maA Haqiyqapo AlowaDoEo Ein...,"[-0.0004141984, -0.00037751562, 5.692558e-05, ..."
192,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,جَرَّاءَ حَمْلَةِ الِاعْتِقَالَاتْ اَلَّتِي اس...,jara~A'a Hamolapi AliAEotiqaAlaAto Aal~atiy As...,"[-0.0020030432, -0.002475719, -0.002262379, -0..."
193,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,اَلَّتِي أَحَاطَتْ بِعَمَلِهَا مُنْذُ تَشْكِيل...,Aal~atiy >aHaATato biEamalihaA muno*u ta$okiyl...,"[-0.00032070698, -0.00032377313, -0.0002144109..."
194,/l/users/speech_lab/QASR_TTS/Mahmoud_wav/wavs/...,الْمُسْتَشَارْ فِي الْمَرْكَزْ الْعِرَاقِيّْ ل...,Alomusota$aAro fiy Alomarokazo AloEiraAqiy~o l...,"[-0.008429403, -0.011023204, -0.009712422, -0...."


In [3]:
# Lookup table from Arabic-Indic digits to their ASCII counterparts.
map_numbers = {
    '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
    '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
}

# Every code point whose Unicode general category starts with 'P' (punctuation),
# followed by a hand-picked list of extra characters.
punctuations = ''.join(
    char
    for char in map(chr, range(sys.maxunicode))
    if unicodedata.category(char).startswith('P')
)
punctuations += 'ݣ+=|$×⁄<>`åûݘ ڢ̇ پ\n'

def convert_numerals_to_digit(word):
    """Replace every character of `word` that is a key of `map_numbers`.

    Each such character is swapped for its mapped value; all other
    characters are passed through unchanged. Returns the rebuilt string.
    """
    return ''.join(map_numbers.get(char, char) for char in word)

def remove_diacritics(word):
    """Return `word` after passing it through `araby.strip_diacritics`."""
    stripped = araby.strip_diacritics(word)
    return stripped
     

def remove_punctuation(word):
    """Delete the characters listed in `punctuations` from `word` and lowercase it.

    '@', '%' and the space character are exempt from deletion even though
    they may appear in `punctuations`.
    """
    exempt = '@% '
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(ord(char) for char in punctuations if char not in exempt)
    return word.translate(deletions).lower()

def preprocess_arabic_text(text):
    """Normalize `text` by running it through three steps, in this order:

    1. `remove_diacritics`
    2. `convert_numerals_to_digit`
    3. `remove_punctuation`
    """
    return remove_punctuation(convert_numerals_to_digit(remove_diacritics(text)))

# Root directory that relative audio paths are resolved against.
base = '/l/users/speech_lab/MGB/MGB2/_segmented'
def get_frames(fname):
    """Return the frame count of an audio file.

    Parameters
    ----------
    fname : str
        Path of the audio file. A relative path is resolved against `base`;
        an absolute path is used as-is rather than being appended to `base`.

    Returns
    -------
    The `frames` attribute reported by `soundfile.info` for the resolved file.
    """
    # pathlib joining keeps an absolute right-hand operand unchanged, whereas
    # plain string concatenation would produce "<base>//abs/path".
    resolved = Path(base) / fname
    return soundfile.info(str(resolved)).frames

In [4]:
def _pick_label_columns(split):
    """Copy the transcript, audio path and speaker name of `split` into a new DataFrame.

    The speaker name is stored under the column name 'speaker_embedding'.
    """
    frame = pd.DataFrame()
    frame['arabic_text'] = split['arabic_text']
    frame['audio_path'] = split['audio_path']
    frame['speaker_embedding'] = split['speaker_name']
    return frame

# train
tr_df = _pick_label_columns(train)

# test
te_df = _pick_label_columns(test)

In [5]:
# 'duration' holds the frame count returned by get_frames for each clip.
te_df['duration'] = te_df['audio_path'].apply(get_frames)
# Normalize the validation transcripts in place.
te_df['arabic_text'] = te_df['arabic_text'].apply(preprocess_arabic_text)

In [6]:
# Normalize the training transcripts in place.
tr_df['arabic_text'] = tr_df['arabic_text'].apply(preprocess_arabic_text)

In [7]:
# Transcript output files, keyed by the name of the frame they are written from.
files = {
    'tr_df': '/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/labels/train.txt',
    'te_df': '/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/labels/valid.txt',
}

In [8]:
# Audio manifest output files, keyed by the name of the frame they are written from.
audios = {
    'tr_df': '/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/hubert_labels/train.txt',
    'te_df': '/l/users/speech_lab/_SpeechT5PretrainDataset/finetune/TTS/hubert_labels/valid.txt',
}

### Text labels (validation split)

In [15]:
# Write the validation transcripts, one preprocessed sentence per line.
# The file is opened as UTF-8 explicitly so the Arabic text does not depend on
# the locale's default encoding.
# Rows whose 'duration' (a frame count) exceeds 16000*40 are skipped
# (presumably 40 s at 16 kHz — TODO confirm). The audio manifest cell applies
# the same filter, presumably so both files stay line-aligned.
with open(files['te_df'], 'w', encoding='utf-8') as f:
    for _, row in te_df.iterrows():
        if row['duration'] > 16000*40:
            continue
        print(preprocess_arabic_text(row['arabic_text']), file=f)

### Audio manifest (validation split)

In [23]:
# Write the validation audio manifest: the first line is the root directory
# `base`, then one tab-separated line per clip with
# audio path, frame count and speaker-embedding file.
# Opened as UTF-8 explicitly so the output does not depend on the locale.
# Rows above 16000*40 frames are skipped, the same filter as the transcript file.
with open(audios['te_df'], 'w', encoding='utf-8') as f:
    print(base, file=f)
    # Registers pandas' progress_* methods; not used by the loop below.
    tqdm.pandas()
    for _, row in te_df.iterrows():
        if row['duration'] > 16000*40:
            continue
        print(
            "{}\t{}\tspeaker_embeddings/{}.npy".format(
                row['audio_path'], row['duration'], row['speaker_embedding']
            ),
            file=f,
        )

### Text labels (training split)

In [10]:
# Write the training transcripts, one preprocessed sentence per line.
# The file is opened as UTF-8 explicitly so the Arabic text does not depend on
# the locale's default encoding.
# Clips longer than 16000*20 frames are skipped (presumably 20 s at 16 kHz —
# TODO confirm). The audio manifest cell applies the same filter.
with open(files['tr_df'], 'w', encoding='utf-8') as f:
    for _, row in tqdm(tr_df.iterrows(), total=tr_df.shape[0]):
        if get_frames(row['audio_path']) > 16000*20:
            continue
        print(preprocess_arabic_text(row['arabic_text']), file=f)

  0%|          | 469/376011 [00:00<03:44, 1672.20it/s]

100%|██████████| 376011/376011 [13:10<00:00, 475.85it/s]


### Audio manifest (training split)

In [11]:
# Write the training audio manifest: the first line is the root directory
# `base`, then one tab-separated line per clip with
# audio path, frame count and speaker-embedding file.
# Opened as UTF-8 explicitly so the output does not depend on the locale.
with open(audios['tr_df'], 'w', encoding='utf-8') as f:
    print(base, file=f)
    # Registers pandas' progress_* methods; not used by the loop below.
    tqdm.pandas()
    for _, row in tqdm(tr_df.iterrows(), total=tr_df.shape[0]):
        # Query the file once and reuse the value for both the filter and
        # the manifest line instead of opening the file twice.
        frames = get_frames(row['audio_path'])
        if frames > 16000*20:
            continue
        print(
            "{}\t{}\tspeaker_embeddings/{}.npy".format(
                row['audio_path'], frames, row['speaker_embedding']
            ),
            file=f,
        )

100%|██████████| 376011/376011 [04:57<00:00, 1263.24it/s]
