In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

Egy_Arabic = load_dataset("MightyStudent/Egyptian-ASR-MGB-3")

In [3]:
classic_Arabic = load_dataset("MBZUAI/ClArTTS")

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [4]:
MSA = load_dataset("halabi2016/arabic_speech_corpus")

In [5]:
from IPython.display import Audio, display

In [6]:
sample1 = classic_Arabic['test'][0]

# Show the text
print("Text:", sample1['text'])

# Display an audio player widget
display(Audio(data=sample1['audio'], rate=sample1['sampling_rate'], autoplay=False))

Text: .صَائِرٌ .خَبَرًا فَكُنْ خَبَرًا .يَرُوقُ جَمِيلَا


In [7]:
sample2 = Egy_Arabic['train'][0]

# Show the text
print("Text:", sample2['sentence'])

# Display an audio player widget
display(Audio(data=sample2['audio']['array'], rate= sample2['audio']['sampling_rate'], autoplay=False))

Text: عزيزي المشاهد البرنامج ده مش برنامج كوميدي وبس ده حالة انسانية استضفنا في البرنامج ده ناس كتير منهم الفنان الصحفي سواق التاكسي و المحامي وغيرهم كتير اضطرينا نعرضهم لضغوط كتير زي ما بيحصل في الواقع بالظبط عشان نعرف قد إيه هما بيحبوا البلد دي تعالوا نشوف النهارده هيحصل إيه


In [8]:
# Buckwalter → Arabic Unicode map
buckwalter_map = {
    "'": "ء",  "|": "آ",  ">": "أ",  "&": "ؤ",  "<": "إ",  "}": "ئ",
    "A": "ا",  "b": "ب",  "p": "ة",  "t": "ت",  "v": "ث",  "j": "ج",
    "H": "ح",  "x": "خ",  "d": "د",  "*": "ذ",  "r": "ر",  "z": "ز",
    "s": "س",  "$": "ش",  "S": "ص",  "D": "ض",  "T": "ط",  "Z": "ظ",
    "E": "ع",  "g": "غ",  "_": "ـ",  "f": "ف",  "q": "ق",  "k": "ك",
    "l": "ل",  "m": "م",  "n": "ن",  "h": "ه",  "w": "و",  "Y": "ى",
    "y": "ي",  "F": "ً",  "N": "ٌ",  "K": "ٍ",  "a": "َ",  "u": "ُ",
    "i": "ِ",  "~": "ّ",  "o": "ْ",  "`": "ٰ",  "{": "ٱ"
}

def buckwalter_to_arabic(text):
    return "".join(buckwalter_map.get(ch, ch) for ch in text)


In [9]:
sample2 = MSA['train'][0]
bw = sample2['text']
arabic = buckwalter_to_arabic(bw)

print("Buckwalter:", bw)
print("Arabic    :", arabic)

display(Audio(data=sample2['audio']['array'], rate= sample2['audio']['sampling_rate'], autoplay=False))

Buckwalter: ﻿waraj~aHa Alt~aqoriyru Al~aTHiy >aEad~ahu maEohadu >aboHaA^i haDabapi Alt~ibiti fiy Alo>akaAdiymiy~api AlS~iyniy~api liloEuluwmi - >ano tasotamir~a darajaAtu AloHaraArapi wamusotawayaAtu Alr~uTuwbapi fiy Alo<irotifaAEi TawaAla haTHaA Aloqarono
Arabic    : ﻿وَرَجَّحَ التَّقْرِيرُ الَّطحِي أَعَدَّهُ مَعْهَدُ أَبْحَا^ِ هَضَبَةِ التِّبِتِ فِي الْأَكَادِيمِيَّةِ الصِّينِيَّةِ لِلْعُلُومِ - أَنْ تَسْتَمِرَّ دَرَجَاتُ الْحَرَارَةِ وَمُسْتَوَيَاتُ الرُّطُوبَةِ فِي الْإِرْتِفَاعِ طَوَالَ هَطحَا الْقَرْنْ


In [10]:
print(len(Egy_Arabic['train']))
print(len(classic_Arabic['train']))
print(len(MSA['train']))

1159
9500
1813


In [13]:
from datasets import load_dataset, concatenate_datasets, Audio

# 1️⃣ Load the raw “train” splits
egy_raw = load_dataset("MightyStudent/Egyptian-ASR-MGB-3", split="train")
cl_raw  = load_dataset("MBZUAI/ClArTTS",           split="train")
msa_raw = load_dataset("halabi2016/arabic_speech_corpus", split="train")

# 2️⃣ Process Egyptian‑Arabic: rename ‘sentence’→‘text’, drop all else
egy = egy_raw.rename_column("sentence", "text")
cols_to_keep = {"audio","text"}
egy = egy.remove_columns([c for c in egy.column_names if c not in cols_to_keep])

# 3️⃣ Process MSA: drop all except audio & text
msa = msa_raw.remove_columns([c for c in msa_raw.column_names if c not in cols_to_keep])

# 4️⃣ Classical‑Arabic: first subsample 200, then wrap only those
cl200 = cl_raw.shuffle(seed=0).select(range(200))

def wrap_classic(ex):
    return {
      "text":  ex["text"],
      "audio": {"array": ex["audio"], "sampling_rate": ex["sampling_rate"]}
    }

cl200 = cl200.map(
    wrap_classic,
    remove_columns=[c for c in cl200.column_names if c not in cols_to_keep]
)

# 5️⃣ Cast each of the three to Audio(sampling_rate=16000)
egy200 = egy.shuffle(seed=0).select(range(200)).cast_column("audio", Audio(sampling_rate=16000))
cl200 = cl200.cast_column("audio", Audio(sampling_rate=16000))
msa200 = msa.shuffle(seed=0).select(range(200)).cast_column("audio", Audio(sampling_rate=16000))

# 6️⃣ Concatenate → 600 examples, uniform schema
combined = concatenate_datasets([egy200, cl200, msa200])
print(combined)


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'text'],
    num_rows: 600
})


In [22]:
combined[500]['audio']['array']

array([-2.52600216e-08,  2.89113586e-08, -3.23741460e-08, ...,
        5.53232203e-05,  5.34077481e-05,  5.94334706e-05])