In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [7]:
# Lookup table: Buckwalter transliteration symbol → Arabic Unicode character
buckwalter_map = {
    # hamza and its carriers
    "'": "ء", "|": "آ", ">": "أ", "&": "ؤ", "<": "إ", "}": "ئ",
    # letters (plus tatweel "_")
    "A": "ا", "b": "ب", "p": "ة", "t": "ت",
    "v": "ث", "j": "ج", "H": "ح", "x": "خ",
    "d": "د", "*": "ذ", "r": "ر", "z": "ز",
    "s": "س", "$": "ش", "S": "ص", "D": "ض",
    "T": "ط", "Z": "ظ", "E": "ع", "g": "غ",
    "_": "ـ", "f": "ف", "q": "ق", "k": "ك",
    "l": "ل", "m": "م", "n": "ن", "h": "ه",
    "w": "و", "Y": "ى", "y": "ي",
    # tanween, short vowels, shadda, sukun
    "F": "ً", "N": "ٌ", "K": "ٍ",
    "a": "َ", "u": "ُ", "i": "ِ",
    "~": "ّ", "o": "ْ",
    # dagger alif and alif wasla
    "`": "ٰ", "{": "ٱ",
}


def buckwalter_to_arabic(text):
    """Convert a Buckwalter-transliterated string to Arabic script.

    Each character of ``text`` is looked up in ``buckwalter_map``; characters
    that have no entry (spaces, digits, punctuation, ...) are copied through
    unchanged.

    Args:
        text: String (iterable of single characters) to convert.

    Returns:
        A new string with every mapped character replaced.
    """
    converted = []
    for symbol in text:
        converted.append(buckwalter_map.get(symbol, symbol))
    return "".join(converted)


# Preparing the dataset

All of the datasets have a similar distribution: the records are 5-20 seconds long, and each split contributes 200 records, which satisfies the minimum required amount of data.

In [2]:
from datasets import load_dataset, concatenate_datasets, Audio

# 1. Load the raw "train" split of each corpus.
egy_raw = load_dataset("MightyStudent/Egyptian-ASR-MGB-3", split="train")
cl_raw  = load_dataset("MBZUAI/ClArTTS",           split="train")
# The halabi2016 repository ships its own loading script. Without
# trust_remote_code=True, load_dataset stops at an interactive [y/N] prompt,
# which blocks an unattended "Restart & Run All". The flag executes the
# repository's code, so inspect it first:
# https://hf.co/datasets/halabi2016/arabic_speech_corpus
msa_raw = load_dataset(
    "halabi2016/arabic_speech_corpus",
    split="train",
    trust_remote_code=True,
)

# 2. Egyptian Arabic: rename 'sentence' -> 'text', then drop every other column.
egy = egy_raw.rename_column("sentence", "text")
cols_to_keep = {"audio","text"}
egy = egy.remove_columns([c for c in egy.column_names if c not in cols_to_keep])

# 3. MSA: drop everything except audio & text.
msa = msa_raw.remove_columns([c for c in msa_raw.column_names if c not in cols_to_keep])

# 4. Classical Arabic: subsample 200 rows first so that only those rows are
#    wrapped afterwards.
cl200 = cl_raw.shuffle(seed=0).select(range(200))

def wrap_classic(ex):
    """Repackage one row so that its audio is a nested mapping.

    Args:
        ex: Row mapping with the keys "text", "audio" and "sampling_rate".

    Returns:
        A dict with "text" copied through and "audio" set to
        ``{"array": ex["audio"], "sampling_rate": ex["sampling_rate"]}``.
    """
    audio_struct = dict(array=ex["audio"], sampling_rate=ex["sampling_rate"])
    return dict(text=ex["text"], audio=audio_struct)

# Wrap the 200 Classical-Arabic rows, discarding every source column that is
# not one of audio/text.
extra_cols = [name for name in cl200.column_names if name not in cols_to_keep]
cl200 = cl200.map(wrap_classic, remove_columns=extra_cols)

# 5. Draw 200 rows from the Egyptian and MSA sets (same seed) and cast the
#    audio column of all three subsets to Audio(sampling_rate=16000).
egy200 = egy.shuffle(seed=0).select(range(200))
egy200 = egy200.cast_column("audio", Audio(sampling_rate=16000))

cl200 = cl200.cast_column("audio", Audio(sampling_rate=16000))

msa200 = msa.shuffle(seed=0).select(range(200))
msa200 = msa200.cast_column("audio", Audio(sampling_rate=16000))

# 6. Stack the three subsets in the order Egyptian, Classical, MSA.
subsets = [egy200, cl200, msa200]
combined = concatenate_datasets(subsets)
print(combined)


README.md:   0%|          | 0.00/3.27k [00:00<?, ?B/s]

(…)-00000-of-00002-69f2a5d14e629dff.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

(…)-00001-of-00002-860b0ca7a1fb4dc3.parquet:   0%|          | 0.00/454M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1159 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

(…)-00000-of-00026-1fd0fc09305c182b.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00001-of-00026-cbe61efe17b87bfc.parquet:   0%|          | 0.00/121M [00:00<?, ?B/s]

(…)-00002-of-00026-8ed1c0f82932dfc8.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00003-of-00026-59a22a5bb52f5ffe.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00004-of-00026-d4d70e27b8d1551d.parquet:   0%|          | 0.00/122M [00:00<?, ?B/s]

(…)-00005-of-00026-82f6f5b92771d9de.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00006-of-00026-a5aac08c26b28554.parquet:   0%|          | 0.00/121M [00:00<?, ?B/s]

(…)-00007-of-00026-6158c010006ef953.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

(…)-00008-of-00026-611698734e9ea94f.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

(…)-00009-of-00026-a7b6145ea3621054.parquet:   0%|          | 0.00/121M [00:00<?, ?B/s]

(…)-00010-of-00026-3f70b36998948d43.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00011-of-00026-40b8194516bf10d5.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

(…)-00012-of-00026-5acf03e0fbfcb84e.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00013-of-00026-0ca66cf2a8c4cee8.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

(…)-00014-of-00026-7a00b4aae6bd117c.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00015-of-00026-5a1bba801b33852b.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00016-of-00026-1ff2b3f99d809a0a.parquet:   0%|          | 0.00/118M [00:00<?, ?B/s]

(…)-00017-of-00026-f679f8e3e2b69e6b.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

(…)-00018-of-00026-6e46c8ada3d13aff.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

(…)-00019-of-00026-522807179b1c1536.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00020-of-00026-279fcf38676ea183.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

(…)-00021-of-00026-822b7416ef69f478.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

(…)-00022-of-00026-aebd2247e5c3ee37.parquet:   0%|          | 0.00/121M [00:00<?, ?B/s]

(…)-00023-of-00026-a482f9455d171bf9.parquet:   0%|          | 0.00/124M [00:00<?, ?B/s]

(…)-00024-of-00026-e06c6e687b788185.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

(…)-00025-of-00026-130b3a71f5937c6b.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

(…)-00000-of-00001-97f968086568e9cf.parquet:   0%|          | 0.00/68.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/205 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

arabic_speech_corpus.py:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

The repository for halabi2016/arabic_speech_corpus contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/halabi2016/arabic_speech_corpus.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] غ
The repository for halabi2016/arabic_speech_corpus contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/halabi2016/arabic_speech_corpus.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1813 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text'],
    num_rows: 600
})


In [11]:
# Requires `buckwalter_to_arabic` to be defined before this function is called.

def convert_text(example, idx):
    """Convert the 'text' field to Arabic script for rows 400-599.

    Args:
        example: Row mapping containing a 'text' entry; modified in place when
            the index is inside the range.
        idx: Position of the row in the dataset being mapped.

    Returns:
        The same ``example`` object, converted or untouched.

    NOTE(review): the index range is hard-coded; it presumably targets the
    last 200 rows of the 600-row concatenated dataset. Confirm it if the
    concatenation order or the per-source sample count changes.
    """
    inside_range = idx >= 400 and idx < 600
    if not inside_range:
        return example
    example['text'] = buckwalter_to_arabic(example['text'])
    return example

# Run convert_text over every row; with_indices=True supplies the row index
# that the function uses to decide whether to convert.
combined = combined.map(
    convert_text,
    with_indices=True,
)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [21]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


## wav2vec2-large-xlsr-53-arabic model

In [22]:
from transformers import pipeline
from jiwer import wer
import librosa

# Build the speech-recognition pipeline around the Arabic wav2vec2 checkpoint.
ASR_MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
pipe = pipeline("automatic-speech-recognition", model=ASR_MODEL_ID)


config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Device set to use cuda:0


In [25]:
def resample_audio(audio_array, orig_sr, target_sr=16000):
    """Return ``audio_array`` at ``target_sr``, resampling only when needed.

    Args:
        audio_array: Audio samples to convert.
        orig_sr: Sampling rate the samples currently have.
        target_sr: Desired sampling rate (defaults to 16000).

    Returns:
        The input object itself when the two rates already match, otherwise
        the result of ``librosa.resample``.
    """
    if orig_sr == target_sr:
        return audio_array
    return librosa.resample(audio_array, orig_sr=orig_sr, target_sr=target_sr)

# Collect one waveform per row, passing each through resample_audio.
audios = []
for sample in combined:
    clip = sample["audio"]
    audios.append(resample_audio(clip["array"], clip["sampling_rate"]))

# Run the ASR pipeline over all clips, 16 per batch, and keep only the text.
transcriptions = pipe(audios, batch_size=16)
predicted_texts = []
for transcription in transcriptions:
    predicted_texts.append(transcription["text"])

# Reference transcripts, in the same row order as the audio.
ground_truth_texts = combined["text"]

# Word Error Rate: references first, predictions second.
# NOTE(review): no text normalization (e.g. diacritic or punctuation removal)
# is applied before scoring — confirm that references and predictions use the
# same orthography, otherwise the figure is inflated.
error_rate = wer(ground_truth_texts, predicted_texts)
print(f"Word Error Rate: {error_rate}")

Word Error Rate: 0.7859896791031497
