## 1. Install

In [1]:
!pip install -q "transformers>=4.40.0" "datasets>=2.18.0" jiwer umsc soundfile torchaudio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

## 2. Imports, constants, romanizer

In [2]:
import os, re, gc, numpy as np, pandas as pd, torch
from datasets import Dataset, DatasetDict, Audio
from transformers import AutoProcessor, Wav2Vec2ForCTC, TrainingArguments, Trainer
from jiwer import cer, wer

MODEL_ID  = "mamatjan/xls-r-uyghur-cv18"
BASE      = "/kaggle/input/the-uyghur-voice-cup"  # <- change if needed
SR        = 16000
VAL_RATIO = 0.1
SEED      = 42
CLIP_SEC  = None            # e.g. 12
device    = "cuda" if torch.cuda.is_available() else "cpu"

import re
try:
    import umsc
except ImportError:
    umsc = None

# full-ish fallback map (add chars if you spot missing ones)
AR2LAT = {
    "ا":"a","ە":"e","ب":"b","پ":"p","ت":"t","ج":"j","چ":"ch","خ":"x","د":"d","ر":"r",
    "ز":"z","ژ":"zh","س":"s","ش":"sh","غ":"gh","ف":"f","ق":"q","ك":"k","گ":"g","ڭ":"ng",
    "ل":"l","م":"m","ن":"n","ھ":"h","و":"o","ۋ":"w","ۇ":"u","ۆ":"o","ۈ":"ü","ى":"i","ي":"y","ئ":""
}

def uyghur_to_latin(text: str) -> str:
    text = text or ""
    # try umsc if it has a working API
    if umsc is not None:
        if hasattr(umsc, "convert"):
            return umsc.convert(text, "UAS", "ULS")
        if hasattr(umsc, "uas2uls"):
            return umsc.uas2uls(text)
    # fallback
    return "".join(AR2LAT.get(ch, ch) for ch in text)

def clean_spaces(t: str) -> str:
    return re.sub(r"\s+", " ", t or "").strip()

2025-07-28 14:19:31.975871: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753712372.168239      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753712372.225839      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 3. Load CSVs & datasets

In [3]:
train_df  = pd.read_csv(os.path.join(BASE, "train.csv"))
test_df   = pd.read_csv(os.path.join(BASE, "test.csv"))
sample_df = pd.read_csv(os.path.join(BASE, "sample.csv"))

train_df["audio"] = train_df["filepath"].apply(lambda p: os.path.join(BASE, p))
test_df["audio"]  = test_df["filepath"].apply(lambda p: os.path.join(BASE, p))

train_raw = Dataset.from_pandas(train_df).cast_column("audio", Audio(sampling_rate=SR))
test_raw  = Dataset.from_pandas(test_df ).cast_column("audio", Audio(sampling_rate=SR))

def clip_audio_fn(batch):
    if CLIP_SEC is None: return batch
    arr = batch["audio"]["array"]
    if len(arr) > SR * CLIP_SEC:
        batch["audio"]["array"] = arr[: SR * CLIP_SEC]
    return batch

train_raw = train_raw.map(clip_audio_fn, num_proc=1)
test_raw  = test_raw.map(clip_audio_fn,  num_proc=1)

splits = train_raw.train_test_split(test_size=VAL_RATIO, seed=SEED)
dataset = DatasetDict(train=splits["train"], validation=splits["test"], test=test_raw)
dataset


Map:   0%|          | 0/7574 [00:00<?, ? examples/s]

Map:   0%|          | 0/1894 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'filepath', 'transcription', 'audio'],
        num_rows: 6816
    })
    validation: Dataset({
        features: ['ID', 'filepath', 'transcription', 'audio'],
        num_rows: 758
    })
    test: Dataset({
        features: ['ID', 'filepath', 'audio'],
        num_rows: 1894
    })
})

## 4. Load model & processor

In [4]:
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(device)
model.eval()


preprocessor_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

## 5. Preprocess

In [5]:
def prepare(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    if "transcription" in batch:
        txt = clean_spaces(batch["transcription"])
        batch["labels"] = processor.tokenizer(txt).input_ids
    return batch

def map_split(ds, drop_cols):
    drop = [c for c in drop_cols if c in ds.column_names]
    return ds.map(prepare, remove_columns=drop, num_proc=1)

dataset["train"] = map_split(dataset["train"], ["audio","filepath","ID","transcription"])
dataset["validation"] = map_split(dataset["validation"], ["audio","filepath","ID","transcription"])
dataset["test"] = map_split(dataset["test"], ["audio","filepath","transcription"])

if "ID" not in dataset["test"].column_names:
    dataset["test"] = dataset["test"].add_column("ID", test_df["ID"].tolist())

print(dataset["train"][0].keys(), dataset["test"][0].keys())


Map:   0%|          | 0/6816 [00:00<?, ? examples/s]

Map:   0%|          | 0/758 [00:00<?, ? examples/s]

Map:   0%|          | 0/1894 [00:00<?, ? examples/s]

dict_keys(['input_values', 'labels']) dict_keys(['ID', 'input_values'])


## 6. Data collator

In [6]:
from dataclasses import dataclass
from typing import Dict, List, Union

@dataclass
class DataCollatorCTC:
    processor: AutoProcessor
    padding: Union[bool, str] = "longest"
    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        inputs = [{"input_values": f["input_values"]} for f in features]
        batch = self.processor.feature_extractor.pad(inputs, padding=self.padding, return_tensors="pt")
        if "labels" in features[0]:
            labels = [{"input_ids": f["labels"]} for f in features]
            with self.processor.as_target_processor():
                labels_batch = self.processor.tokenizer.pad(labels, padding=self.padding, return_tensors="pt")
            batch["labels"] = labels_batch["input_ids"].masked_fill(
                labels_batch["input_ids"] == self.processor.tokenizer.pad_token_id, -100
            )
        return batch

data_collator = DataCollatorCTC(processor)


## 7. Metrics (Latin CER/WER)

In [7]:
def compute_metrics(pred):
    logits = pred.predictions
    pred_ids = np.argmax(logits, axis=-1)
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    pred_lat = [uyghur_to_latin(clean_spaces(t)) for t in pred_str]
    label_lat = [uyghur_to_latin(clean_spaces(t)) for t in label_str]

    return {"cer": cer(label_lat, pred_lat), "wer": wer(label_lat, pred_lat)}


## 8. (Optional) Quick fine-tune

In [8]:
DO_TRAIN = False

if DO_TRAIN:
    training_args = TrainingArguments(
        output_dir="./xlsr_uyghur_ft",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=8,
        num_train_epochs=1,
        learning_rate=3e-4,
        warmup_steps=200,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=500,
        save_steps=500,
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        report_to="none",
        remove_unused_columns=False,
        load_best_model_at_end=True,
        metric_for_best_model="cer",
        greater_is_better=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        data_collator=data_collator,
        tokenizer=processor.feature_extractor,
        compute_metrics=compute_metrics
    )

    gc.collect(); torch.cuda.empty_cache()
    trainer.train()
    print(trainer.evaluate())
else:
    print("Skipping fine-tuning.")


Skipping fine-tuning.


## 9. Inference & Latin submission.csv

In [9]:
model.eval()
rows = []
for ex in dataset["test"]:
    iv = torch.tensor([ex["input_values"]], dtype=torch.float32).to(device)
    with torch.no_grad():
        logits = model(iv).logits
    pred_ids = torch.argmax(logits, dim=-1)
    text_ar = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    text_lat = uyghur_to_latin(clean_spaces(text_ar))
    rows.append({"ID": ex["ID"], "transcription": text_lat})

pred_df = pd.DataFrame(rows)
sub = sample_df[["ID"]].merge(pred_df, on="ID", how="left")
sub["transcription"] = sub["transcription"].fillna("")
sub = sub[["ID", "transcription"]]
sub.to_csv("submission.csv", index=False)
print("submission.csv saved!")
sub.head()


submission.csv saved!


Unnamed: 0,ID,transcription
0,f068a206b84c4632865e0629a1b62fb8,bu dorini hېlila qaynatqan chaqan bol isiqida ...
1,a9d8cfab47b34f12b8f4b4769075713e,yamghurdin kېyinki hawa xudi süzüp tazlanghand...
2,34147b4f995144288b720d7474ba4dd6,qar barghanche qatiq yaghdi yoldiki piyadiler ...
3,c6c201bcd81a402385c2f008983f7474,chetke chiqip bilim igenligendin kېyin qaytip ...
4,c3c190cc67c14d4a946ef1b722196248,eyiblesh kishini chüshkünleshtüridu ilhamlandu...


## 10. Manual CER/WER on validation (if not trained)

In [10]:
if 'trainer' not in globals():
    preds, refs = [], []
    for ex in dataset["validation"]:
        iv = torch.tensor([ex["input_values"]], dtype=torch.float32).to(device)
        with torch.no_grad():
            logits = model(iv).logits
        pred_ids = torch.argmax(logits, dim=-1)
        preds.append(processor.batch_decode(pred_ids, skip_special_tokens=True)[0])

        ids = np.array(ex["labels"])
        ids[ids == -100] = processor.tokenizer.pad_token_id
        refs.append(processor.batch_decode([ids], skip_special_tokens=True)[0])

    preds_lat = [uyghur_to_latin(clean_spaces(t)) for t in preds]
    refs_lat  = [uyghur_to_latin(clean_spaces(t)) for t in refs]
    print("CER:", cer(refs_lat, preds_lat), "WER:", wer(refs_lat, preds_lat))


CER: 83501 WER: 10504
