In [None]:
# 1. Install dependencies
!pip install datasets transformers librosa jiwer evaluate scikit-learn accelerate --quiet
!apt install git-lfs -qq

In [None]:
# 2. Environment & Login
import os
os.environ["HUGGINGFACE_TOKEN"] = "INSERT_YOUR_HUGGINGFACE_TOKEN_HERE"

from huggingface_hub import login
login(token=os.getenv("HUGGINGFACE_TOKEN"), add_to_git_credential=True)

In [3]:
# 3. Load & split the SLR66 dataset
from datasets import load_dataset
seed = 42

dataset = load_dataset("openslr", "SLR66", trust_remote_code=True)
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=seed)
slr_train = train_test_split["train"]  # 80%
slr_test  = train_test_split["test"]   # 20%

print(f"Training set size: {len(slr_train)}")
print(f"Test set size:     {len(slr_test)}")

Training set size: 3558
Test set size:     890


In [4]:
# 4. Quick EDA helper
import random, pandas as pd
from IPython.display import display, HTML

def show_random_elements(ds, num=5):
    picks = random.sample(range(len(ds)), num)
    df = pd.DataFrame(ds[picks])
    display(HTML(df.to_html()))

show_random_elements(slr_train.remove_columns(["path", "audio"]), num=5)

Unnamed: 0,sentence
0,ఇందు ప్రఖ్యాతి వహించినది దర్బారుగుహ
1,వ్యవసాయం మూలంగా నీటి లోతు తగ్గిపోయింది
2,గ్రామ జనాభా రెండు వెలు ఏడు వందలు యాభై ఆరు
3,పశ్చిమ సైబీరియ హిమనీనదీయ సరస్సు చూడండి
4,వాని గింజల నుండి తైలమును దీసెదరు


In [5]:
telugu_special_unwanted_characters = [
    'ఁ',  # Chandrabindu
    'ౄ',  # Vocalic RR
    'ౢ',  # Vocalic L
    'ౣ',  # Vocalic LL
    'ౠ',  # Long Vocalic RR
    'ఽ',  # Avagraha
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',  # Telugu digits
    'ఀ',  # Telugu Sign Combining Candrabindu Above
    'ౘ',  # Letter TTHA
    'ౙ',  # Letter DDA
    'ౚ',  # Letter RHA
    '౷',  # Vedic Tone
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',  # Common punctuation
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c', '\n' #Unwanted in the dataset
]

In [6]:
# 5. Clean up unwanted characters
import re

chars_to_remove = f'[{re.escape("".join(telugu_special_unwanted_characters))}]'

def remove_special_chars(batch):
    batch["sentence"] = re.sub(chars_to_remove, "", batch["sentence"])
    return batch

slr_train = slr_train.map(remove_special_chars)
slr_test  = slr_test.map(remove_special_chars)
show_random_elements(slr_train.remove_columns(["path", "audio"]), num=5)

Unnamed: 0,sentence
0,ఈ గ్రామంలో ఉత్పత్తి చేసిన పళ్ళు కూరగాయలు హైదరాబాదుకు సరఫరా చేస్తారు
1,వీటి జాబితా మొదటి పేజి లొ బొమ్మ క్రింద ఇస్తే ఎలా ఉంటుంది
2,దీని అధిపతి రామ్ గోపాల్ వర్మ
3,మెడ నిలబడకుండా వాలి పోవడం కూడా కద్దు
4,పిన్ కోడ్అయిదు వందలు పధ్ధెనిమిది అయిదు వందలు తొంభై నాలుగు


In [7]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

vocab_train = slr_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=slr_train.column_names)
vocab_test = slr_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=slr_test.column_names)

Map:   0%|          | 0/3558 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

In [8]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}
vocab_dict

{' ': 0,
 'ం': 1,
 'ః': 2,
 'అ': 3,
 'ఆ': 4,
 'ఇ': 5,
 'ఈ': 6,
 'ఉ': 7,
 'ఊ': 8,
 'ఋ': 9,
 'ఎ': 10,
 'ఏ': 11,
 'ఐ': 12,
 'ఒ': 13,
 'ఓ': 14,
 'ఔ': 15,
 'క': 16,
 'ఖ': 17,
 'గ': 18,
 'ఘ': 19,
 'ఙ': 20,
 'చ': 21,
 'ఛ': 22,
 'జ': 23,
 'ఞ': 24,
 'ట': 25,
 'ఠ': 26,
 'డ': 27,
 'ఢ': 28,
 'ణ': 29,
 'త': 30,
 'థ': 31,
 'ద': 32,
 'ధ': 33,
 'న': 34,
 'ప': 35,
 'ఫ': 36,
 'బ': 37,
 'భ': 38,
 'మ': 39,
 'య': 40,
 'ర': 41,
 'ఱ': 42,
 'ల': 43,
 'ళ': 44,
 'వ': 45,
 'శ': 46,
 'ష': 47,
 'స': 48,
 'హ': 49,
 'ా': 50,
 'ి': 51,
 'ీ': 52,
 'ు': 53,
 'ూ': 54,
 'ృ': 55,
 'ె': 56,
 'ే': 57,
 'ై': 58,
 'ొ': 59,
 'ో': 60,
 'ౌ': 61,
 '్': 62}

In [9]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [10]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

65

In [11]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [12]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", clean_up_tokenization_spaces=False)

In [13]:
repo_name = "wav2vec2-large-xls-r-53-telugu-final-k-fold-2"

In [14]:
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/kaarthu2003/wav2vec2-large-xls-r-53-telugu-final-k-fold-2/commit/e897b216b14e9acdff0d1bc64e9b9067a414af1f', commit_message='Upload tokenizer', commit_description='', oid='e897b216b14e9acdff0d1bc64e9b9067a414af1f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaarthu2003/wav2vec2-large-xls-r-53-telugu-final-k-fold-2', endpoint='https://huggingface.co', repo_type='model', repo_id='kaarthu2003/wav2vec2-large-xls-r-53-telugu-final-k-fold-2'), pr_revision=None, pr_num=None)

In [15]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [16]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [17]:
slr_train[0]["path"]
slr_train[0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/f20a13dc2ab5963431c369a4b088f5c096f6c7230a90b08359007370aaeb6145/tef_04830_00965131543.wav',
 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00158691,
        -0.00112915, -0.00115967]),
 'sampling_rate': 48000}

In [18]:
from datasets import Audio
slr_train = slr_train.cast_column("audio", Audio(sampling_rate=16_000))
slr_test = slr_test.cast_column("audio", Audio(sampling_rate=16_000))

In [19]:
slr_train[0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/f20a13dc2ab5963431c369a4b088f5c096f6c7230a90b08359007370aaeb6145/tef_04830_00965131543.wav',
 'array': array([ 6.55302210e-06, -4.48762003e-05, -8.45453615e-05, ...,
        -2.42830161e-03, -1.52032159e-03, -1.23449310e-03]),
 'sampling_rate': 16000}

In [20]:
rand_int = random.randint(0, len(slr_train))

print("Target text:", slr_train[rand_int]["sentence"])
print("Input array shape:", slr_train[rand_int]["audio"]["array"].shape)
print("Sampling rate:", slr_train[rand_int]["audio"]["sampling_rate"])

Target text: స్నేహం ఎంతో తియ్యనైనది
Input array shape: (55979,)
Sampling rate: 16000


In [21]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    batch["labels"] = processor(text=batch["sentence"]).input_ids

    return batch

In [22]:
slr_train = slr_train.map(prepare_dataset, remove_columns=slr_train.column_names, num_proc = 4)
slr_test = slr_test.map(prepare_dataset, remove_columns=slr_test.column_names, num_proc = 4)

In [23]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [24]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [25]:
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [26]:
%%capture
!pip install numpy

In [27]:
import numpy as np

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

In [28]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    # "facebook/wav2vec2-xls-r-300m",
    'facebook/wav2vec2-large-xlsr-53',
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

  return self.fget.__get__(instance, owner)()
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
model.freeze_feature_encoder()

In [30]:
# 11. K-Fold Training Loop with WER and CER
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import KFold

k_folds = 2 # ← adjust number of folds here
kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

for fold, (train_idx, val_idx) in enumerate(kf.split(slr_train)):
    print(f"\n===== Fold {fold+1}/{k_folds} =====")
    train_ds = slr_train.select(train_idx)
    val_ds   = slr_train.select(val_idx)

    args = TrainingArguments(
        output_dir=repo_name,
        group_by_length=True,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        eval_strategy="steps",
        eval_steps=400,
        save_steps=600,
        logging_steps=400,
        learning_rate=3e-4,
        warmup_steps=400,
        num_train_epochs=16,
        fp16=True,
        gradient_checkpointing=True,
        save_total_limit=2,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=processor.feature_extractor,
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(
        f"Fold {fold+1} — eval_wer: {metrics['eval_wer']:.4f}, "
        f"eval_cer: {metrics['eval_cer']:.4f}"
    )

  trainer = Trainer(



===== Fold 1/2 =====




Step,Training Loss,Validation Loss,Wer,Cer
400,6.0426,0.840106,0.832086,0.211348
800,0.5036,0.343223,0.432801,0.082431
1200,0.2137,0.347397,0.388113,0.072289
1600,0.1267,0.341665,0.365211,0.066611




Fold 1 — eval_wer: 0.3558, eval_cer: 0.0645

===== Fold 2/2 =====


  trainer = Trainer(


Step,Training Loss,Validation Loss,Wer,Cer
400,0.2954,0.054327,0.108613,0.015134
800,0.1973,0.063224,0.119631,0.018078
1200,0.1273,0.062568,0.10661,0.016464
1600,0.0904,0.055714,0.091142,0.014041




Fold 2 — eval_wer: 0.0882, eval_cer: 0.0137


In [31]:
# (Optional) Final evaluation on your held-out test set:
final_trainer = Trainer(
    model=model, args=args,
    train_dataset=slr_train, eval_dataset=slr_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)
print("Test WER:", final_trainer.evaluate()["eval_wer"])
print("Test CER:", final_trainer.evaluate()["eval_cer"])
final_trainer.push_to_hub()

  final_trainer = Trainer(


Test WER: 0.2944839857651246
Test CER: 0.0530852417302799


training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/kaarthu2003/wav2vec2-large-xls-r-53-telugu-final-k-fold-2/commit/d9cb50df68782ce416af9cdf799deb9bf26a9cbf', commit_message='End of training', commit_description='', oid='d9cb50df68782ce416af9cdf799deb9bf26a9cbf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaarthu2003/wav2vec2-large-xls-r-53-telugu-final-k-fold-2', endpoint='https://huggingface.co', repo_type='model', repo_id='kaarthu2003/wav2vec2-large-xls-r-53-telugu-final-k-fold-2'), pr_revision=None, pr_num=None)

In [None]:
cv_17 = load_dataset("mozilla-foundation/common_voice_17_0", "te")

README.md:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

common_voice_17_0.py:   0%|          | 0.00/8.19k [00:00<?, ?B/s]

languages.py:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

release_stats.py:   0%|          | 0.00/132k [00:00<?, ?B/s]

In [None]:
print(cv_17)

In [None]:
cv_17["other"] = cv_17["other"].map(prepare_dataset, remove_columns=cv_17.column_names, num_proc = 4)

In [None]:
import torch

# Move the model to the GPU
model.to("cuda")

def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    # Now the model and input_values are on the same device
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

results_cv_17 = cv_17["other"].map(map_to_result, remove_columns=slr_test.column_names)

In [None]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

In [None]:
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))