In [1]:
import torch

if torch.cuda.is_available():
    print("✅ GPU is available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("❌ GPU is not available. Using CPU instead.")

✅ GPU is available!
Using GPU: NVIDIA GeForce RTX 4090


In [2]:
%%capture
!pip install transformers
!pip install librosa
!pip install jiwer
!pip install evaluate
!pip install wandb
!pip install numpy==1.23.5
!pip install scipy==1.11.4
!pip install librosa==0.10.1
!pip install numba==0.58.1
!pip install datasets>=2.14.0
!pip install accelerate>=0.26.0
!pip install typing_extensions --upgrade

In [3]:
%%capture
!pip install --upgrade torch transformers accelerate

In [4]:
# !pip install huggingface_hub --quiet


from huggingface_hub import login
login(token="INSERT_YOUR_HUGGINGFACE_TOKEN")

In [5]:
%%capture
!apt install git-lfs

In [6]:
import shutil

# Check space where the container is running (usually '/')
total, used, free = shutil.disk_usage("/nvme") # use /nvme and not /

print(f"Total Space: {total // (2**30)} GB")
print(f"Used Space:  {used // (2**30)} GB")
print(f"Free Space:  {free // (2**30)} GB")

Total Space: 60 GB
Used Space:  19 GB
Free Space:  40 GB


In [7]:
from datasets import load_dataset, concatenate_datasets, Audio

ds = load_dataset("kaarthu2003/SlrCvVoicesTtsDataset")
train_dataset = ds["train"]
val_dataset = ds["validation"]

In [8]:
# Print confirmation
print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")

# Sample peek
print("\nSample example:")
print(train_dataset[0])

Train size: 15811
Validation size: 1610

Sample example:
{'audio': {'path': '4503599627643336_chunk_12.flac', 'array': array([-0.0328064 , -0.03216553, -0.02658081, ...,  0.03289795,
        0.03430176,  0.03677368]), 'sampling_rate': 16000}, 'sentence': 'దాగుడుమూతల ఆట వల్ల'}


In [10]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(train_dataset.remove_columns(["audio"]))

Unnamed: 0,sentence
0,తెలంగాణలో కొప్పలి బ్రిడ్జ్
1,ఒక మైలులో
2,మాకు అందుబాటులో దొరుకుతాయి
3,ఏమి కావాలండి మీకు
4,వీరిలో పురుషుల శాతం యాభై ఒకటి% స్త్రీల శాతం నలభై తొమ్మిది%
5,కష్టసాధ్యమైన కార్యమ్ము నెరవేర్ప
6,ఒకసారి నేను చేసిన మార్పుల జాబితా చూడండి
7,ఇది కేవలం తొలి సమాచారం మాత్రమే
8,నాతో పాటు ఆరుగురండి
9,ఓకే ఓకే రండి


In [11]:
telugu_special_unwanted_characters = [
    'ఁ',  # Chandrabindu
    'ౄ',  # Vocalic RR
    'ౢ',  # Vocalic L
    'ౣ',  # Vocalic LL
    'ౠ',  # Long Vocalic RR
    'ఽ',  # Avagraha
    '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯',  # Telugu digits
    'ఀ',  # Telugu Sign Combining Candrabindu Above
    'ౘ',  # Letter TTHA
    'ౙ',  # Letter DDA
    'ౚ',  # Letter RHA
    '౷',  # Vedic Tone
    '‘', '’', '“', '”', '%', '.', ';', '-', ',', '/', '\\', '_', '&',  # Common punctuation
    'G', 'P', 'S', 'e', 'l', 'n', 'r', 't', '\u200c', '\n' #Unwanted in the dataset
]

In [12]:
import re
chars_to_remove_regex = f'[{re.escape("".join(telugu_special_unwanted_characters))}]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"])
    return batch

In [13]:
train_dataset = train_dataset.map(remove_special_characters)
val_dataset = val_dataset.map(remove_special_characters)

In [14]:
show_random_elements(train_dataset.remove_columns(["audio"]))

Unnamed: 0,sentence
0,ఓకే థ్యాంక్ యూ
1,ఆఫర్లు ఏమన్నా ఉన్నాయా
2,ఇవి ఒకటి లేదా జతలుగా పూస్తాయి
3,సరేనండీ బాగా తియ్యాలండీ
4,సతుల సీత
5,అటువంటి ప్రాంతాలలో ముఖ్య మంత్రి పదవి కూడా వుంటుంది
6,అంటే ఏస్కివా ఆనకాడపివా
7,ఇక అలా చూసుకుంటే
8,ఆ ఓకే సరేనండి
9,అతన్ని తన సహాయకునిగా పెట్టుకున్నారు


In [15]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

vocab_train = train_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_dataset.column_names)
vocab_test = val_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/15811 [00:00<?, ? examples/s]

Map:   0%|          | 0/1610 [00:00<?, ? examples/s]

In [16]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}

In [17]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [18]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

69

In [19]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [20]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", clean_up_tokenization_spaces=False)

In [21]:
repo_name = "wav2vec2-IEEEAccess-FinalRun-4Datasets"

In [22]:
tokenizer.push_to_hub(repo_name)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets/commit/a1e1617b7d11bcb310ae2f3d685a72c3e95c4ce9', commit_message='Upload tokenizer', commit_description='', oid='a1e1617b7d11bcb310ae2f3d685a72c3e95c4ce9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets', endpoint='https://huggingface.co', repo_type='model', repo_id='kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets'), pr_revision=None, pr_num=None)

In [23]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [24]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

  warn(


In [25]:
train_dataset[0]["audio"]

{'path': '4503599627643336_chunk_12.flac',
 'array': array([-0.0328064 , -0.03216553, -0.02658081, ...,  0.03289795,
         0.03430176,  0.03677368]),
 'sampling_rate': 16000}

In [26]:
from datasets import Audio
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16_000))
val_dataset = val_dataset.cast_column("audio", Audio(sampling_rate=16_000))

In [27]:
rand_int = random.randint(0, len(train_dataset))

print("Target text:", train_dataset[rand_int]["sentence"])
print("Input array shape:", train_dataset[rand_int]["audio"]["array"].shape)
print("Sampling rate:", train_dataset[rand_int]["audio"]["sampling_rate"])

Target text: ఈ గ్రామంలో ఉత్పత్తి చేసిన పళ్ళు కూరగాయలు హైదరాబాదుకు సరఫరా చేస్తారు
Input array shape: (103851,)
Sampling rate: 16000


In [28]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    batch["labels"] = processor(text=batch["sentence"]).input_ids

    return batch

In [29]:
train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names, num_proc = 4)
val_dataset = val_dataset.map(prepare_dataset, remove_columns=val_dataset.column_names, num_proc = 4)

In [30]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [31]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [32]:
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [33]:
import numpy as np

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

In [34]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    'facebook/wav2vec2-large-xlsr-53',
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
model.freeze_feature_encoder()

In [36]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  eval_strategy="steps",
  num_train_epochs=45,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=1600,
  eval_steps=1600,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_ratio=0.1,
  save_total_limit=2,
  report_to="wandb",
  push_to_hub=True,
)

In [37]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=processor.feature_extractor,
)

In [38]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkaarthu2003[0m ([33mkaarthu2003-vellore-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Wer,Cer
1600,1.0425,0.499787,0.512229,0.121797
3200,0.7519,0.433556,0.414273,0.098736
4800,0.5768,0.43092,0.393578,0.093728
6400,0.46,0.392628,0.34805,0.08318
8000,0.382,0.437613,0.36448,0.082773
9600,0.3188,0.438422,0.33764,0.0798
11200,0.2536,0.461104,0.31105,0.075906
12800,0.2115,0.461185,0.304779,0.073676
14400,0.1882,0.539191,0.314938,0.0758
16000,0.1496,0.505645,0.295121,0.072774




TrainOutput(global_step=22275, training_loss=0.5506090576480134, metrics={'train_runtime': 19560.5653, 'train_samples_per_second': 36.374, 'train_steps_per_second': 1.139, 'total_flos': 6.590050445487777e+19, 'train_loss': 0.5506090576480134, 'epoch': 45.0})

In [39]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets/commit/d1c103c560214000c1c0fa67f60d0c9d847abb8b', commit_message='End of training', commit_description='', oid='d1c103c560214000c1c0fa67f60d0c9d847abb8b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets', endpoint='https://huggingface.co', repo_type='model', repo_id='kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets'), pr_revision=None, pr_num=None)

In [40]:
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets")
model = AutoModelForCTC.from_pretrained("kaarthu2003/wav2vec2-IEEEAccess-FinalRun-4Datasets")

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/886 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

In [41]:
import torch

# Move the model to the GPU
model.to("cuda")

def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    # Now the model and input_values are on the same device
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

results = val_dataset.map(map_to_result, remove_columns=val_dataset.column_names)



Map:   0%|          | 0/1610 [00:00<?, ? examples/s]

In [42]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test WER: 0.281


In [43]:
import evaluate
cer_metric = evaluate.load("cer")

In [44]:
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test CER: 0.068
