## Testing Cuda

In [2]:
# Now verify our GPU setup
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU device name:", torch.cuda.get_device_name(0))
    print("GPU memory allocated:", torch.cuda.memory_allocated(0)/1024**2, "MB")
    print("GPU memory reserved:", torch.cuda.memory_reserved(0)/1024**2, "MB")

PyTorch version: 2.6.0+cu126
CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA GeForce RTX 3060
GPU memory allocated: 0.0 MB
GPU memory reserved: 0.0 MB


## Load Dataset

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict
import os

def load_urdu_dataset(data_dir):
    """
    Load Urdu dataset from local TSV files
    data_dir: Directory containing train.tsv, dev.tsv, test.tsv and clips folder
    """
    # Load TSV files
    train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
    dev_df = pd.read_csv(os.path.join(data_dir, 'dev.tsv'), sep='\t')
    test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')
    
    # Add full audio path
    clips_dir = os.path.join(data_dir, 'clips')
    train_df['audio'] = train_df['path'].apply(lambda x: os.path.join(clips_dir, x))
    dev_df['audio'] = dev_df['path'].apply(lambda x: os.path.join(clips_dir, x))
    test_df['audio'] = test_df['path'].apply(lambda x: os.path.join(clips_dir, x))
    
    # Keep only necessary columns
    columns_to_keep = ['audio', 'sentence']
    train_df = train_df[columns_to_keep]
    dev_df = dev_df[columns_to_keep]
    test_df = test_df[columns_to_keep]
    
    # Convert to Hugging Face datasets
    train_dataset = Dataset.from_pandas(train_df)
    dev_dataset = Dataset.from_pandas(dev_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    # Combine into DatasetDict
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': dev_dataset,
        'test': test_dataset
    })
    
    return dataset_dict

# Load the dataset
# Replace with your actual data directory path
data_dir = "S:/cv-corpus-20.0-2024-12-06/ur"
urdu_dataset = load_urdu_dataset(data_dir)

# Print dataset statistics
print("\nDataset statistics:")
for split in urdu_dataset:
    print(f"{split}: {len(urdu_dataset[split])} examples")

# Print first example from training set
print("\nExample from training set:")
print(urdu_dataset['train'][0])


Dataset statistics:
train: 7205 examples
validation: 4982 examples
test: 5026 examples

Example from training set:
{'audio': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'sentence': 'طرح طرح کے پرندے'}


In [4]:
urdu_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7205
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4982
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5026
    })
})

In [5]:
import soundfile
print(soundfile.__libsndfile_version__)

1.2.2


## Prepare Feature Extractor, Tokenizer and Data

In [6]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# Initialize the feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v3")

# Initialize the tokenizer with Urdu language
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v3", language="Urdu", task="transcribe")

# Let's also create the processor which combines both feature_extractor and tokenizer
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3", language="Urdu", task="transcribe")

# Let's test the tokenizer with a sample Urdu text from our dataset
sample_text = urdu_dataset["train"][0]["sentence"]
print("\nTesting tokenizer:")
print("Original text:", sample_text)
encoded = tokenizer(sample_text)
print("Encoded text:", encoded)

decoded = tokenizer.decode(encoded.input_ids)
print("Decoded text:", decoded)

# Let's also look at the feature extractor's configuration
print("\nFeature extractor config:")
print("Sampling rate:", feature_extractor.sampling_rate)
print("Feature size:", feature_extractor.feature_size)
print("Padding value:", feature_extractor.padding_value)


Testing tokenizer:
Original text: طرح طرح کے پرندے
Encoded text: {'input_ids': [50258, 50290, 50360, 50364, 9566, 2288, 5016, 23032, 2288, 5016, 24049, 21453, 2288, 41260, 7369, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decoded text: <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>طرح طرح کے پرندے<|endoftext|>

Feature extractor config:
Sampling rate: 16000
Feature size: 128
Padding value: 0.0


## Testing tokenizer

In [7]:
# Test tokenizer with and without special tokens
input_str = urdu_dataset["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print("\nTokenizer test with special tokens:")
print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Tokenizer test with special tokens:
Input:                 طرح طرح کے پرندے
Decoded w/ special:    <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>طرح طرح کے پرندے<|endoftext|>
Decoded w/out special: طرح طرح کے پرندے
Are equal:             True


## check default sampling rate of 1st training example

In [8]:
print(urdu_dataset["train"][0])

{'audio': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'sentence': 'طرح طرح کے پرندے'}


## fixing sampling rate

In [9]:
# Now let's prepare our dataset with the correct sampling rate
# First, let's check the current audio format
from datasets import Audio
import numpy as np

# Cast the audio column to audio format
urdu_dataset = urdu_dataset.cast_column("audio", Audio(sampling_rate=16000))

# Let's examine the first audio file after casting
print("\nAudio format check:")
print("First example audio details:")
audio = urdu_dataset["train"][0]["audio"]
print(f"Path: {audio['path']}")
print(f"Sampling rate: {audio['sampling_rate']}")
print(f"Array shape: {audio['array'].shape}")


Audio format check:
First example audio details:
Path: S:/cv-corpus-20.0-2024-12-06/ur\clips\common_voice_ur_31898340.mp3
Sampling rate: 16000
Array shape: (50688,)


In [10]:
#check the format of 1st training example
print(urdu_dataset["train"][0])


{'audio': {'path': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'array': array([1.39698386e-09, 1.74622983e-09, 2.91038305e-10, ...,
       1.17505246e-04, 1.56884955e-04, 4.67942300e-05]), 'sampling_rate': 16000}, 'sentence': 'طرح طرح کے پرندے'}


In [11]:
def prepare_dataset(batch, processor):
    """
    Prepare the dataset for training:
    1. Load and resample audio (already at 16kHz)
    2. Compute log-Mel input features
    3. Encode target text to label ids
    """
    # Load audio
    audio = batch["audio"]
    
    # Compute log-Mel input features using processor
    batch["input_features"] = processor.feature_extractor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    
    # Encode target text to label ids using processor
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

In [12]:
# Test the prepare_dataset function on a single example
print("Testing data preparation on one example:")
test_prep = prepare_dataset(urdu_dataset["train"][0], processor)
print("\nProcessed features:")
print(f"Input features shape: {test_prep['input_features'].shape}")
print(f"Labels: {test_prep['labels']}")

Testing data preparation on one example:

Processed features:
Input features shape: (128, 3000)
Labels: [50258, 50290, 50360, 50364, 9566, 2288, 5016, 23032, 2288, 5016, 24049, 21453, 2288, 41260, 7369, 50257]


## original code no tqdm

In [13]:
# Process the entire dataset
print("\nProcessing the entire dataset...")
urdu_dataset = urdu_dataset.map(
    function=prepare_dataset,
    fn_kwargs={"processor": processor},  # Pass processor as a keyword argument
    remove_columns=urdu_dataset.column_names["train"],
    desc="Processing audio files",
    num_proc=1  # Adjust based on your CPU cores
)



Processing the entire dataset...


Processing audio files:   0%|          | 0/7205 [00:00<?, ? examples/s]

Processing audio files:   0%|          | 0/4982 [00:01<?, ? examples/s]

Processing audio files:   0%|          | 0/5026 [00:00<?, ? examples/s]

In [14]:
# Print the first example from processed dataset
print("\nFirst example from processed dataset:")
print(urdu_dataset["train"][0])


First example from processed dataset:
{'input_features': [[-0.46094655990600586, -0.46094655990600586, -0.46094655990600586, -0.46094655990600586, -0.29050612449645996, -0.14959204196929932, 0.01464623212814331, -0.28188109397888184, 0.20644724369049072, 0.26319533586502075, 0.25345373153686523, 0.38587695360183716, 0.4300098419189453, 0.3816331624984741, 0.4318208694458008, 0.4625791311264038, 0.3591630458831787, 0.17014223337173462, 0.17455577850341797, 0.012070775032043457, -0.2486199140548706, 0.1587790846824646, -0.368524432182312, -0.11219727993011475, 0.012458443641662598, -0.002771139144897461, -0.012964367866516113, -0.17104017734527588, -0.14879858493804932, 0.13045001029968262, 0.03072810173034668, -0.033832430839538574, 0.03232938051223755, 0.1293550729751587, -0.2064659595489502, -0.18981075286865234, -0.34497785568237305, 0.011213898658752441, -0.2246931791305542, -0.25324714183807373, -0.09546971321105957, 0.03597372770309448, 0.22246956825256348, 0.2069726586341858, 0.

In [15]:

# Print dataset format
print("\nDataset format:")
print(urdu_dataset)


Dataset format:
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7205
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 4982
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5026
    })
})


In [18]:
from huggingface_hub import login
# Login to Hugging Face (you'll be prompted for your token)
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Push the dataset to the Hub
dataset_name = "urdu-common-voice-20-processed-large-v3"
urdu_dataset.push_to_hub(
    f"osman31/{dataset_name}",
    private=True  # Set to False if you want it public
)

print(f"Dataset pushed to: https://huggingface.co/datasets/osman31/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to: https://huggingface.co/datasets/osman31/urdu-common-voice-20-processed-large-v3
