## Testing CUDA

In [2]:
# Now verify our GPU setup
import torch

# Report the installed PyTorch build and whether a CUDA device is visible;
# the device details are only queried when CUDA is actually available.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU device name:", torch.cuda.get_device_name(0))
    # Same formatting for both memory counters of device 0: the raw value is
    # divided by 1024**2 to produce the "MB" figure that gets printed.
    for label, probe in (
        ("GPU memory allocated:", torch.cuda.memory_allocated),
        ("GPU memory reserved:", torch.cuda.memory_reserved),
    ):
        print(label, probe(0)/1024**2, "MB")

PyTorch version: 2.6.0+cu126
CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA GeForce RTX 3060
GPU memory allocated: 0.0 MB
GPU memory reserved: 0.0 MB


## Load Dataset

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict
import os

def _load_split(data_dir, clips_dir, split_name):
    """Read ``<split_name>.tsv`` from data_dir and return it as a Hugging Face Dataset.

    The returned dataset has exactly two columns: 'audio' (clip path joined
    onto clips_dir) and 'sentence' (the transcript).
    """
    import csv  # local import: only needed for the quoting constant below

    split_df = pd.read_csv(
        os.path.join(data_dir, f'{split_name}.tsv'),
        sep='\t',
        # Common Voice manifests are plain tab-separated text without field
        # quoting, so a transcript may contain a lone '"'. With pandas' default
        # quoting such a character opens a "quoted field" that swallows the
        # following rows (or raises "EOF inside string"). QUOTE_NONE makes
        # double quotes ordinary characters.
        quoting=csv.QUOTE_NONE,
    )

    # Turn the bare clip file name into a path inside the clips folder
    split_df['audio'] = split_df['path'].apply(lambda x: os.path.join(clips_dir, x))

    # Keep only the columns needed downstream
    return Dataset.from_pandas(split_df[['audio', 'sentence']])


def load_urdu_dataset(data_dir):
    """
    Load Urdu dataset from local TSV files

    data_dir: Directory containing train.tsv, dev.tsv, test.tsv and clips folder

    Returns a DatasetDict with the splits 'train', 'validation' (read from
    dev.tsv) and 'test'; every split has the columns 'audio' (path string)
    and 'sentence'.

    Errors from pandas (missing file, malformed TSV) and a KeyError for a
    manifest lacking the 'path' or 'sentence' column propagate unchanged.
    """
    clips_dir = os.path.join(data_dir, 'clips')

    # Split name exposed to callers -> manifest file stem on disk
    return DatasetDict({
        'train': _load_split(data_dir, clips_dir, 'train'),
        'validation': _load_split(data_dir, clips_dir, 'dev'),
        'test': _load_split(data_dir, clips_dir, 'test'),
    })

# Load the dataset
# The corpus location defaults to the original local path. To use a copy that
# lives elsewhere, set the URDU_CV_DATA_DIR environment variable before
# starting the kernel instead of editing this cell.
data_dir = os.environ.get("URDU_CV_DATA_DIR", "S:/cv-corpus-20.0-2024-12-06/ur")
urdu_dataset = load_urdu_dataset(data_dir)

# Print dataset statistics
print("\nDataset statistics:")
for split in urdu_dataset:
    print(f"{split}: {len(urdu_dataset[split])} examples")

# Print first example from training set
print("\nExample from training set:")
print(urdu_dataset['train'][0])


Dataset statistics:
train: 7205 examples
validation: 4982 examples
test: 5026 examples

Example from training set:
{'audio': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'sentence': 'طرح طرح کے پرندے'}


In [4]:
# Display the DatasetDict summary: each split with its features and row count.
urdu_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7205
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4982
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5026
    })
})

## Prepare Feature Extractor, Tokenizer and Data

In [5]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# Single place to name the checkpoint, language and task, so the feature
# extractor, tokenizer and processor below always come from the same settings
# (switching to another Whisper size is then a one-line change).
WHISPER_CHECKPOINT = "openai/whisper-tiny"
WHISPER_LANGUAGE = "Urdu"
WHISPER_TASK = "transcribe"

# Initialize the feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)

# Initialize the tokenizer with Urdu language
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT, language=WHISPER_LANGUAGE, task=WHISPER_TASK
)

# Let's also create the processor which combines both feature_extractor and tokenizer
processor = WhisperProcessor.from_pretrained(
    WHISPER_CHECKPOINT, language=WHISPER_LANGUAGE, task=WHISPER_TASK
)

# Let's test the tokenizer with a sample Urdu text from our dataset
sample_text = urdu_dataset["train"][0]["sentence"]
print("\nTesting tokenizer:")
print("Original text:", sample_text)
encoded = tokenizer(sample_text)
print("Encoded text:", encoded)

# Decoding the ids again shows the special tokens the tokenizer wrapped around the text
decoded = tokenizer.decode(encoded.input_ids)
print("Decoded text:", decoded)

# Let's also look at the feature extractor's configuration
print("\nFeature extractor config:")
print("Sampling rate:", feature_extractor.sampling_rate)
print("Feature size:", feature_extractor.feature_size)
print("Padding value:", feature_extractor.padding_value)


Testing tokenizer:
Original text: طرح طرح کے پرندے
Encoded text: {'input_ids': [50258, 50290, 50359, 50363, 9566, 2288, 5016, 23032, 2288, 5016, 24049, 21453, 2288, 41260, 7369, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decoded text: <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>طرح طرح کے پرندے<|endoftext|>

Feature extractor config:
Sampling rate: 16000
Feature size: 80
Padding value: 0.0


## Testing tokenizer

In [6]:
# Round-trip the first training sentence through the tokenizer and decode the
# resulting ids twice: once keeping the special tokens, once skipping them.
input_str = urdu_dataset["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=strip) for strip in (False, True)
)

# Assemble the whole report first, then emit it in a single write.
report_lines = [
    "\nTokenizer test with special tokens:",
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
]
print("\n".join(report_lines))


Tokenizer test with special tokens:
Input:                 طرح طرح کے پرندے
Decoded w/ special:    <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>طرح طرح کے پرندے<|endoftext|>
Decoded w/out special: طرح طرح کے پرندے
Are equal:             True


## Inspect the 1st training example (audio is still a file path, so there is no sampling rate to read yet)

In [7]:
# Before the audio column is cast, 'audio' holds only the clip's path string.
print(urdu_dataset["train"][0])

{'audio': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'sentence': 'طرح طرح کے پرندے'}


## Fixing the sampling rate

In [8]:
# Now let's prepare our dataset with the correct sampling rate:
# cast the audio column to the Audio feature, then inspect the first decoded clip
from datasets import Audio
import numpy as np

# Cast the audio column to audio format.
# The target rate is read from the Whisper feature extractor (it reports 16000
# for this checkpoint) rather than repeated as a literal, so the audio handed
# to the feature extractor always matches the rate it was configured for.
urdu_dataset = urdu_dataset.cast_column(
    "audio", Audio(sampling_rate=feature_extractor.sampling_rate)
)

# Let's examine the first audio file after casting
print("\nAudio format check:")
print("First example audio details:")
audio = urdu_dataset["train"][0]["audio"]
print(f"Path: {audio['path']}")
print(f"Sampling rate: {audio['sampling_rate']}")
print(f"Array shape: {audio['array'].shape}")


Audio format check:
First example audio details:
Path: S:/cv-corpus-20.0-2024-12-06/ur\clips\common_voice_ur_31898340.mp3
Sampling rate: 16000
Array shape: (50688,)


In [9]:
# Check the format of the 1st training example: after the cast, 'audio' is a
# dict with 'path', 'array' and 'sampling_rate' rather than a bare path string.
print(urdu_dataset["train"][0])


{'audio': {'path': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'array': array([1.39698386e-09, 1.74622983e-09, 2.91038305e-10, ...,
       1.17505246e-04, 1.56884955e-04, 4.67942300e-05]), 'sampling_rate': 16000}, 'sentence': 'طرح طرح کے پرندے'}


In [None]:
def prepare_dataset(batch, processor, max_label_length=448):
    """
    Turn one raw example into Whisper training inputs.

    Steps:
    1. Compute log-Mel input features from the decoded audio
    2. Encode the target text to label ids

    The audio is NOT resampled here: it is passed to the feature extractor
    together with the sampling rate stored alongside it, so it must already be
    at the rate the feature extractor expects.

    Args:
        batch: a single example (despite the name) with the keys
            "audio" - a dict providing "array" and "sampling_rate" - and
            "sentence" - the transcript string.
        processor: object exposing ``feature_extractor`` and ``tokenizer``
            callables (e.g. a WhisperProcessor).
        max_label_length: maximum number of label ids; longer transcripts are
            truncated. Defaults to 448, the decoder length limit
            (``max_target_positions``) of the Whisper checkpoints. The
            previously hard-coded 1024 let through label sequences longer
            than the decoder can accept.

    Returns:
        The same ``batch`` dict with "input_features" and "labels" added.
    """
    # Decoded audio for this example
    audio = batch["audio"]

    # Compute log-Mel input features; the extractor returns a batch of one,
    # so take its only element
    batch["input_features"] = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # Encode target text to label ids, capped at the decoder's length limit
    batch["labels"] = processor.tokenizer(
        batch["sentence"],
        max_length=max_label_length,
        truncation=True
    ).input_ids

    return batch

def process_and_save_dataset(dataset, processor, save_path="processed_urdu_dataset"):
    """Process the entire dataset and save as a single DatasetDict

    Every split of ``dataset`` is mapped example-by-example through
    ``prepare_dataset``; the original columns are dropped so only the columns
    produced by ``prepare_dataset`` remain. The processed splits are combined
    into one DatasetDict, written to ``save_path`` and returned.

    Args:
        dataset: mapping of split name -> dataset (anything offering
            ``.keys()`` and per-split ``.map`` / ``.column_names``).
        processor: forwarded to ``prepare_dataset`` as its ``processor`` argument.
        save_path: directory handed to ``DatasetDict.save_to_disk``.

    Returns:
        The processed DatasetDict.
    """
    processed_dataset = {}

    for split in dataset.keys():
        print(f"\nProcessing {split} split...")
        # No batch_size here: the map is not batched (prepare_dataset handles
        # one example per call), and batch_size only takes effect together
        # with batched=True, so passing it was a no-op that suggested otherwise.
        processed_dataset[split] = dataset[split].map(
            function=prepare_dataset,
            fn_kwargs={"processor": processor},
            remove_columns=dataset[split].column_names,
            desc=f"Processing {split} split",
            # Rows per write to the cache file; kept small, presumably to
            # bound memory since each processed row is large.
            writer_batch_size=100
        )

    # Create DatasetDict and save
    final_dataset = DatasetDict(processed_dataset)
    final_dataset.save_to_disk(save_path)
    print(f"\nDataset saved to {save_path}")
    return final_dataset

# Process and save the dataset
# NOTE: this rebinds urdu_dataset to the processed version, whose original
# 'audio' / 'sentence' columns have been removed. Re-running this cell
# therefore requires re-running the loading and audio-casting cells first.
urdu_dataset = process_and_save_dataset(urdu_dataset, processor)

# Verify the processed dataset
print("\nVerifying processed dataset:")
example = urdu_dataset["train"][0]
# No output format is set on the dataset, so the stored features come back as
# nested Python lists, which have no .shape; wrap them in an array to report it.
print(f"Input features shape: {np.asarray(example['input_features']).shape}")
print(f"Labels length: {len(example['labels'])}")
print(f"Transcription: {processor.tokenizer.decode(example['labels'], skip_special_tokens=True)}")


Processing train split...

Processing train examples 0 to 50


Processing chunk 0-50:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 50 to 100


Processing chunk 50-100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 100 to 150


Processing chunk 100-150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 150 to 200


Processing chunk 150-200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 200 to 250


Processing chunk 200-250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 250 to 300


Processing chunk 250-300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 300 to 350


Processing chunk 300-350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 350 to 400


Processing chunk 350-400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 400 to 450


Processing chunk 400-450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 450 to 500


Processing chunk 450-500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 500 to 550


Processing chunk 500-550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 550 to 600


Processing chunk 550-600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 600 to 650


Processing chunk 600-650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 650 to 700


Processing chunk 650-700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 700 to 750


Processing chunk 700-750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 750 to 800


Processing chunk 750-800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 800 to 850


Processing chunk 800-850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 850 to 900


Processing chunk 850-900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 900 to 950


Processing chunk 900-950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 950 to 1000


Processing chunk 950-1000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1000 to 1050


Processing chunk 1000-1050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1050 to 1100


Processing chunk 1050-1100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1100 to 1150


Processing chunk 1100-1150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1150 to 1200


Processing chunk 1150-1200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1200 to 1250


Processing chunk 1200-1250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1250 to 1300


Processing chunk 1250-1300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1300 to 1350


Processing chunk 1300-1350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1350 to 1400


Processing chunk 1350-1400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1400 to 1450


Processing chunk 1400-1450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1450 to 1500


Processing chunk 1450-1500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1500 to 1550


Processing chunk 1500-1550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1550 to 1600


Processing chunk 1550-1600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1600 to 1650


Processing chunk 1600-1650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1650 to 1700


Processing chunk 1650-1700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1700 to 1750


Processing chunk 1700-1750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1750 to 1800


Processing chunk 1750-1800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1800 to 1850


Processing chunk 1800-1850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1850 to 1900


Processing chunk 1850-1900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1900 to 1950


Processing chunk 1900-1950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 1950 to 2000


Processing chunk 1950-2000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2000 to 2050


Processing chunk 2000-2050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2050 to 2100


Processing chunk 2050-2100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2100 to 2150


Processing chunk 2100-2150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2150 to 2200


Processing chunk 2150-2200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2200 to 2250


Processing chunk 2200-2250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2250 to 2300


Processing chunk 2250-2300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2300 to 2350


Processing chunk 2300-2350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2350 to 2400


Processing chunk 2350-2400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2400 to 2450


Processing chunk 2400-2450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2450 to 2500


Processing chunk 2450-2500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2500 to 2550


Processing chunk 2500-2550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2550 to 2600


Processing chunk 2550-2600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2600 to 2650


Processing chunk 2600-2650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2650 to 2700


Processing chunk 2650-2700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2700 to 2750


Processing chunk 2700-2750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2750 to 2800


Processing chunk 2750-2800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2800 to 2850


Processing chunk 2800-2850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2850 to 2900


Processing chunk 2850-2900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2900 to 2950


Processing chunk 2900-2950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 2950 to 3000


Processing chunk 2950-3000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3000 to 3050


Processing chunk 3000-3050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3050 to 3100


Processing chunk 3050-3100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3100 to 3150


Processing chunk 3100-3150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3150 to 3200


Processing chunk 3150-3200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3200 to 3250


Processing chunk 3200-3250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3250 to 3300


Processing chunk 3250-3300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3300 to 3350


Processing chunk 3300-3350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3350 to 3400


Processing chunk 3350-3400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3400 to 3450


Processing chunk 3400-3450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3450 to 3500


Processing chunk 3450-3500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3500 to 3550


Processing chunk 3500-3550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3550 to 3600


Processing chunk 3550-3600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3600 to 3650


Processing chunk 3600-3650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3650 to 3700


Processing chunk 3650-3700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3700 to 3750


Processing chunk 3700-3750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3750 to 3800


Processing chunk 3750-3800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3800 to 3850


Processing chunk 3800-3850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3850 to 3900


Processing chunk 3850-3900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3900 to 3950


Processing chunk 3900-3950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 3950 to 4000


Processing chunk 3950-4000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4000 to 4050


Processing chunk 4000-4050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4050 to 4100


Processing chunk 4050-4100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4100 to 4150


Processing chunk 4100-4150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4150 to 4200


Processing chunk 4150-4200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4200 to 4250


Processing chunk 4200-4250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4250 to 4300


Processing chunk 4250-4300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4300 to 4350


Processing chunk 4300-4350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4350 to 4400


Processing chunk 4350-4400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4400 to 4450


Processing chunk 4400-4450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4450 to 4500


Processing chunk 4450-4500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4500 to 4550


Processing chunk 4500-4550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4550 to 4600


Processing chunk 4550-4600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4600 to 4650


Processing chunk 4600-4650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4650 to 4700


Processing chunk 4650-4700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4700 to 4750


Processing chunk 4700-4750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4750 to 4800


Processing chunk 4750-4800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4800 to 4850


Processing chunk 4800-4850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4850 to 4900


Processing chunk 4850-4900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4900 to 4950


Processing chunk 4900-4950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 4950 to 5000


Processing chunk 4950-5000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5000 to 5050


Processing chunk 5000-5050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5050 to 5100


Processing chunk 5050-5100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5100 to 5150


Processing chunk 5100-5150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5150 to 5200


Processing chunk 5150-5200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5200 to 5250


Processing chunk 5200-5250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5250 to 5300


Processing chunk 5250-5300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5300 to 5350


Processing chunk 5300-5350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5350 to 5400


Processing chunk 5350-5400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5400 to 5450


Processing chunk 5400-5450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5450 to 5500


Processing chunk 5450-5500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5500 to 5550


Processing chunk 5500-5550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5550 to 5600


Processing chunk 5550-5600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5600 to 5650


Processing chunk 5600-5650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5650 to 5700


Processing chunk 5650-5700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5700 to 5750


Processing chunk 5700-5750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5750 to 5800


Processing chunk 5750-5800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5800 to 5850


Processing chunk 5800-5850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5850 to 5900


Processing chunk 5850-5900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5900 to 5950


Processing chunk 5900-5950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 5950 to 6000


Processing chunk 5950-6000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6000 to 6050


Processing chunk 6000-6050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6050 to 6100


Processing chunk 6050-6100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6100 to 6150


Processing chunk 6100-6150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6150 to 6200


Processing chunk 6150-6200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6200 to 6250


Processing chunk 6200-6250:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6250 to 6300


Processing chunk 6250-6300:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6300 to 6350


Processing chunk 6300-6350:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6350 to 6400


Processing chunk 6350-6400:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6400 to 6450


Processing chunk 6400-6450:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6450 to 6500


Processing chunk 6450-6500:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6500 to 6550


Processing chunk 6500-6550:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6550 to 6600


Processing chunk 6550-6600:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6600 to 6650


Processing chunk 6600-6650:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6650 to 6700


Processing chunk 6650-6700:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6700 to 6750


Processing chunk 6700-6750:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6750 to 6800


Processing chunk 6750-6800:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6800 to 6850


Processing chunk 6800-6850:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6850 to 6900


Processing chunk 6850-6900:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6900 to 6950


Processing chunk 6900-6950:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 6950 to 7000


Processing chunk 6950-7000:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 7000 to 7050


Processing chunk 7000-7050:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 7050 to 7100


Processing chunk 7050-7100:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 7100 to 7150


Processing chunk 7100-7150:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 7150 to 7200


Processing chunk 7150-7200:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Processing train examples 7200 to 7205


Processing chunk 7200-7205:   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]


Combining chunks for train split...


In [None]:

# Sanity check: inspect the first example of the processed train split
print("\nVerifying processed dataset:")
example = urdu_dataset["train"][0]

# Shape of the example's input features
first_feature_shape = example['input_features'].shape
print(f"Input features shape: {first_feature_shape}")

# Number of label token ids stored for this example
first_label_ids = example['labels']
print(f"Labels length: {len(first_label_ids)}")

# Decode the label ids back into text, leaving out special tokens
first_transcription = processor.tokenizer.decode(first_label_ids, skip_special_tokens=True)
print(f"Transcription: {first_transcription}")

## Load Preprocessed Dataset from Disk

In [None]:
# Reload the processed dataset from the local "processed_urdu_dataset" directory
from datasets import load_from_disk

urdu_dataset = load_from_disk("processed_urdu_dataset")

# Inspect the first training example
print("\nFirst example from processed dataset:")
example = urdu_dataset["train"][0]

first_feature_shape = example['input_features'].shape
print(f"Input features shape: {first_feature_shape}")

first_label_ids = example['labels']
print(f"Labels length: {len(first_label_ids)}")

# Decode the label ids back into text, leaving out special tokens
first_transcription = processor.tokenizer.decode(first_label_ids, skip_special_tokens=True)
print(f"Transcription: {first_transcription}")

# Report how many examples each split contains
print("\nDataset statistics:")
for split, split_data in urdu_dataset.items():
    print(f"{split} split size: {len(split_data)} examples")

In [12]:

# Show the dataset's printed summary under a heading
# (a single print call; sep="\n" puts the heading and the summary on separate lines)
print("\nDataset format:", urdu_dataset, sep="\n")


Dataset format:
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7205
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 4982
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5026
    })
})


In [18]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. login() is called with no arguments,
# so no token is hardcoded in the notebook; in this notebook the call rendered
# the input widget shown in the cell output.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
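# Push the dataset to the Hub (left commented out; the output below is from an earlier run of this code, so uncomment only to re-upload)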
# dataset_name = "urdu-common-voice-20-processed-large-v3"
# urdu_dataset.push_to_hub(
#     f"osman31/{dataset_name}",
#     private=True  # Set to False if you want it public
# )

# print(f"Dataset pushed to: https://huggingface.co/datasets/osman31/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to: https://huggingface.co/datasets/osman31/urdu-common-voice-20-processed-large-v3


# Step 2 - Training