## Testing Cuda

In [1]:
# Now verify our GPU setup
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU device name:", torch.cuda.get_device_name(0))
    print("GPU memory allocated:", torch.cuda.memory_allocated(0)/1024**2, "MB")
    print("GPU memory reserved:", torch.cuda.memory_reserved(0)/1024**2, "MB")

PyTorch version: 2.6.0+cu126
CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA GeForce RTX 3060
GPU memory allocated: 0.0 MB
GPU memory reserved: 0.0 MB


## Load Dataset

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
import os

def load_urdu_dataset(data_dir):
    """
    Load Urdu dataset from local TSV files
    data_dir: Directory containing train.tsv, dev.tsv, test.tsv and clips folder
    """
    # Load TSV files
    train_df = pd.read_csv(os.path.join(data_dir, 'train.tsv'), sep='\t')
    dev_df = pd.read_csv(os.path.join(data_dir, 'dev.tsv'), sep='\t')
    test_df = pd.read_csv(os.path.join(data_dir, 'test.tsv'), sep='\t')
    
    # Add full audio path
    clips_dir = os.path.join(data_dir, 'clips')
    train_df['audio'] = train_df['path'].apply(lambda x: os.path.join(clips_dir, x))
    dev_df['audio'] = dev_df['path'].apply(lambda x: os.path.join(clips_dir, x))
    test_df['audio'] = test_df['path'].apply(lambda x: os.path.join(clips_dir, x))
    
    # Keep only necessary columns
    columns_to_keep = ['audio', 'sentence']
    train_df = train_df[columns_to_keep]
    dev_df = dev_df[columns_to_keep]
    test_df = test_df[columns_to_keep]
    
    # Convert to Hugging Face datasets
    train_dataset = Dataset.from_pandas(train_df)
    dev_dataset = Dataset.from_pandas(dev_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    # Combine into DatasetDict
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'validation': dev_dataset,
        'test': test_dataset
    })
    
    return dataset_dict

# Load the dataset
# Replace with your actual data directory path
data_dir = "S:/cv-corpus-20.0-2024-12-06/ur"
urdu_dataset = load_urdu_dataset(data_dir)

# Print dataset statistics
print("\nDataset statistics:")
for split in urdu_dataset:
    print(f"{split}: {len(urdu_dataset[split])} examples")

# Print first example from training set
print("\nExample from training set:")
print(urdu_dataset['train'][0])


Dataset statistics:
train: 7205 examples
validation: 4982 examples
test: 5026 examples

Example from training set:
{'audio': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'sentence': 'طرح طرح کے پرندے'}


In [3]:
urdu_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 7205
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4982
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5026
    })
})

## Prepare Feature Extractor, Tokenizer and Data

In [4]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

# Initialize the feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")

# Initialize the tokenizer with Urdu language
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="Urdu", task="transcribe")

# Let's also create the processor which combines both feature_extractor and tokenizer
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="Urdu", task="transcribe")

# Let's test the tokenizer with a sample Urdu text from our dataset
sample_text = urdu_dataset["train"][0]["sentence"]
print("\nTesting tokenizer:")
print("Original text:", sample_text)
encoded = tokenizer(sample_text)
print("Encoded text:", encoded)

decoded = tokenizer.decode(encoded.input_ids)
print("Decoded text:", decoded)

# Let's also look at the feature extractor's configuration
print("\nFeature extractor config:")
print("Sampling rate:", feature_extractor.sampling_rate)
print("Feature size:", feature_extractor.feature_size)
print("Padding value:", feature_extractor.padding_value)


Testing tokenizer:
Original text: طرح طرح کے پرندے
Encoded text: {'input_ids': [50258, 50290, 50359, 50363, 9566, 2288, 5016, 23032, 2288, 5016, 24049, 21453, 2288, 41260, 7369, 50257], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decoded text: <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>طرح طرح کے پرندے<|endoftext|>

Feature extractor config:
Sampling rate: 16000
Feature size: 80
Padding value: 0.0


## Testing tokenizer

In [5]:
# Test tokenizer with and without special tokens
input_str = urdu_dataset["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print("\nTokenizer test with special tokens:")
print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


Tokenizer test with special tokens:
Input:                 طرح طرح کے پرندے
Decoded w/ special:    <|startoftranscript|><|ur|><|transcribe|><|notimestamps|>طرح طرح کے پرندے<|endoftext|>
Decoded w/out special: طرح طرح کے پرندے
Are equal:             True


## check default sampling rate of 1st training example

In [6]:
print(urdu_dataset["train"][0])

{'audio': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'sentence': 'طرح طرح کے پرندے'}


## fixing sampling rate

In [7]:
# Now let's prepare our dataset with the correct sampling rate
# First, let's check the current audio format
from datasets import Audio
import numpy as np

# Cast the audio column to audio format
urdu_dataset = urdu_dataset.cast_column("audio", Audio(sampling_rate=16000))

# Let's examine the first audio file after casting
print("\nAudio format check:")
print("First example audio details:")
audio = urdu_dataset["train"][0]["audio"]
print(f"Path: {audio['path']}")
print(f"Sampling rate: {audio['sampling_rate']}")
print(f"Array shape: {audio['array'].shape}")


Audio format check:
First example audio details:
Path: S:/cv-corpus-20.0-2024-12-06/ur\clips\common_voice_ur_31898340.mp3
Sampling rate: 16000
Array shape: (50688,)


In [8]:
#check the format of 1st training example
print(urdu_dataset["train"][0])


{'audio': {'path': 'S:/cv-corpus-20.0-2024-12-06/ur\\clips\\common_voice_ur_31898340.mp3', 'array': array([1.39698386e-09, 1.74622983e-09, 2.91038305e-10, ...,
       1.17505246e-04, 1.56884955e-04, 4.67942300e-05]), 'sampling_rate': 16000}, 'sentence': 'طرح طرح کے پرندے'}


In [9]:
def prepare_dataset(batch, processor):
    """
    Prepare the dataset for training, leaving padding and tensor conversion to the data collator:
    1. Load and resample audio
    2. Compute log-Mel input features
    3. Encode target text to label ids (without padding or tensor conversion)
    """
    # Load audio
    audio = batch["audio"]
    
    # Compute log-Mel input features
    batch["input_features"] = processor.feature_extractor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    
    # Encode target text to label ids (without padding)
    batch["labels"] = processor.tokenizer(
        batch["sentence"],
        max_length=1024,
        truncation=True
    ).input_ids

    return batch

In [10]:
# Process dataset split by split with very small batches
def process_and_save_split(dataset_split, split_name, processor, save_dir="./processed_dataset"):
    """Process a single split of the dataset with minimal memory usage"""
    os.makedirs(save_dir, exist_ok=True)
    
    # Process in chunks of 100 examples
    chunk_size = 100
    processed_chunks = []
    
    for i in range(0, len(dataset_split), chunk_size):
        end_idx = min(i + chunk_size, len(dataset_split))
        print(f"\nProcessing {split_name} examples {i} to {end_idx}")
        
        # Select a small chunk
        chunk = dataset_split.select(range(i, end_idx))
        
        # Process the chunk
        processed_chunk = chunk.map(
            function=prepare_dataset,
            fn_kwargs={"processor": processor},
            remove_columns=chunk.column_names,
            desc=f"Processing chunk {i}-{end_idx}",
            batch_size=4,  # Very small batch size
            writer_batch_size=50  # Small writer batch size
        )
        
        # Save the processed chunk immediately
        chunk_path = f"{save_dir}/{split_name}_chunk_{i}"
        processed_chunk.save_to_disk(chunk_path)
        processed_chunks.append(chunk_path)
        
        # Clear memory
        del chunk, processed_chunk
        import gc
        gc.collect()
    
    # Combine all chunks for this split
    print(f"\nCombining chunks for {split_name} split...")
    combined_split = concatenate_datasets([load_from_disk(path) for path in processed_chunks])
    
    # Save the combined split
    combined_split.save_to_disk(f"{save_dir}/{split_name}")
    
    # Clean up chunk files
    for chunk_path in processed_chunks:
        import shutil
        shutil.rmtree(chunk_path)
    
    return combined_split

# Process each split separately
try:
    from datasets import concatenate_datasets, load_from_disk
    
    print("Processing dataset split by split...")
    processed_dataset = {}
    
    for split in urdu_dataset.keys():
        print(f"\nProcessing {split} split...")
        processed_dataset[split] = process_and_save_split(
            urdu_dataset[split], 
            split, 
            processor,
            save_dir="./processed_urdu_dataset"
        )
    
    # Create the final DatasetDict
    from datasets import DatasetDict
    urdu_dataset = DatasetDict(processed_dataset)
    
    print("\nDataset processing completed!")
    
    # Verify the processed dataset
    for split in urdu_dataset.keys():
        print(f"\n{split} split size:", len(urdu_dataset[split]))
        
except Exception as e:
    print(f"An error occurred: {str(e)}")
    raise e

Processing dataset split by split...

Processing train split...

Processing train examples 0 to 100


Processing chunk 0-100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 100 to 200


Processing chunk 100-200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 200 to 300


Processing chunk 200-300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 300 to 400


Processing chunk 300-400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 400 to 500


Processing chunk 400-500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 500 to 600


Processing chunk 500-600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 600 to 700


Processing chunk 600-700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 700 to 800


Processing chunk 700-800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 800 to 900


Processing chunk 800-900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 900 to 1000


Processing chunk 900-1000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1000 to 1100


Processing chunk 1000-1100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1100 to 1200


Processing chunk 1100-1200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1200 to 1300


Processing chunk 1200-1300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1300 to 1400


Processing chunk 1300-1400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1400 to 1500


Processing chunk 1400-1500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1500 to 1600


Processing chunk 1500-1600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1600 to 1700


Processing chunk 1600-1700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1700 to 1800


Processing chunk 1700-1800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1800 to 1900


Processing chunk 1800-1900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 1900 to 2000


Processing chunk 1900-2000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2000 to 2100


Processing chunk 2000-2100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2100 to 2200


Processing chunk 2100-2200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2200 to 2300


Processing chunk 2200-2300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2300 to 2400


Processing chunk 2300-2400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2400 to 2500


Processing chunk 2400-2500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2500 to 2600


Processing chunk 2500-2600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2600 to 2700


Processing chunk 2600-2700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2700 to 2800


Processing chunk 2700-2800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2800 to 2900


Processing chunk 2800-2900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 2900 to 3000


Processing chunk 2900-3000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3000 to 3100


Processing chunk 3000-3100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3100 to 3200


Processing chunk 3100-3200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3200 to 3300


Processing chunk 3200-3300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3300 to 3400


Processing chunk 3300-3400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3400 to 3500


Processing chunk 3400-3500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3500 to 3600


Processing chunk 3500-3600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3600 to 3700


Processing chunk 3600-3700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3700 to 3800


Processing chunk 3700-3800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3800 to 3900


Processing chunk 3800-3900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 3900 to 4000


Processing chunk 3900-4000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4000 to 4100


Processing chunk 4000-4100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4100 to 4200


Processing chunk 4100-4200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4200 to 4300


Processing chunk 4200-4300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4300 to 4400


Processing chunk 4300-4400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4400 to 4500


Processing chunk 4400-4500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4500 to 4600


Processing chunk 4500-4600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4600 to 4700


Processing chunk 4600-4700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4700 to 4800


Processing chunk 4700-4800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4800 to 4900


Processing chunk 4800-4900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 4900 to 5000


Processing chunk 4900-5000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5000 to 5100


Processing chunk 5000-5100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5100 to 5200


Processing chunk 5100-5200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5200 to 5300


Processing chunk 5200-5300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5300 to 5400


Processing chunk 5300-5400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5400 to 5500


Processing chunk 5400-5500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5500 to 5600


Processing chunk 5500-5600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5600 to 5700


Processing chunk 5600-5700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5700 to 5800


Processing chunk 5700-5800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5800 to 5900


Processing chunk 5800-5900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 5900 to 6000


Processing chunk 5900-6000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6000 to 6100


Processing chunk 6000-6100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6100 to 6200


Processing chunk 6100-6200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6200 to 6300


Processing chunk 6200-6300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6300 to 6400


Processing chunk 6300-6400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6400 to 6500


Processing chunk 6400-6500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6500 to 6600


Processing chunk 6500-6600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6600 to 6700


Processing chunk 6600-6700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6700 to 6800


Processing chunk 6700-6800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6800 to 6900


Processing chunk 6800-6900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 6900 to 7000


Processing chunk 6900-7000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 7000 to 7100


Processing chunk 7000-7100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 7100 to 7200


Processing chunk 7100-7200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing train examples 7200 to 7205


Processing chunk 7200-7205:   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]


Combining chunks for train split...


Saving the dataset (0/14 shards):   0%|          | 0/7205 [00:00<?, ? examples/s]


Processing validation split...

Processing validation examples 0 to 100


Processing chunk 0-100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 100 to 200


Processing chunk 100-200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 200 to 300


Processing chunk 200-300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 300 to 400


Processing chunk 300-400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 400 to 500


Processing chunk 400-500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 500 to 600


Processing chunk 500-600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 600 to 700


Processing chunk 600-700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 700 to 800


Processing chunk 700-800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 800 to 900


Processing chunk 800-900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 900 to 1000


Processing chunk 900-1000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1000 to 1100


Processing chunk 1000-1100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1100 to 1200


Processing chunk 1100-1200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1200 to 1300


Processing chunk 1200-1300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1300 to 1400


Processing chunk 1300-1400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1400 to 1500


Processing chunk 1400-1500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1500 to 1600


Processing chunk 1500-1600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1600 to 1700


Processing chunk 1600-1700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1700 to 1800


Processing chunk 1700-1800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1800 to 1900


Processing chunk 1800-1900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 1900 to 2000


Processing chunk 1900-2000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2000 to 2100


Processing chunk 2000-2100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2100 to 2200


Processing chunk 2100-2200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2200 to 2300


Processing chunk 2200-2300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2300 to 2400


Processing chunk 2300-2400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2400 to 2500


Processing chunk 2400-2500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2500 to 2600


Processing chunk 2500-2600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2600 to 2700


Processing chunk 2600-2700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2700 to 2800


Processing chunk 2700-2800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2800 to 2900


Processing chunk 2800-2900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 2900 to 3000


Processing chunk 2900-3000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3000 to 3100


Processing chunk 3000-3100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3100 to 3200


Processing chunk 3100-3200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3200 to 3300


Processing chunk 3200-3300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3300 to 3400


Processing chunk 3300-3400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3400 to 3500


Processing chunk 3400-3500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3500 to 3600


Processing chunk 3500-3600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3600 to 3700


Processing chunk 3600-3700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3700 to 3800


Processing chunk 3700-3800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3800 to 3900


Processing chunk 3800-3900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 3900 to 4000


Processing chunk 3900-4000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4000 to 4100


Processing chunk 4000-4100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4100 to 4200


Processing chunk 4100-4200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4200 to 4300


Processing chunk 4200-4300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4300 to 4400


Processing chunk 4300-4400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4400 to 4500


Processing chunk 4400-4500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4500 to 4600


Processing chunk 4500-4600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4600 to 4700


Processing chunk 4600-4700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4700 to 4800


Processing chunk 4700-4800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4800 to 4900


Processing chunk 4800-4900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing validation examples 4900 to 4982


Processing chunk 4900-4982:   0%|          | 0/82 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82 [00:00<?, ? examples/s]


Combining chunks for validation split...


Saving the dataset (0/10 shards):   0%|          | 0/4982 [00:00<?, ? examples/s]


Processing test split...

Processing test examples 0 to 100


Processing chunk 0-100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 100 to 200


Processing chunk 100-200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 200 to 300


Processing chunk 200-300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 300 to 400


Processing chunk 300-400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 400 to 500


Processing chunk 400-500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 500 to 600


Processing chunk 500-600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 600 to 700


Processing chunk 600-700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 700 to 800


Processing chunk 700-800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 800 to 900


Processing chunk 800-900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 900 to 1000


Processing chunk 900-1000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1000 to 1100


Processing chunk 1000-1100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1100 to 1200


Processing chunk 1100-1200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1200 to 1300


Processing chunk 1200-1300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1300 to 1400


Processing chunk 1300-1400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1400 to 1500


Processing chunk 1400-1500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1500 to 1600


Processing chunk 1500-1600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1600 to 1700


Processing chunk 1600-1700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1700 to 1800


Processing chunk 1700-1800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1800 to 1900


Processing chunk 1800-1900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 1900 to 2000


Processing chunk 1900-2000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2000 to 2100


Processing chunk 2000-2100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2100 to 2200


Processing chunk 2100-2200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2200 to 2300


Processing chunk 2200-2300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2300 to 2400


Processing chunk 2300-2400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2400 to 2500


Processing chunk 2400-2500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2500 to 2600


Processing chunk 2500-2600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2600 to 2700


Processing chunk 2600-2700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2700 to 2800


Processing chunk 2700-2800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2800 to 2900


Processing chunk 2800-2900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 2900 to 3000


Processing chunk 2900-3000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3000 to 3100


Processing chunk 3000-3100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3100 to 3200


Processing chunk 3100-3200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3200 to 3300


Processing chunk 3200-3300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3300 to 3400


Processing chunk 3300-3400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3400 to 3500


Processing chunk 3400-3500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3500 to 3600


Processing chunk 3500-3600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3600 to 3700


Processing chunk 3600-3700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3700 to 3800


Processing chunk 3700-3800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3800 to 3900


Processing chunk 3800-3900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 3900 to 4000


Processing chunk 3900-4000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4000 to 4100


Processing chunk 4000-4100:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4100 to 4200


Processing chunk 4100-4200:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4200 to 4300


Processing chunk 4200-4300:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4300 to 4400


Processing chunk 4300-4400:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4400 to 4500


Processing chunk 4400-4500:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4500 to 4600


Processing chunk 4500-4600:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4600 to 4700


Processing chunk 4600-4700:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4700 to 4800


Processing chunk 4700-4800:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4800 to 4900


Processing chunk 4800-4900:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 4900 to 5000


Processing chunk 4900-5000:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


Processing test examples 5000 to 5026


Processing chunk 5000-5026:   0%|          | 0/26 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/26 [00:00<?, ? examples/s]


Combining chunks for test split...


Saving the dataset (0/10 shards):   0%|          | 0/5026 [00:00<?, ? examples/s]


Dataset processing completed!

train split size: 7205

validation split size: 4982

test split size: 5026


## Load Preprocessed dataset From Disk

In [13]:
from datasets import load_from_disk, DatasetDict

# Load each split separately
urdu_dataset = DatasetDict({
    "train": load_from_disk("./processed_urdu_dataset/train"),
    "validation": load_from_disk("./processed_urdu_dataset/validation"),
    "test": load_from_disk("./processed_urdu_dataset/test")
})

# Check the first example from each split
for split in urdu_dataset.keys():
    print(f"\n=== Example from {split} split ===")
    example = urdu_dataset[split][0]
    
    # Print the input features shape
    print(f"Input features shape: {example['input_features'].shape}")
    
    # Decode the labels back to text
    decoded_text = processor.tokenizer.decode(example['labels'], skip_special_tokens=True)
    print(f"Transcription: {decoded_text}")
    
# Print dataset sizes
print("\n=== Dataset Statistics ===")
for split in urdu_dataset.keys():
    print(f"{split} split size: {len(urdu_dataset[split])} examples")


=== Example from train split ===


AttributeError: 'list' object has no attribute 'shape'

In [12]:

# Print dataset format
print("\nDataset format:")
print(urdu_dataset)


Dataset format:
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 7205
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 4982
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5026
    })
})


In [18]:
from huggingface_hub import login
# Login to Hugging Face (you'll be prompted for your token)
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# # Push the dataset to the Hub
# dataset_name = "urdu-common-voice-20-processed-large-v3"
# urdu_dataset.push_to_hub(
#     f"osman31/{dataset_name}",
#     private=True  # Set to False if you want it public
# )

# print(f"Dataset pushed to: https://huggingface.co/datasets/osman31/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset pushed to: https://huggingface.co/datasets/osman31/urdu-common-voice-20-processed-large-v3


# Step 2 - Training