In [None]:
# Install into the running kernel's environment (%pip, not !pip) and pin the
# version that this notebook was last run with so re-runs are reproducible.
%pip install -q datasets==3.5.0

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
# %pip targets the kernel's interpreter; the requirement is quoted so the shell
# does not try to glob-expand the "[hf_xet]" extra.
%pip install -q "huggingface_hub[hf_xet]"

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.2


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
import numpy as np
import time
from datetime import datetime

class SimpleASRModel(nn.Module):
    """
    Minimal LSTM-based sequence classifier used as a stand-in for an ASR model.

    A single batch-first LSTM reads the input sequence and a linear layer turns
    the LSTM output at the final time-step into `output_dim` logits.
    Note: with raw waveforms each time-step carries one value, so input_dim is 1.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Recurrent encoder; batch_first=True means inputs are (batch, time, feature)
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        # Projection from the LSTM hidden size to the output classes
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

        print(f"[MODEL] Initialized SimpleASRModel with input_dim={input_dim}, hidden_dim={hidden_dim}, output_dim={output_dim}")

    def forward(self, x):
        """Map x of shape (batch, time, feature) to logits of shape (batch, output_dim)."""
        sequence_out, _state = self.lstm(x)
        # Only the output at the final time-step feeds the classifier
        final_step = sequence_out[:, -1, :]
        return self.fc(final_step)

def load_data(dataset_name, split):
    """
    Load one split of a dataset from Hugging Face and log basic facts about it.

    Parameters:
      - dataset_name: the identifier of the dataset
          (e.g., 'united-we-care/United-Syn-Med' or 'mozilla-foundation/common_voice_11_0')
      - split: the dataset split to load (e.g., 'train')

    Returns:
      The object returned by `load_dataset(dataset_name, split=split)`.

    Raises:
      Whatever `load_dataset` raises; the error is logged and re-raised unchanged.
    """
    print(f"[DATA] Loading dataset: {dataset_name}, split: {split}")
    start_time = time.time()

    # Only the load itself is guarded, so a problem while printing the summary
    # below is not misreported as "Failed to load dataset".
    try:
        dataset = load_dataset(dataset_name, split=split)
    except Exception as e:
        print(f"[ERROR] Failed to load dataset: {e}")
        raise

    load_time = time.time() - start_time
    num_examples = len(dataset)
    print(f"[DATA] Successfully loaded dataset in {load_time:.2f} seconds")
    print(f"[DATA] Dataset size: {num_examples} examples")
    # An empty split has no first example; indexing it would raise IndexError.
    if num_examples > 0:
        print(f"[DATA] First example keys: {list(dataset[0].keys())}")
    else:
        print("[DATA] Dataset is empty; no example keys to show")
    return dataset

def collate_fn(batch):
    """
    Collate dataset items into a zero-padded audio tensor plus their transcripts.

    The field names are chosen by inspecting the first item of the batch only:
      - audio: 'speech', otherwise 'audio' (a mapping with an 'array' entry)
      - text: the first of 'transcript', 'sentence', 'spoken_text', 'normalized_text'

    Each audio array is right-padded with zeros (np.pad) to the longest length
    in the batch; the stacked result becomes a float32 tensor with a trailing
    feature dimension of size 1.

    Returns:
      (padded_speeches, transcripts): the padded tensor and the list of texts.

    Raises:
      ValueError: if the first item has no recognized audio or transcript field.
    """
    # Log batch size
    print(f"[COLLATE] Processing batch of size {len(batch)}")

    first_item = batch[0]

    # Pick the audio field; 'speech' takes precedence over 'audio'
    if 'speech' not in first_item and 'audio' not in first_item:
        error_msg = f"[ERROR] No recognized audio field in dataset item. Available keys: {list(batch[0].keys())}"
        print(error_msg)
        raise ValueError(error_msg)
    if 'speech' in first_item:
        print("[COLLATE] Using 'speech' field for audio data")
        speeches = [np.array(sample['speech']) for sample in batch]
    else:
        print("[COLLATE] Using 'audio.array' field for audio data")
        # The audio entry is expected to be a mapping holding the samples under 'array'
        speeches = [np.array(sample['audio']['array']) for sample in batch]

    # Pick the transcript field; candidates are tried in priority order
    text_fields = (
        ('transcript', "[COLLATE] Using 'transcript' field for text data"),
        ('sentence', "[COLLATE] Using 'sentence' field for text data"),
        ('spoken_text', "[COLLATE] Using 'spoken_text' field for text data"),
        ('normalized_text', "[COLLATE] Using 'normalized_text' field for text data"),
    )
    for text_key, message in text_fields:
        if text_key in first_item:
            print(message)
            transcripts = [sample[text_key] for sample in batch]
            break
    else:
        error_msg = f"[ERROR] No recognized transcript field in dataset item. Available keys: {list(batch[0].keys())}"
        print(error_msg)
        raise ValueError(error_msg)

    # Per-item lengths drive both the padding and the logged statistics
    lengths = [clip.shape[0] for clip in speeches]
    max_len = max(lengths)
    print(f"[COLLATE] Maximum sequence length in batch: {max_len}")

    min_len = min(lengths)
    avg_len = sum(lengths) / len(lengths)
    print(f"[COLLATE] Audio length statistics - Min: {min_len}, Max: {max_len}, Avg: {avg_len:.2f}")

    # Right-pad every clip with zeros up to the longest clip in the batch
    padded_clips = []
    for clip, clip_len in zip(speeches, lengths):
        padded_clips.append(np.pad(clip, (0, max_len - clip_len), mode='constant'))
    # Stack into one NumPy array first, then convert and append the feature (channel) axis
    stacked = np.array(padded_clips)
    padded_speeches = torch.tensor(stacked, dtype=torch.float32).unsqueeze(-1).contiguous()

    print(f"[COLLATE] Final tensor shape: {padded_speeches.shape}")
    print(f"[COLLATE] Sample transcript: '{transcripts[0][:50]}{'...' if len(transcripts[0]) > 50 else ''}'")

    return padded_speeches, transcripts

def create_dataloader(dataset, batch_size=16, shuffle=True):
    """
    Wrap `dataset` in a DataLoader that batches items with `collate_fn`.

    Parameters:
      - dataset: the dataset handed to DataLoader.
      - batch_size: number of items per batch.
      - shuffle: whether the DataLoader shuffles the items.

    Returns:
      The constructed DataLoader. Any exception raised while building it or
      counting its batches is logged and re-raised.
    """
    print(f"[DATALOADER] Creating DataLoader with batch_size={batch_size}, shuffle={shuffle}")
    try:
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )
        # len() is evaluated inside the try so a failure here is logged as well
        print(f"[DATALOADER] Successfully created DataLoader with {len(dataloader)} batches")
    except Exception as e:
        print(f"[ERROR] Failed to create DataLoader: {e}")
        raise
    else:
        return dataloader

def train(model, dataloader, epochs=10, learning_rate=0.001, device='cpu'):
    """
    A simple training loop for the ASR model.

    For demonstration purposes, this loop uses dummy targets (class 0 for every
    example); the transcripts yielded by the dataloader are ignored.
    In an actual ASR system, you would perform sequence-to-sequence training.

    Parameters:
      - model: the ASR model to train.
      - dataloader: a sized iterable yielding (inputs, transcripts) batches.
      - epochs: number of training epochs.
      - learning_rate: learning rate for the Adam optimizer.
      - device: device to run training on; a string such as 'cpu', 'cuda',
        'cuda:0' or a torch.device. A CUDA device falls back to CPU when CUDA
        is not available.

    Returns:
      A list with the average loss of each epoch (0.0 for an epoch that saw no
      batches). The list is empty when `epochs` is 0.
    """
    print(f"\n[TRAIN] Starting training with {epochs} epochs, learning_rate={learning_rate}, device={device}")
    num_batches = len(dataloader)
    print(f"[TRAIN] Model will process {num_batches} batches per epoch")

    # Normalize to torch.device first: comparing a torch.device (or 'cuda:0')
    # with the string 'cuda' is never equal, which would skip the fallback.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        print("[WARNING] CUDA requested but not available. Falling back to CPU.")
        device = torch.device('cpu')

    model.to(device)
    print(f"[TRAIN] Model moved to device: {device}")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    print(f"[TRAIN] Using Adam optimizer with learning_rate={learning_rate}")

    model.train()
    print("[TRAIN] Model set to training mode")

    total_start_time = time.time()
    epoch_losses = []

    for epoch in range(epochs):
        epoch_start_time = time.time()
        print(f"\n[TRAIN] Starting Epoch {epoch+1}/{epochs} at {datetime.now().strftime('%H:%M:%S')}")

        epoch_loss = 0.0
        batch_count = 0

        for batch_idx, (inputs, _transcripts) in enumerate(dataloader):
            batch_start_time = time.time()
            print(f"[TRAIN] Epoch {epoch+1}, Batch {batch_idx+1}/{num_batches}")

            inputs = inputs.to(device).contiguous()
            print(f"[TRAIN] Input shape: {inputs.shape}")

            # Dummy target: using class 0 for all examples
            targets = torch.zeros(inputs.size(0), dtype=torch.long, device=device)
            print(f"[TRAIN] Created dummy targets with shape: {targets.shape}")

            # Forward pass
            print("[TRAIN] Starting forward pass")
            outputs = model(inputs)
            print(f"[TRAIN] Forward pass complete, output shape: {outputs.shape}")

            # Compute loss
            loss = criterion(outputs, targets)
            batch_loss = loss.item()
            print(f"[TRAIN] Batch loss: {batch_loss:.4f}")

            # Backward pass
            print("[TRAIN] Starting backward pass")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("[TRAIN] Backward pass complete")

            epoch_loss += batch_loss
            batch_count += 1

            batch_time = time.time() - batch_start_time
            print(f"[TRAIN] Batch processed in {batch_time:.2f} seconds")

            # Print progress every 5 batches or for the last batch
            if (batch_idx + 1) % 5 == 0 or batch_idx + 1 == num_batches:
                avg_loss = epoch_loss / batch_count
                print(f"[TRAIN] Epoch {epoch+1}, Progress: {batch_idx+1}/{num_batches} batches, Avg Loss: {avg_loss:.4f}")

        epoch_time = time.time() - epoch_start_time
        # Average over the batches actually processed; an empty dataloader
        # must not trigger a division by zero.
        epoch_avg_loss = epoch_loss / batch_count if batch_count > 0 else 0.0
        epoch_losses.append(epoch_avg_loss)
        print(f"[TRAIN] Epoch {epoch+1}/{epochs} completed in {epoch_time:.2f} seconds")
        print(f"[TRAIN] Epoch {epoch+1} average loss: {epoch_avg_loss:.4f}")

    total_time = time.time() - total_start_time
    print(f"\n[TRAIN] Training complete! Total time: {total_time:.2f} seconds")
    # With zero epochs there is no per-epoch average to report.
    if epochs > 0:
        print(f"[TRAIN] Average time per epoch: {total_time/epochs:.2f} seconds")

    return epoch_losses

def evaluate(model, dataloader, device='cpu'):
    """
    Evaluate the model using a dummy accuracy metric.

    Every example is compared against the dummy target class 0; the transcripts
    yielded by the dataloader are ignored. For a real ASR system, evaluation
    should involve decoding the output and computing metrics like WER.

    Parameters:
      - model: the model to evaluate; it is moved to `device` and put in eval mode.
      - dataloader: a sized iterable yielding (inputs, transcripts) batches.
      - device: a string such as 'cpu', 'cuda', 'cuda:0' or a torch.device. A
        CUDA device falls back to CPU when CUDA is not available.

    Returns:
      The overall accuracy in percent (0 when no examples were seen).
    """
    print(f"\n[EVAL] Starting evaluation on {device}")

    # Normalize to torch.device first: comparing a torch.device (or 'cuda:0')
    # with the string 'cuda' is never equal, which would skip the fallback.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        print("[WARNING] CUDA requested but not available. Falling back to CPU.")
        device = torch.device('cpu')

    model.to(device)
    model.eval()
    print("[EVAL] Model set to evaluation mode")

    correct = 0
    total = 0
    num_batches = len(dataloader)
    eval_start_time = time.time()

    with torch.no_grad():
        for batch_idx, (inputs, _transcripts) in enumerate(dataloader):
            print(f"[EVAL] Processing batch {batch_idx+1}/{num_batches}")

            batch_start_time = time.time()
            inputs = inputs.to(device)
            print(f"[EVAL] Input shape: {inputs.shape}")

            # Dummy targets for demonstration
            targets = torch.zeros(inputs.size(0), dtype=torch.long, device=device)

            # Forward pass
            print("[EVAL] Running forward pass")
            outputs = model(inputs)
            print(f"[EVAL] Output shape: {outputs.shape}")

            # Get predictions: index of the largest logit per example
            predicted = outputs.argmax(dim=1)
            print(f"[EVAL] Predictions shape: {predicted.shape}")

            # Update statistics
            batch_total = targets.size(0)
            batch_correct = (predicted == targets).sum().item()
            total += batch_total
            correct += batch_correct

            batch_time = time.time() - batch_start_time
            # Guard against an empty batch so the log line cannot divide by zero
            batch_accuracy = 100 * batch_correct / batch_total if batch_total > 0 else 0
            print(f"[EVAL] Batch {batch_idx+1} processed in {batch_time:.2f} seconds")
            print(f"[EVAL] Batch {batch_idx+1} accuracy: {batch_accuracy:.2f}% ({batch_correct}/{batch_total})")

    accuracy = 100 * correct / total if total > 0 else 0
    eval_time = time.time() - eval_start_time
    print(f"\n[EVAL] Evaluation completed in {eval_time:.2f} seconds")
    print(f"[EVAL] Overall accuracy: {accuracy:.2f}% ({correct}/{total})")

    return accuracy


In [None]:
import torch
# from asr_training import SimpleASRModel, load_data, create_dataloader, train, evaluate
from datasets import load_dataset
import time
from datetime import datetime
from torch.utils.data import random_split

def load_common_voice(dataset_id="MikhailT/lj-speech", config_name="default", split="full"):
    """
    Load the speech dataset used by this notebook from Hugging Face.

    The function name is historical and kept so existing callers keep working:
    with its default arguments it loads the "MikhailT/lj-speech" dataset
    (config "default", split "full"), not Mozilla Common Voice.

    Parameters:
      - dataset_id: Hugging Face dataset identifier.
      - config_name: dataset configuration name passed to `load_dataset`.
      - split: name of the split to load.

    Returns:
      The object returned by `load_dataset`.

    Raises:
      Whatever `load_dataset` raises; the error is logged and re-raised unchanged.
    """
    print(f"[DATASET] Starting dataset loading at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    start_time = time.time()

    try:
        print(f"[DATASET] Attempting to load dataset '{dataset_id}' from Hugging Face")
        # Log the arguments that are actually passed below
        print(f"[DATASET] Parameters: {dataset_id}, config={config_name}, num_proc=1, split={split}")

        dataset = load_dataset(
            dataset_id,
            config_name,
            num_proc=1,  # Reduce parallel processes
            # NOTE(review): trust_remote_code=True allows code shipped with the
            # dataset repository to run locally; only use it with trusted repos.
            trust_remote_code=True,
            split=split,
        )
    except Exception as e:
        print(f"[ERROR] Failed to load dataset: {str(e)}")
        raise

    load_time = time.time() - start_time
    print(f"[DATASET] Successfully loaded dataset in {load_time:.2f} seconds")
    print(f"[DATASET] Dataset size: {len(dataset)} examples")

    # Print sample information
    if len(dataset) > 0:
        # Fetch the first example once instead of re-reading it for every line
        first_example = dataset[0]
        print(f"[DATASET] First example keys: {list(first_example.keys())}")
        # Print audio information for the first example
        if 'audio' in first_example:
            audio = first_example['audio']
            print(f"[DATASET] First example audio sampling rate: {audio['sampling_rate']} Hz")
            print(f"[DATASET] First example audio array length: {len(audio['array'])}")
        # Print text information, using the same candidate fields as collate_fn
        for text_key in ('transcript', 'sentence', 'spoken_text', 'normalized_text'):
            if text_key in first_example:
                text = first_example[text_key]
                print(f"[DATASET] First example {text_key}: '{text[:50]}{'...' if len(text) > 50 else ''}'")
                break

    return dataset

def main(seed=42):
    """
    Run the whole demo pipeline: load data, split it, build the model, train, evaluate.

    Each stage is timed and logged. If a stage raises, the error is logged and
    the function returns None instead of propagating the exception.

    Parameters:
      - seed: seed for the generator that drives the train/validation split,
        so the same examples land in each subset on every run.

    Returns:
      The trained SimpleASRModel, or None if any stage failed.
    """
    print("\n[MAIN] Starting ASR model training and evaluation")
    print(f"[MAIN] Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"[MAIN] PyTorch version: {torch.__version__}")
    print(f"[MAIN] CUDA available: {torch.cuda.is_available()}")

    # Disable cuDNN
    # NOTE(review): this is a process-wide setting; the reason for disabling it
    # is not documented in this notebook - confirm it is still needed.
    print("[MAIN] Disabling cuDNN")
    torch.backends.cudnn.enabled = False
    print(f"[MAIN] cuDNN enabled: {torch.backends.cudnn.enabled}")

    # Step 1: Load dataset
    print("\n[MAIN] Step 1: Loading dataset")
    try:
        dataset_start_time = time.time()
        dataset = load_common_voice()
        dataset_time = time.time() - dataset_start_time
        print(f"[MAIN] Dataset loading completed in {dataset_time:.2f} seconds")
    except Exception as e:
        print(f"[MAIN] Failed to load dataset: {e}")
        return

    # Step 2: Create DataLoaders
    print("\n[MAIN] Step 2: Creating DataLoader")
    try:
        batch_size = 10
        print(f"[MAIN] Creating DataLoader with batch_size={batch_size}")
        dataloader_start_time = time.time()
        # 90% of the examples go to training, the remainder to validation
        dataset_size = len(dataset)
        train_size = int(0.9 * dataset_size)
        val_size = dataset_size - train_size

        # Split the dataset into train and validation subsets; the seeded
        # generator makes the split reproducible across runs
        split_generator = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

        # Create DataLoaders for each subset
        dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        dataloader_val = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        dataloader_time = time.time() - dataloader_start_time
        print(f"[MAIN] DataLoader creation completed in {dataloader_time:.2f} seconds")
        print(f"[MAIN] Number of batches in train: {len(dataloader_train)}")
        print(f"[MAIN] Number of batches in validation: {len(dataloader_val)}")
    except Exception as e:
        print(f"[MAIN] Failed to create DataLoader: {e}")
        return

    # Step 3: Define model hyperparameters
    print("\n[MAIN] Step 3: Defining model hyperparameters")
    input_dim = 1      # collate_fn adds one feature per time-step (raw waveform)
    hidden_dim = 60    # Hidden dimension for LSTM
    output_dim = 10    # Placeholder number of output classes (targets are dummies)

    print(f"[MAIN] Model hyperparameters:")
    print(f"[MAIN] - input_dim: {input_dim}")
    print(f"[MAIN] - hidden_dim: {hidden_dim}")
    print(f"[MAIN] - output_dim: {output_dim}")

    # Step 4: Initialize the model
    print("\n[MAIN] Step 4: Initializing the model")
    try:
        model_start_time = time.time()
        model = SimpleASRModel(input_dim, hidden_dim, output_dim)
        model_time = time.time() - model_start_time
        print(f"[MAIN] Model initialization completed in {model_time:.2f} seconds")
        print(f"[MAIN] Model architecture:\n{model}")

        # Count model parameters
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"[MAIN] Total parameters: {total_params:,}")
        print(f"[MAIN] Trainable parameters: {trainable_params:,}")
    except Exception as e:
        print(f"[MAIN] Failed to initialize model: {e}")
        return

    # Step 5: Set the device
    print("\n[MAIN] Step 5: Setting up device")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[MAIN] Using device: {device}")

    if device.type == 'cuda':
        print(f"[MAIN] CUDA Device: {torch.cuda.get_device_name(0)}")
        print(f"[MAIN] CUDA Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
        print(f"[MAIN] CUDA Memory cached: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

    # Step 6: Train the model
    print("\n[MAIN] Step 6: Training the model")
    try:
        epochs = 5
        learning_rate = 0.001
        print(f"[MAIN] Training parameters:")
        print(f"[MAIN] - epochs: {epochs}")
        print(f"[MAIN] - learning_rate: {learning_rate}")
        print(f"[MAIN] - device: {device}")

        train_start_time = time.time()
        train(model, dataloader_train, epochs=epochs, learning_rate=learning_rate, device=device)
        train_time = time.time() - train_start_time
        print(f"[MAIN] Training completed in {train_time:.2f} seconds")
        print(f"[MAIN] Average time per epoch: {train_time/epochs:.2f} seconds")

        if device.type == 'cuda':
            print(f"[MAIN] CUDA Memory after training: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
    except Exception as e:
        print(f"[MAIN] Error during training: {e}")
        return

    # Step 7: Evaluate the model
    print("\n[MAIN] Step 7: Evaluating the model")
    try:
        eval_start_time = time.time()
        evaluate(model, dataloader_val, device=device)
        eval_time = time.time() - eval_start_time
        print(f"[MAIN] Evaluation completed in {eval_time:.2f} seconds")
    except Exception as e:
        print(f"[MAIN] Error during evaluation: {e}")
        return

    # Step 8: Summarize execution (total is measured from the start of dataset loading)
    total_time = time.time() - dataset_start_time
    print("\n[MAIN] Execution summary:")
    print(f"[MAIN] Total execution time: {total_time:.2f} seconds")
    print(f"[MAIN] - Dataset loading: {dataset_time:.2f} seconds ({dataset_time/total_time*100:.1f}%)")
    print(f"[MAIN] - DataLoader creation: {dataloader_time:.2f} seconds ({dataloader_time/total_time*100:.1f}%)")
    print(f"[MAIN] - Model initialization: {model_time:.2f} seconds ({model_time/total_time*100:.1f}%)")
    print(f"[MAIN] - Training: {train_time:.2f} seconds ({train_time/total_time*100:.1f}%)")
    print(f"[MAIN] - Evaluation: {eval_time:.2f} seconds ({eval_time/total_time*100:.1f}%)")
    print(f"[MAIN] Process completed successfully at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    return model

# Notebook/script entry point: run the pipeline and persist the trained model.
if __name__ == "__main__":
    try:
        model = main()
        if model is None:
            # main() logs the failing step and returns None instead of raising,
            # so success must not be reported and nothing should be written.
            print("[MAIN] Run did not complete; no model was saved")
        else:
            print("[MAIN] Model training and evaluation completed successfully")
            # Save the model as pickle
            # NOTE(review): the pickle stores the whole model object; only load
            # this file from a trusted source.
            import pickle
            with open("asr_model.pkl", "wb") as f:
                pickle.dump(model, f)
    except Exception as e:
        print(f"[FATAL] Unhandled exception in main: {e}")
        import traceback
        print(traceback.format_exc())

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[COLLATE] Audio length statistics - Min: 43677, Max: 203421, Avg: 144489.80
[COLLATE] Final tensor shape: torch.Size([10, 203421, 1])
[COLLATE] Sample transcript: 'and ready at any instant to hazard great danger to...'
[TRAIN] Epoch 1, Batch 15/1179
[TRAIN] Input shape: torch.Size([10, 203421, 1])
[TRAIN] Created dummy targets with shape: torch.Size([10])
[TRAIN] Starting forward pass
[TRAIN] Forward pass complete, output shape: torch.Size([10, 10])
[TRAIN] Batch loss: 1.9411
[TRAIN] Starting backward pass
[TRAIN] Backward pass complete
[TRAIN] Batch processed in 64.77 seconds
[TRAIN] Epoch 1, Progress: 15/1179 batches, Avg Loss: 2.1108
[COLLATE] Processing batch of size 10
[COLLATE] Using 'audio.array' field for audio data
[COLLATE] Using 'spoken_text' field for text data
[COLLATE] Maximum sequence length in batch: 212125
[COLLATE] Audio length statistics - Min: 68253, Max: 212125, Avg: 153603.40
[COLLATE] Final tensor s

KeyboardInterrupt: 