In [None]:
from google.colab import drive

# Mount Google Drive so that files stored under MyDrive can be read through
# the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The goal of this assignment is to develop an Automatic Speech Recognition (ASR) system using a public speech dataset and a deep learning-based model. You will preprocess the dataset, train a speech-to-text model, and evaluate its performance.

In [None]:
import os

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

Load pre-trained model and processor

In [None]:
# Checkpoint identifier shared by the processor and the CTC model below.
model_name = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Put the model in evaluation mode for inference. The trailing semicolon stops
# the notebook from echoing the returned module's (very long) repr as the
# cell output.
model.eval();

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

Load and preprocess an audio file

In [None]:
# Function to load and process audio
def load_audio(file_path, sample_rate=None):
    """Return audio as a 1-D mono waveform sampled at 16 kHz.

    Args:
        file_path: Either something ``torchaudio.load`` can read (a path or
            file-like object) or an already-decoded ``torch.Tensor``
            waveform. A 2-D tensor is assumed to be laid out as
            ``(channels, frames)``, which is what ``torchaudio.load`` returns
            by default.
        sample_rate: Sampling rate in Hz of a tensor input. Ignored when
            ``file_path`` is read from disk, because the rate is then taken
            from the file itself. If omitted (or 0) for a tensor input, the
            tensor is assumed to already be at 16 kHz and is not resampled.

    Returns:
        torch.Tensor: 1-D waveform at 16 kHz.
    """
    target_rate = 16000

    if isinstance(file_path, torch.Tensor):
        waveform = file_path
    else:
        # Deliberately do NOT forward ``sample_rate``: the second positional
        # parameter of ``torchaudio.load`` is ``frame_offset``, so passing a
        # rate there would silently skip that many leading frames.
        waveform, sample_rate = torchaudio.load(file_path)

    # Resample if needed
    if sample_rate and sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Collapse the channel dimension (stereo -> mono; a single channel is
    # returned unchanged apart from losing the leading dimension).
    if waveform.dim() > 1:
        waveform = torch.mean(waveform, dim=0)

    # reshape(-1) rather than squeeze(): a one-sample clip stays 1-D instead of
    # degenerating into a 0-D scalar.
    return waveform.reshape(-1)

Convert speech to text

In [None]:
# Function to transcribe audio
def transcribe(audio_file):
    """Transcribe one utterance with the notebook's ``processor`` and ``model``.

    Args:
        audio_file: Path to an audio file, or an already-decoded
            ``torch.Tensor`` waveform. A tensor is assumed to be sampled at
            16 kHz already (for example the output of ``load_audio``); a 2-D
            tensor is assumed to be ``(channels, frames)`` and is averaged
            down to mono.

    Returns:
        str: The greedy (argmax) CTC decoding of the utterance, or an empty
        string when the waveform contains no samples.
    """
    if isinstance(audio_file, torch.Tensor):
        waveform = audio_file
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)
    else:
        # The sampling rate is taken from the file itself, so nothing needs to
        # be supplied here; a neutral 0 is passed instead of reading a
        # notebook-global ``sample_rate``.
        waveform = load_audio(audio_file, 0)

    waveform = waveform.reshape(-1)
    if waveform.numel() == 0:
        return ""

    # Hand the processor a plain 1-D float array; it adds the batch dimension.
    speech = waveform.detach().cpu().numpy()
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values

    # Single utterance -> batch of one: [1, sequence_length]
    input_values = input_values.reshape(1, -1).to(model.device)

    # Model inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

In [None]:
# Decode the sample clip from Drive; torchaudio returns the samples together
# with the file's sampling rate.
sample_clip_path = "/content/drive/MyDrive/download.wav"
waveform, sample_rate = torchaudio.load(sample_clip_path)


In [None]:
# Display the sampling rate that torchaudio reported for the loaded clip.
sample_rate

16000

In [None]:
# Average the channels into a single mono track. The dimension check keeps the
# cell idempotent: without it, re-running the cell would average the already
# 1-D waveform down to a single scalar.
if waveform.dim() > 1:
    waveform = torch.mean(waveform, dim=0)

In [None]:
# Preview the waveform with a leading dimension of size 1 added (the result is
# only displayed, not stored).
torch.unsqueeze(waveform, 0)

tensor([[0.0183, 0.0180, 0.0180,  ..., 0.0018, 0.0019, 0.0032]])

In [None]:
# Show the dimensions of the waveform tensor.
waveform.size()

torch.Size([54400])

Test with an audio sample

In [None]:
# Point this at the clip you want to transcribe.
audio_file_path = '/content/drive/MyDrive/download.wav'

text_output = transcribe(audio_file_path)
print(f"Transcription: {text_output}")

Transcription: CURIOSITY BESIDE ME AT THIS MOMENT


from torchaudio.datasets import LIBRISPEECH

# Create the download directory up front so fetching the archive does not
# depend on the folder already existing.
librispeech_root = "./content/LibriSpeech/"
os.makedirs(librispeech_root, exist_ok=True)
test_dataset = LIBRISPEECH(root=librispeech_root, url="test-clean", download=True)

# Evaluate on a subset (e.g., first 20 samples)
num_samples = 20
ground_truths = []
predictions = []

# Never index past the end of the dataset if num_samples is raised.
for i in range(min(num_samples, len(test_dataset))):
    waveform, sample_rate, transcript, *_ = test_dataset[i]

    # Process and transcribe
    waveform = load_audio(waveform, sample_rate)
    predicted_text = transcribe(waveform)

    # Store ground truth and predictions. Both sides are lower-cased so that a
    # difference in letter case alone is never counted as a word error.
    ground_truths.append(transcript.lower())
    predictions.append(predicted_text.lower())

    print(f"Sample {i+1}:")
    print(f"Ground Truth: {transcript}")
    print(f"Predicted: {predicted_text}")

# Compute Word Error Rate (WER): total word-level edit distance divided by the
# total number of reference words across the evaluated subset.
reference_words = [text.split() for text in ground_truths]
hypothesis_words = [text.split() for text in predictions]
word_errors = sum(
    torchaudio.functional.edit_distance(ref, hyp)
    for ref, hyp in zip(reference_words, hypothesis_words)
)
total_reference_words = sum(len(ref) for ref in reference_words)
wer = word_errors / total_reference_words if total_reference_words else 0.0
print(f"Word Error Rate (WER): {wer:.2%}")