In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write the
# files stored there (google.colab is only importable inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import torch
import librosa
import soundfile as sf
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

In [3]:
# Hugging Face checkpoint to run; later cells decide between the Whisper and
# Wav2Vec2 code paths by checking whether the name contains "whisper"
MODEL_NAME = "openai/whisper-large-v2"

# Use the GPU when the runtime exposes one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Choose the processor/model classes that match the checkpoint family
if "whisper" in MODEL_NAME:
    _processor_cls, _model_cls = WhisperProcessor, WhisperForConditionalGeneration
else:
    # Every non-Whisper checkpoint is loaded as a Wav2Vec2 CTC model
    _processor_cls, _model_cls = Wav2Vec2Processor, Wav2Vec2ForCTC

# Download (or reuse the cached) weights and move the model to the selected device
processor = _processor_cls.from_pretrained(MODEL_NAME)
model = _model_cls.from_pretrained(MODEL_NAME)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/4.29k [00:00<?, ?B/s]

In [5]:
def transcribe_audio(audio_data):
    """Transcribe one audio segment with the globally loaded model.

    Relies on the notebook-level ``MODEL_NAME``, ``processor``, ``model`` and
    ``device``. Checkpoints whose name contains "whisper" are decoded with
    ``model.generate``; every other checkpoint goes through a CTC forward pass
    followed by a greedy argmax over the logits.

    Args:
        audio_data: Waveform samples handed to the processor and declared to it
            as 16 kHz audio (the in-notebook caller passes slices of a
            ``librosa.load(..., sr=16000)`` result).

    Returns:
        The decoded text of the first (and only) item in the batch.
    """
    target_sr = 16000  # sample rate reported to the processor

    if "whisper" not in MODEL_NAME:
        # CTC path: forward pass without gradients, then pick the best token per frame
        encoded = processor(audio_data, return_tensors="pt", sampling_rate=target_sr)
        input_values = encoded.input_values.to(device)
        with torch.no_grad():
            logits = model(input_values).logits
        token_ids = logits.argmax(dim=-1)
        return processor.batch_decode(token_ids)[0]

    # Whisper path: the processor yields input features that feed generate()
    encoded = processor(audio_data, return_tensors="pt", sampling_rate=target_sr)
    features = encoded.input_features.to(device)
    generated_ids = model.generate(features)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [6]:
def transcribe_long_audio(audio_path, chunk_duration=30):
    """Transcribe an audio file of any length by processing it in fixed-size chunks.

    The file is loaded as 16 kHz audio, cut into consecutive windows of
    ``chunk_duration`` seconds, and each window is passed to
    ``transcribe_audio``.

    Args:
        audio_path: Path of the audio file to load with librosa.
        chunk_duration: Window length in seconds; must cover at least one
            sample. NOTE(review): the Whisper processor is understood to
            pad/truncate every input to 30 s, so values above 30 would
            presumably drop audio on the Whisper path - confirm before
            raising it.

    Returns:
        The per-chunk transcriptions, stripped of surrounding whitespace and
        joined with single spaces. A file with no samples yields "".

    Raises:
        ValueError: If ``chunk_duration`` is shorter than one sample.
    """
    speech, sr = librosa.load(audio_path, sr=16000)

    chunk_samples = int(chunk_duration * sr)
    if chunk_samples < 1:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    # Walk over sample offsets instead of using int(duration / chunk) + 1 chunks:
    # that count includes one extra, empty chunk whenever the audio length is an
    # exact multiple of chunk_duration (and for empty files), and the empty array
    # would then be sent to the model.
    pieces = []
    for start in range(0, len(speech), chunk_samples):
        text = transcribe_audio(speech[start:start + chunk_samples]).strip()
        if text:
            pieces.append(text)

    return " ".join(pieces)

In [7]:
# Path to the dataset JSON file on the mounted Google Drive; the notebook reads
# it as input and later overwrites it with the transcriptions added
DATASET_PATH = "/content/drive/MyDrive/processed_audio/dataset.json"

In [8]:
# Read the dataset manifest (JSON) from Drive
with open(DATASET_PATH, encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)

# Folder against which each entry's "audio" value is resolved when transcribing
AUDIO_FOLDER = "/content/drive/MyDrive/processed_audio/"

In [9]:
# Transcribe every entry of the dataset and store the result under "text"
for entry in dataset:
    audio_path = os.path.join(AUDIO_FOLDER, entry["audio"])
    print(f"Transcribing: {audio_path}")
    entry["text"] = transcribe_long_audio(audio_path)

# Save updated dataset.
# DATASET_PATH is both the input and the output, and opening it with "w" would
# truncate the only copy before any new data is written. Write to a sibling
# temp file first and swap it in only once the dump has completed, so an
# interrupted write cannot destroy the existing dataset.
tmp_path = DATASET_PATH + ".tmp"
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4)
os.replace(tmp_path, DATASET_PATH)

print("Transcription complete! Updated dataset saved.")

Transcribing: /content/drive/MyDrive/processed_audio/VRS_Audio_1.wav


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcribing: /content/drive/MyDrive/processed_audio/VRS_Audio_2.wav
Transcribing: /content/drive/MyDrive/processed_audio/VRS_Audio_3.wav
Transcribing: /content/drive/MyDrive/processed_audio/VRS_Audio_4.wav
Transcription complete! Updated dataset saved.
