In [1]:
# Checkpoint identifier passed to the Whisper `from_pretrained` calls below.
MODEL_ID = "openai/whisper-tiny"
# Text file holding the access token used when downloading the dataset.
ACCESS_TOKEN_FILE = "access_token.txt"
# Directory the resampled .wav clips are written to (deleted and recreated on every run).
AUDIO_SAVE_DIR = "audios"
# Number of train-split examples to sample and transcribe.
NUM_SAMPLES = 20000

In [2]:
import uuid
from pathlib import Path
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa
import soundfile as sf
import torch
from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer, WhisperForConditionalGeneration, pipeline
from renumics import spotlight

In [3]:
# Read the access token for downloading the dataset.
# Strip surrounding whitespace: text files usually end with a newline, and
# read_text() would otherwise hand that newline on as part of the token.
access_token = Path(ACCESS_TOKEN_FILE).read_text().strip()

In [4]:
# Load the English ("en") configuration of Common Voice 13.0, authenticating
# with the token read above. streaming=False requests a regular (non-streaming)
# dataset, which the later len() / select() calls rely on.
cv_13 = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    "en",
    use_auth_token=access_token,
    streaming=False,
)

Found cached dataset common_voice_13_0 (/home/daniel/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/en/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7)


  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Draw the row indices of the train-split examples to transcribe.
#
# np.random.choice samples WITH replacement by default, which silently puts
# duplicate clips into the subset. Sampling with replace=False guarantees
# distinct rows, and a seeded generator makes the subset reproducible after a
# kernel restart.
SAMPLING_SEED = 42
train_length = len(cv_13["train"])
rng = np.random.default_rng(SAMPLING_SEED)
# Clamp the sample size: replace=False cannot draw more rows than exist.
train_indices = rng.choice(train_length, size=min(NUM_SAMPLES, train_length), replace=False)

In [6]:
# Restrict the train split to the sampled rows; `cv` is what the inference loop iterates over.
cv = cv_13["train"].select(train_indices)

In [7]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Feature extractor, tokenizer and model all come from the same checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_ID)
tokenizer = WhisperTokenizer.from_pretrained(
    MODEL_ID,
    language="en",
    task="transcribe",
)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(device)

# Force the decoder prompt derived from the tokenizer settings above
# (language "en", task "transcribe") and mirror those settings on the config.
model.config.forced_decoder_ids = tokenizer.get_decoder_prompt_ids()
model.config.language = "<|en|>"
model.config.task = "transcribe"

# Bundle everything into a speech-recognition pipeline used by the inference loop.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    device=device,
)

In [8]:
# Metadata columns copied unchanged from each dataset example into the output record.
keys_to_save = ["sentence", "up_votes", "down_votes", "age", "gender", "accent", "locale", "segment", "variant"]

# Sampling rate every clip is resampled to before inference and before saving.
# NOTE(review): assumed to match the rate the Whisper feature extractor expects — confirm.
TARGET_SAMPLING_RATE = 16000

# Always start from an empty output directory so clips from earlier runs
# (which get fresh random names each time) do not accumulate.
audio_save_dir = Path(AUDIO_SAVE_DIR)
if audio_save_dir.is_dir():
    shutil.rmtree(audio_save_dir)
audio_save_dir.mkdir(parents=True)

# Transcribe every sampled clip and collect one record (metadata, prediction,
# path of the saved audio) per example.
data = []
for s in tqdm(cv):
    new_audio = librosa.resample(
        s["audio"]["array"],
        orig_sr=s["audio"]["sampling_rate"],
        target_sr=TARGET_SAMPLING_RATE,
    )

    cur_data = {k: s[k] for k in keys_to_save}
    cur_data["prediction"] = pipe(new_audio)["text"]

    # A random UUID gives each clip a unique file name.
    target_path = audio_save_dir / f"{uuid.uuid4()}.wav"
    # Store the path as a plain string: pathlib.Path objects in the DataFrame
    # make the later to_json() call fail with an overflow error.
    cur_data["audio"] = str(target_path)
    sf.write(target_path, new_audio, TARGET_SAMPLING_RATE)
    data.append(cur_data)

100%|█████████████████████████████████████| 20000/20000 [44:09<00:00,  7.55it/s]


In [9]:
# Build the results table in one go from the list of per-example record dicts.
df = pd.DataFrame(data)

In [32]:
# Make sure the audio paths are plain strings before exporting: path objects in
# this column otherwise cause an overflow while serializing to JSON.
df["audio"] = df["audio"].astype("string")

In [34]:
# Persist the predictions to predictions.json in the working directory,
# serialized row by row (orient="records").
df.to_json("predictions.json", orient="records")