### Step 0: Preprocess the audio files to a 16 kHz sampling rate

In [None]:
# Read and transform the German audio corpus
import pandas as pd
import numpy as np
import librosa
from io import BytesIO


# Step 1: Load one shard of the raw corpus from its .parquet file
audio_data = pd.read_parquet('data/train-00000-of-00084.parquet')

# Step 2: Decode each clip's bytes into a NumPy waveform sampled at 16000 Hz
desired_sample_rate = 16000
audio_arrays = []

for record in audio_data['audio']:
    raw_bytes, source_path = record['bytes'], record['path']

    # sr=None: decode at the clip's own rate so we only resample when it differs
    waveform, native_rate = librosa.load(BytesIO(raw_bytes), sr=None)
    if native_rate != desired_sample_rate:
        waveform = librosa.resample(
            y=waveform,
            orig_sr=native_rate,
            target_sr=desired_sample_rate,
        )

    audio_arrays.append({
        'path': source_path,
        'array': np.array(waveform),
        'sampling_rate': desired_sample_rate,
    })

# Step 3: Replace the binary 'audio' column with the decoded representation
# and rename the transcript column to 'labels'
audio_data = (
    audio_data
    .drop(columns='audio')
    .rename(columns={'transkription': 'labels'})
)
audio_data.insert(0, 'audio', audio_arrays)  # decoded audio dicts become the first column


# Step 4: Write the frame back to disk in consecutive batches of rows
batch_size = 5000
for i in range(0, len(audio_data), batch_size):
    batch = audio_data[i:i+batch_size]
    batch.to_parquet(f'data/german_{i//batch_size}.parquet')

### Step 1: Load preprocessed data

In [1]:
from datasets import Dataset, DatasetDict

# Step 1 + 2: Load the Parquet files and expose the 'audio' column as NumPy
# data, while still returning every other column untouched
train_dataset = Dataset.from_parquet(
    ['data/german_0.parquet', 'data/german_1.parquet']
).with_format("np", columns=["audio"], output_all_columns=True)

test_dataset = Dataset.from_parquet(
    'data/german_2.parquet'
).with_format("np", columns=["audio"], output_all_columns=True)

# Step 3: Bundle both splits into a single DatasetDict
dataset_ger = DatasetDict({'train': train_dataset, 'test': test_dataset})

### Step 4: load Whisper large-v3 model 

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, pipeline
from transformers import WhisperProcessor

# Use the first CUDA GPU with half precision when one is available;
# otherwise fall back to the CPU with full precision
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint to evaluate: the pre-trained model id, or the path of a
# fine-tuned model such as './whisper-finetuned-model/' (ideally .safetensors)
model_path = "openai/whisper-large-v3"

# Processor configured for German transcription
processor = WhisperProcessor.from_pretrained(model_path, language="german", task="transcribe")

# Load and initialize the model, then move it to the selected device
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_path,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# Speech-recognition pipeline built from the model and the processor's parts
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

### Step 5: run GPU inference latency test

In [None]:
from evaluate import load
import time
import pandas as pd


def get_predict_time(sample):
    """Transcribe one audio sample and time the pipeline call.

    Args:
        sample: Audio input, passed unchanged to the module-level ``pipe``.

    Returns:
        A ``(text, seconds)`` tuple: the ``'text'`` entry of the pipeline
        output and the elapsed time of the ``pipe`` call in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted, which would corrupt the latency measurement.
    start = time.perf_counter()
    prediction = pipe(sample, generate_kwargs={"language": "german"})
    elapsed = time.perf_counter() - start
    return prediction['text'], elapsed

def get_wer(predictions, references):
    """Return the word error rate (WER) of one transcript and the reference length.

    Args:
        predictions: List of words of the predicted transcript.
        references: List of words of the true transcript.

    Returns:
        A ``(wer_score, len_ref)`` tuple, where ``len_ref`` is the number of
        words in ``references``.

    Raises:
        ValueError: If ``references`` is empty; WER is a ratio over the
            number of reference words and is undefined in that case.
    """
    len_ref = len(references)
    if len_ref == 0:
        raise ValueError("references must contain at least one word to compute WER")

    # Score the transcript as ONE sentence pair. Passing every word as its own
    # "sentence" forced both lists to be truncated to equal length, which hid
    # inserted/deleted words and compared words purely by position, so a single
    # dropped word made every following word count as an error.
    wer_score = wer.compute(
        predictions=[' '.join(predictions)],
        references=[' '.join(references)],
    )
    return wer_score, len_ref
        

import os

# Placeholders for the per-sample results: latency (seconds per reference
# word) and word error rate
ls_latency = []
ls_wer = []
wer = load("wer")

# Iterate through the test data; each row is fetched only once instead of
# being looked up separately for the audio and for the label
for row in dataset_ger['test']:

    # Step 1: define current sample
    sample = row['audio']['array']
    sample_sentence = row['labels']

    # Step 2: track inference time and transcription of sample
    text, latency = get_predict_time(sample)

    # Step 3: calc error (WER) on the non-empty, space-separated words
    wer_score, len_ref = get_wer(
        predictions=[x for x in text.split(' ') if x],
        references=[x for x in sample_sentence.split(' ') if x],
    )

    # Step 4: store results, normalizing latency by the reference length
    time_per_token = latency / len_ref
    ls_latency.append(time_per_token)  # add current latency to list
    ls_wer.append(wer_score)

# torch.cuda.get_device_name only accepts CUDA devices, so label the run
# "cpu" when the CPU fallback was selected instead of crashing after all
# inference work has already been done
if str(device).startswith("cuda"):
    device_name = torch.cuda.get_device_name(device=device)
else:
    device_name = "cpu"

# Make sure the output directory exists before writing the results
os.makedirs('performance_results', exist_ok=True)
pd.DataFrame({f'latency {device_name} ft': ls_latency}).to_csv(f'performance_results/{device_name}_latency_ft.csv', index= False)
pd.DataFrame({f'WER {device_name} ft': ls_wer}).to_csv(f'performance_results/{device_name}_wer_ft.csv', index= False)