In [None]:
from google.colab import drive
# Mount Google Drive; the data, vocabulary and checkpoint paths used in this
# notebook all live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install --upgrade datasets
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install --upgrade evaluate datasets
!pip install --upgrade transformers huggingface_hub
!pip install pydub

**PREPROCESSING NEW DATA FOR MODEL EVALUATION**

In [None]:
import pandas as pd

# Evaluation manifest. Later cells read its "audio" (file name) and "sentence"
# (transcript) columns.
# NOTE(review): this CSV sits under MyDrive/Twi_ASR/evaluation, whereas the audio
# base path used further down is MyDrive/evaluation/eval_audio — confirm both
# locations are intended.
eval_test_df = pd.read_csv('/content/drive/MyDrive/Twi_ASR/evaluation/eval_dataset.csv')

In [None]:
from datasets import Dataset

# Convert the pandas DataFrame to a Hugging Face Dataset so the preprocessing
# functions below can be applied with Dataset.map().
eval_test = Dataset.from_pandas(eval_test_df)

In [None]:
# Optional one-off helper: convert an evaluation recording from FLAC to WAV.
# Kept as comments so it neither runs nor echoes a string as cell output;
# uncomment the lines below to use it.
#
# from pydub import AudioSegment
#
# # Load the source FLAC file
# source_file = "/content/drive/MyDrive/evaluation/eval_audio/eval_audio_2.flac"
# audio = AudioSegment.from_file(source_file, format="flac")  # Adjust format if necessary
#
# # Export as WAV
# audio.export("/content/drive/MyDrive/evaluation/eval_audio/eval_audio_2.wav", format="wav")
#
# print("Audio extracted successfully!")


In [None]:
import torchaudio
import os

# Directory on Drive that holds the evaluation audio files
base_path = "/content/drive/MyDrive/evaluation/eval_audio"

def speech_file_to_array_fn(batch):
    """Load one example's audio file and attach waveform, rate and transcript.

    The file named by ``batch["audio"]`` is resolved relative to ``base_path``
    and decoded with ``torchaudio.load``. The function then writes:

    * ``"speech"``        - element 0 of the loaded tensor, as a NumPy array
    * ``"sampling_rate"`` - the sampling rate reported by ``torchaudio.load``
    * ``"target_text"``   - a copy of ``batch["sentence"]``

    Returns the same ``batch`` object.
    """
    audio_file = os.path.join(base_path, batch["audio"])
    waveform, sample_rate = torchaudio.load(audio_file)

    # torchaudio returns the waveform channel-first by default, so index 0
    # selects the first channel only.
    first_channel = waveform[0]

    batch["speech"] = first_channel.numpy()
    batch["sampling_rate"] = sample_rate
    batch["target_text"] = batch["sentence"]
    return batch

In [None]:
# Decode every audio file. The columns present before the map are dropped, so
# only what speech_file_to_array_fn writes is kept.
eval_test = eval_test.map(
    speech_file_to_array_fn,
    remove_columns=eval_test.column_names,
    num_proc=1,
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
import librosa
import numpy as np

def resample_audio(batch, target_sr=16000):
    """
    Resample the audio stored in ``batch["speech"]`` to ``target_sr`` Hz.

    Args:
        batch: Mapping for a single example. It is only modified when it
            contains both ``"speech"`` (the waveform samples) and
            ``"sampling_rate"`` (the rate those samples were recorded at).
        target_sr: Desired sampling rate in Hz. Defaults to 16000, so
            ``Dataset.map(resample_audio)`` behaves as before; pass
            ``fn_kwargs={"target_sr": ...}`` to ``map`` to choose another rate.

    Returns:
        The same ``batch``. When both keys are present, ``"speech"`` is a NumPy
        array at ``target_sr`` and ``"sampling_rate"`` equals ``target_sr``;
        otherwise the batch is returned untouched.
    """
    # Examples that carry no audio are passed through unchanged.
    if "speech" not in batch or "sampling_rate" not in batch:
        return batch

    audio = np.asarray(batch["speech"])  # Convert speech to NumPy array
    orig_sr = batch["sampling_rate"]     # Original sampling rate

    # Only call the resampler when the rate actually has to change.
    if orig_sr != target_sr:
        audio = librosa.resample(y=audio, orig_sr=orig_sr, target_sr=target_sr)

    batch["speech"] = audio
    batch["sampling_rate"] = target_sr  # Update sampling rate
    return batch

In [None]:
# Bring every example to the 16 kHz default of resample_audio.
eval_test = eval_test.map(
    resample_audio,
    num_proc=1,
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
def prepare_dataset(batch):
    """Turn a batch of waveforms and transcripts into model inputs and labels.

    Meant to be used with ``Dataset.map(..., batched=True)``: each value in
    ``batch`` is treated as a list with one entry per example.

    Relies on the notebook-level ``processor``. In this notebook it is created
    in the "SETUP FOR MODEL EVALUATION" section, so that cell has to be run
    before this function is called.

    Args:
        batch: Mapping with ``"speech"`` (waveforms), ``"sampling_rate"``
            (one rate per waveform) and ``"target_text"`` (transcripts).

    Returns:
        The same ``batch`` with ``"input_values"`` (feature-extractor output)
        and ``"labels"`` (token ids of the transcripts) added.

    Raises:
        AssertionError: if any example's sampling rate differs from the rate
            the processor's feature extractor is configured for.
    """
    expected_sr = processor.feature_extractor.sampling_rate

    # check that all files have the sampling rate the feature extractor expects
    # (uniform rates alone are not enough - they must match the processor)
    assert set(batch["sampling_rate"]) == {expected_sr}, (
        f"Make sure all inputs have the same sampling rate of {expected_sr}."
    )

    batch["input_values"] = processor(batch["speech"], sampling_rate=expected_sr).input_values

    # `processor.as_target_processor()` is deprecated; transcripts are
    # tokenized through the `text` argument of the regular call instead.
    batch["labels"] = processor(text=batch["target_text"]).input_ids
    return batch

In [None]:
# Build model inputs and labels in batches of 8. prepare_dataset needs
# `processor` to exist already. The columns present before the map are dropped.
eval_test = eval_test.map(
    prepare_dataset,
    remove_columns=eval_test.column_names,
    batch_size=8,
    num_proc=1,
    batched=True,
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]



**ALREADY PROCESSED DATA FOR MODEL EVALUATION**

In [None]:
from datasets import load_from_disk

# Load a dataset that was saved to Drive earlier.
# Note: this rebinds `eval_test`, replacing whatever the preprocessing section
# above produced.
eval_test = load_from_disk("/content/drive/MyDrive/Twi_ASR/PD_21237:33102_2688:4033/train_dataset")

In [None]:
import pandas as pd

# Reference transcripts: rows 21237..33101 (positional) of the training CSV.
# NOTE(review): presumably the same range as the "21237:33102" in the saved
# dataset's folder name — confirm.
reference = pd.read_csv('/content/drive/MyDrive/Twi_ASR/train.csv').iloc[21237:33102]

In [None]:
from datasets import Dataset

# Convert the pandas DataFrame to a Hugging Face Dataset.
# Note: `reference` is rebound from a DataFrame to a Dataset here, so this cell
# cannot be re-run on its own without first re-running the cell that reads the CSV.
reference = Dataset.from_pandas(reference)

**SETUP FOR MODEL EVALUATION**

In [None]:
from transformers import Wav2Vec2CTCTokenizer

# CTC tokenizer built from the vocabulary file stored on Drive.
tokenizer = Wav2Vec2CTCTokenizer(
    "/content/drive/MyDrive/Twi_ASR/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor configured for a 16000 Hz sampling rate, with input
# normalisation and attention masks enabled.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [None]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor and tokenizer into one processor object.
# prepare_dataset (defined earlier in this notebook) and the inference cells
# below use this `processor`.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the fine-tuned checkpoint from Drive and move it to the GPU.
model = Wav2Vec2ForCTC.from_pretrained(
    "/content/drive/MyDrive/Twi_ASR/wav2vec2-large-xlsr-twi_PD/checkpoint-22260"
).to("cuda")

In [None]:
import torch

# Transcribe a single example (index 900) as a sanity check.
# The sampling rate is passed explicitly, which avoids the feature extractor's
# "strongly recommended to pass the sampling_rate argument" warning.
# NOTE(review): this assumes the stored input_values are already at the feature
# extractor's rate — confirm for datasets loaded from disk.
input_dict = processor(
    eval_test[900]["input_values"],
    sampling_rate=processor.feature_extractor.sampling_rate,
    return_tensors="pt",
    padding=True,
)

# Inference only: no autograd graph is needed, and the inputs follow the model
# to whichever device it was placed on.
with torch.no_grad():
    logits = model(input_dict.input_values.to(model.device)).logits

# Highest-scoring token id at every output position of the first batch item.
pred_ids = torch.argmax(logits, dim=-1)[0]

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [None]:
# Show the decoded prediction next to the lower-cased reference transcript
# for the same example index (900).
print("Prediction:", processor.decode(pred_ids), sep="\n")
print("\nReference:", reference[900]["sentence"].lower(), sep="\n")