In [1]:
import os
import glob
import random
import pandas as pd
import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from itertools import product
from sklearn.model_selection import train_test_split
import string
from IPython.display import Audio, display

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
word_to_digit = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9
}

def extract_number_from_transcription(transcription):

    transcription = transcription.translate(str.maketrans('', '', string.punctuation))
    transcription = transcription.strip().lower()

    if transcription.isdigit():
        return int(transcription)

    for word in transcription.split(): 
        if word in word_to_digit:
            return word_to_digit[word]

    return None

In [4]:
def load_data(data_dir):
    wav_files = glob.glob(f"{data_dir}/*.wav")
    data = []
    
    for wav_file in wav_files:
        label = os.path.basename(wav_file).split('_')[0]
        data.append((wav_file, label))
        
    return pd.DataFrame(data, columns=['wavfile', 'label'])

data_dir = '/kaggle/input/spoken-digits/recordings'
data = load_data(data_dir)

train_data, test_data = train_test_split(
    data, 
    test_size=0.2, 
    stratify=data['label']
)

train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [5]:
class AudioDataset(Dataset):
    def __init__(self, df, processor, target_sample_rate=16000):
        self.df = df
        self.processor = processor
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['wavfile']
        label = self.df.iloc[idx]['label']
        audio_data, sample_rate = torchaudio.load(audio_path)
        
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            audio_data = resampler(audio_data)
        
        audio_data = audio_data.squeeze().numpy()
        
        inputs = self.processor(audio_data, return_tensors="pt", sampling_rate=self.target_sample_rate)
        
        label_text = str(word_to_digit[label])
        return inputs.input_values.squeeze(0), label_text

In [6]:
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_speech_to_text")
model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_speech_to_text").to(device)

OSError: microsoft/speecht5_speech_to_text is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
def pre_dataloader(batch):
    audios, labels = zip(*batch)
    
    audio_features = torch.stack(audios)
    label_ids = processor(labels, return_tensors="pt", padding=True).input_ids
    
    return audio_features, label_ids

In [None]:
train_dataset = AudioDataset(train_data, processor)
test_dataset = AudioDataset(test_data, processor)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pre_dataloader)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=pre_dataloader)

In [None]:
def train_model(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    for inputs, labels in tqdm(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_values=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted_ids = torch.argmax(outputs.logits, dim=-1)
        predicted_texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        true_texts = [processor.decode(l, skip_special_tokens=True) for l in labels]

        for pred_text, true_text in zip(predicted_texts, true_texts):
            pred_digit = extract_number_from_transcription(pred_text)
            true_digit = extract_number_from_transcription(true_text)
            if pred_digit == true_digit:
                correct_predictions += 1
        total_samples += len(labels)

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_samples
    print(f"Training Loss: {avg_loss}, Training Accuracy: {accuracy * 100}")

    return accuracy * 100

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-4)

train_model(model, train_loader, optimizer, device)