In [1]:
%matplotlib widget
from transformers import WhisperFeatureExtractor
import matplotlib.pyplot as plt
import numpy as np
import torch
import time
import os
import librosa
from tqdm.notebook import tqdm
from threading import Thread

In [2]:
def parse_alphadigit_phn(phn_path):
    label = []
    with open(phn_path, 'r') as f:
        for idx, phn_string in enumerate(f):
            if idx >= 2:
                components = phn_string.strip().split(' ')
                start = int(components[0])
                stop = int(components[1])
                phoneme = components[2]
                label.append({
                    'start': start,
                    'stop': stop,
                    'phoneme': phoneme
                })
    return label
def parse_alphadigit_txt(txt_path):
    label = []
    with open(txt_path, 'r') as f:
        return f.readline().strip().split(' ')
def get_alphadigit_data(folder_path):
    labeled_audio_list = []
    
    letter_folder = 'trans'
    audio_folder = 'speech'
    phonetic_folder = 'labels'

    count = 0
    audio_folder_path = os.path.join(folder_path, audio_folder)
    for subfolder in os.listdir(audio_folder_path):
        subfolder_path = os.path.join(audio_folder_path, subfolder)
        for audio_file in os.listdir(subfolder_path):
            count += 1
    print(count)
    pbar = tqdm(total=count)
    
    audio_folder_path = os.path.join(folder_path, audio_folder)
    for subfolder in os.listdir(audio_folder_path):
        subfolder_path = os.path.join(audio_folder_path, subfolder)
        for audio_file in os.listdir(subfolder_path):
            filename = audio_file[:-4]

            letter_path = os.path.join(folder_path, letter_folder, subfolder, filename + '.txt')
            phonetic_path = os.path.join(folder_path, phonetic_folder, subfolder, filename + '.phn')

            audio_path = os.path.join(folder_path, phonetic_folder, subfolder, audio_file)
            try:
                phonetic_label = parse_alphadigit_phn(phonetic_path)
                letter_label = parse_alphadigit_txt(letter_path)

                labeled_audio_list.append({
                    'name': filename,
                    'subfolder': subfolder,
                    'audio_path': audio_path,
                    'phonetic_label': phonetic_label,
                    'letter_label': letter_label,
                })
            except:
                print(f'Cannot find .phn or .txt for {filename}')
            pbar.update(1)
    return labeled_audio_list
    
labeled_audio_list = get_alphadigit_data('datasets/cslu_alphadigits_LDC2008S06/cslu_alphadigits')

78044


  0%|          | 0/78044 [00:00<?, ?it/s]

In [3]:
labeled_audio_list

[{'name': 'AD-1.getid',
  'subfolder': '0',
  'audio_path': 'datasets/cslu_alphadigits_LDC2008S06/cslu_alphadigits\\labels\\0\\AD-1.getid.wav',
  'phonetic_label': [{'start': 0, 'stop': 75, 'phoneme': '.pau'},
   {'start': 75, 'stop': 160, 'phoneme': 'w'},
   {'start': 160, 'stop': 210, 'phoneme': '^'},
   {'start': 210, 'stop': 520, 'phoneme': 'n'},
   {'start': 520, 'stop': 670, 'phoneme': 'z'},
   {'start': 670, 'stop': 880, 'phoneme': 'I'},
   {'start': 880, 'stop': 965, 'phoneme': '9r'},
   {'start': 965, 'stop': 1065, 'phoneme': 'oU'},
   {'start': 1065, 'stop': 1355, 'phoneme': '.pau'},
   {'start': 1355, 'stop': 1555, 'phoneme': 'ei'},
   {'start': 1555, 'stop': 1710, 'phoneme': 'tc'},
   {'start': 1710, 'stop': 1775, 'phoneme': 'th'},
   {'start': 1775, 'stop': 2960, 'phoneme': '.pau'}],
  'letter_label': ['one', 'zero', 'eight']},
 {'name': 'AD-1.p1',
  'subfolder': '0',
  'audio_path': 'datasets/cslu_alphadigits_LDC2008S06/cslu_alphadigits\\labels\\0\\AD-1.p1.wav',
  'phonet

In [4]:
def get_letter_labels(labeled_audio_list):
    letter_list = set()
    for labeled_audio in tqdm(labeled_audio_list):
        for letter in labeled_audio['letter_label']:
            letter_list.add(letter)
    return list(letter_list)

def get_phoneme_labels(labeled_audio_list):
    phoneme_list = set()
    for labeled_audio in tqdm(labeled_audio_list):
        for phoneme in labeled_audio['phonetic_label']:
            phoneme_list.add(phoneme['phoneme'])
    return list(phoneme_list)
letters = get_letter_labels(labeled_audio_list)
phonemes = get_phoneme_labels(labeled_audio_list)

  0%|          | 0/78044 [00:00<?, ?it/s]

  0%|          | 0/78044 [00:00<?, ?it/s]

In [11]:
len(phonemes)

59

In [32]:
from torch.utils.data import Dataset
class AudioDataset(Dataset):
    def __init__(self, labeled_audio_list=None):
        super().__init__()

        
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-tiny')

SyntaxError: incomplete input (4031102474.py, line 3)

In [26]:
# Import supporting scripts
from src.models import ModelUtils, WhisperForLetterClassification
from src.configs import WflcConfigs
from src.trainer import eval_model

In [29]:
wflc_tiny_config = WflcConfigs.get_config('wflc_tiny_phonetics')

In [30]:
wflc_tiny_config

{'model_name': 'WhisperForLetterClassification_Tiny',
 'pretrained_whisper': 'openai/whisper-tiny',
 'audio_config': {'sample_rate': 16000, 'window_length': 1500},
 'encoder': {'module_list': ['whisper.encoder']},
 'projector': {'module_list': ['whisper.projector']},
 'classifier': {'module_list': [{'name': 'Linear',
    'kwargs': {'in_features': 256, 'out_features': 59, 'bias': True}}]}}