In [69]:
import os
import soundfile as sf     # for reading audio files
import speechpy           # for speech processing
from tqdm.notebook import tqdm     # for progress bars
import numpy as np        # for numerical operations
import matplotlib.pyplot as plt   # for plotting

# Set directories for training and testing data.
# The paths are assembled from individual components so the separator is right on
# every OS; a literal '.\\TIMIT\\TEST' only resolves on Windows.
testing_dir = os.path.join(os.getcwd(), 'TIMIT', 'TEST')
training_dir = os.path.join(os.getcwd(), 'TIMIT', 'TRAIN')

def get_raw_phoneme_dataset(working_dir):
    """Cut every recording under ``working_dir`` into per-phoneme segments.

    The directory is walked two levels deep
    (``working_dir/<dialect>/<speaker>/``). For each ``*.WAV`` file found
    there, the sibling ``.PHN`` transcript is read; each non-blank line is
    expected to hold ``<start> <stop> <phoneme>`` where start/stop are
    indices into the decoded signal.

    Parameters
    ----------
    working_dir : str
        Root folder containing the dialect folders.

    Returns
    -------
    tuple
        ``(signals, labels)``: ``signals`` is a 1-D ``dtype=object`` array
        with one segment array per phoneme, ``labels`` is the array of the
        matching phoneme strings (same order).

    Raises
    ------
    FileNotFoundError
        If a ``.WAV`` file has no matching ``.PHN`` transcript.
    """
    phoneme_signals = []     # list for storing phoneme signals
    phoneme_labels = []      # list for storing phoneme labels

    # Keep only directories (stray files would make os.listdir fail below) and
    # sort them because os.listdir returns entries in arbitrary order.
    dialect_folders = sorted(
        entry for entry in os.listdir(working_dir)
        if os.path.isdir(os.path.join(working_dir, entry))
    )
    for dialect_folder in tqdm(dialect_folders):
        dialect_dir = os.path.join(working_dir, dialect_folder)
        for voice_sample_folder in sorted(os.listdir(dialect_dir)):
            current_dir = os.path.join(dialect_dir, voice_sample_folder)
            if not os.path.isdir(current_dir):
                continue

            voice_samples = sorted(
                file for file in os.listdir(current_dir) if file.endswith('.WAV')
            )
            for voice_sample in voice_samples:
                voice_sample_path = os.path.join(current_dir, voice_sample)
                signal, _ = sf.read(voice_sample_path)

                # Uncomment below to pre-emphasize the signal
                # signal = speechpy.processing.preemphasis(signal, cof=0.98)

                phoneme_transcript_path = os.path.splitext(voice_sample_path)[0] + '.PHN'
                # The context manager closes the transcript even if parsing fails
                with open(phoneme_transcript_path, encoding='utf-8') as transcript:
                    for line in transcript:
                        fields = line.split()
                        if len(fields) < 3:
                            # Blank or truncated line: nothing to slice
                            continue
                        phoneme_start_idx = int(fields[0])
                        phoneme_stop_idx = int(fields[1])

                        # Add phoneme signal and label to list
                        phoneme_signals.append(
                            np.asarray(signal[phoneme_start_idx:phoneme_stop_idx])
                        )
                        phoneme_labels.append(fields[2])

    # Fill a pre-sized object array element by element: np.asarray(list, dtype=object)
    # silently produces a 2-D array when every segment happens to have the same length.
    signals_array = np.empty(len(phoneme_signals), dtype=object)
    for position, segment in enumerate(phoneme_signals):
        signals_array[position] = segment
    return signals_array, np.asarray(phoneme_labels)
        
# Load training and testing phoneme signals.
# Each call walks one directory tree with get_raw_phoneme_dataset and returns a
# (signals, labels) pair; the training tree is processed first, then the testing tree.
print("Loading Training Dataset")
training_signals, training_labels = get_raw_phoneme_dataset(training_dir)
print("Loading Testing Dataset")
testing_signals, testing_labels = get_raw_phoneme_dataset(testing_dir)


Loading Training Dataset


  0%|          | 0/8 [00:00<?, ?it/s]

Loading Testing Dataset


  0%|          | 0/8 [00:00<?, ?it/s]

In [70]:
# Make sure the output folder exists first: np.save opens the target path for
# writing and fails with FileNotFoundError when the directory is missing.
os.makedirs('./dataset', exist_ok=True)

# Saving Training Dataset to File
# NOTE: the signal arrays are built with dtype=object, so np.save pickles them and
# reading them back requires np.load(..., allow_pickle=True).
np.save('./dataset/training_signals.npy', training_signals)
np.save('./dataset/training_labels.npy', training_labels)

# Saving Testing Dataset to File
np.save('./dataset/testing_signals.npy', testing_signals)
np.save('./dataset/testing_labels.npy', testing_labels)

In [71]:
# Maps phoneme symbols onto a reduced symbol set.
# Each row is (target symbol, source phonemes); the row order fixes the key order
# of the resulting dict.
# NOTE(review): presumably the keys are TIMIT phoneme labels; the table is not
# used in the cells visible here, so confirm against its consumer.
new_phenome_lookup = {
    source: target
    for target, sources in (
        ('i', ('iy', 'ih')),
        ('s', ('s',)),
        ('d', ('d',)),
        ('e', ('eh', 'ey')),
        ('ch', ('ch',)),
        ('h', ('hh',)),
        ('k', ('k',)),
        ('o', ('ow',)),
        ('a', ('ae', 'aa', 'ah', 'ao', 'oy')),
        ('r', ('er',)),
        ('v', ('v', 'w')),
        ('z', ('z',)),
    )
    for source in sources
}

# Two lists of equal length.
# NOTE(review): presumably alphabet_readings[i] is the letter pronounced as the
# phoneme sequence alphabet_labels[i] -- confirm with the code that consumes them.
alphabet_labels = ['s iy', 's ih']
alphabet_readings = ['c', 'c']

In [72]:
import pandas as pd

# Excel workbook passed to get_readings_list, which reads its `key` and `reading` columns.
readings_path = './dataset/reading.xlsx'
def get_readings_list(readings_path):
    """Read the readings workbook into two parallel lists.

    The workbook must provide a ``key`` column and a ``reading`` column,
    where ``reading`` is a whitespace-separated phoneme sequence.

    Parameters
    ----------
    readings_path : str
        Path of the Excel file, forwarded to ``pd.read_excel``.

    Returns
    -------
    tuple
        ``(readings_list, readings_labels)``: ``readings_list[i]`` is the
        list of phonemes of row ``i`` and ``readings_labels[i]`` is its key.
        Rows with an empty ``key`` or ``reading`` cell are skipped.
    """
    readings = pd.read_excel(readings_path)
    # An empty cell is read as NaN, which has no .split(); drop such rows up front
    readings = readings.dropna(subset=['key', 'reading'])

    readings_list = []
    readings_labels = []
    # Pair the two columns positionally (no dependence on the index labels) and
    # give tqdm the row count, since a zip object has no length of its own.
    rows = zip(readings['key'], readings['reading'])
    for key, reading in tqdm(rows, total=len(readings)):
        # split() without an argument collapses repeated spaces, so no '' phonemes
        readings_list.append(reading.split())
        readings_labels.append(key)
    return readings_list, readings_labels
        
        
# Parse the workbook: readings_list receives the split phoneme sequences and
# readings_label (singular) the matching values of the `key` column.
readings_list, readings_label = get_readings_list(readings_path)

0it [00:00, ?it/s]

In [73]:
def generate_dataset(readings_list, readings_label, signals, labels):
    """Group the labelled signals whose phoneme occurs in ``readings_list``.

    The function is meant to stitch audio samples together following the
    phoneme sequences in ``readings_list`` and return ``signals, labels``.
    That stitching step is not implemented yet: the function currently only
    prints diagnostics and returns ``None``.

    Parameters
    ----------
    readings_list : iterable of sequences of phoneme strings.
    readings_label : currently unused.
    signals : indexable collection; ``signals[i]`` is the signal for ``labels[i]``.
    labels : iterable of phoneme labels.
    """
    # Collect each distinct phoneme once, in order of first appearance
    print("Finding all unique phonemes presented in readings_list")
    phoneme_set = []
    for sequence in tqdm(readings_list):
        for symbol in sequence:
            if symbol in phoneme_set:
                continue
            phoneme_set.append(symbol)
    print(f'Unique phonemes  : {phoneme_set}')

    # Bucket the signals by label, keeping only the phonemes found above
    print(labels)
    samples_by_phoneme = {}
    for position, label in enumerate(tqdm(labels)):
        sample = signals[position]
        if label not in phoneme_set:
            continue
        samples_by_phoneme.setdefault(label, []).append(sample)

    # Convert each bucket to an object array and report the type of its first entry
    for label, samples in samples_by_phoneme.items():
        packed = np.asarray(samples, dtype=object)
        samples_by_phoneme[label] = packed
        print(type(packed[0]))

    # TODO: concatenate samples per reading in readings_list and return the result.
# The keys returned by get_readings_list are bound to `readings_label` (singular) at
# notebook scope; `readings_labels` only exists inside that function, so using it
# here raises NameError after a kernel restart.
generate_dataset(readings_list, readings_label, training_signals, training_labels)
        

Finding all unique phonemes presented in readings_list


  0%|          | 0/8 [00:00<?, ?it/s]

Unique phonemes  : ['s', 'iy', 'ih', 'd', 'eh', 'ch', 'ey', 'hh', 'ay']
['h#' 'sh' 'ix' ... 'p' 's' 'h#']


  0%|          | 0/177080 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
