In [1]:
import tensorflow as tf
from tensorflow import keras
import scipy
import scipy.io.wavfile as wav
import numpy as np
import wave
import matplotlib.pyplot as plt
import noisereduce
import speechpy
import os
from scipy.cluster.vq import vq, kmeans2, kmeans, whiten
from scipy.spatial import distance
from tqdm.notebook import tqdm
from hmmlearn import hmm
import ruptures as rpt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import soundfile as sf
import sounddevice as sd
import pickle 
from IPython.display import display
import pandas as pd
import random
import pyaudio
from scipy.io.wavfile import write
from sklearn.utils import shuffle
import librosa
import IPython



In [2]:
testing_dir = os.getcwd()
testing_dir = os.path.join(testing_dir, '.\\TIMIT\\TEST')

training_dir = os.getcwd()
training_dir = os.path.join(training_dir, '.\\TIMIT\\TRAIN')



def load_phoneme_dataset(working_dir):
    phoneme_signals = {}
    dialect_folders = os.listdir(working_dir)
    for dialect_folder in tqdm(dialect_folders, leave=False):
        voice_folders = os.listdir(os.path.join(working_dir, dialect_folder))
        for voice_sample_folder in tqdm(voice_folders, leave=False):
            current_dir = os.path.join(working_dir, dialect_folder)
            current_dir = os.path.join(current_dir, voice_sample_folder)

            voice_samples = [file for file in os.listdir(current_dir) if file.endswith('.WAV')]
            for voice_sample in voice_samples:
                voice_sample_path = os.path.join(current_dir, voice_sample)
                signal, fs = sf.read(voice_sample_path)
#                 signal = np.zeros_like(signal)
#                 signal[40000] = 10
#                 print(signal.shape)
    
                # Pre-emphasize signal
                signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)

                # FFT
#                 frames = speechpy.processing.stack_frames(signal_preemphasized, sampling_frequency=fs, frame_length=0.02, frame_stride=0.01, zero_padding=True)
#                 fft = speechpy.processing.fft_spectrum(frames, 256)
    
#                 fig, ax = plt.subplots(nrows=2, ncols=1, constrained_layout=True)
#                 fig.supxlabel('Time')
#                 fig.set_size_inches(15, 10)
#                 ax[0].plot(signal_preemphasized)
#                 ax[0].set_xlim(0, signal_preemphasized.shape[0])
                
#                 fig.colorbar(ax[1].pcolor(fft.T))
                

                phoneme_transcript_path = os.path.join(current_dir, voice_sample)[:-3] + 'PHN'
                f = open(phoneme_transcript_path, encoding='utf-8')
                lines = f.readlines()
                for line in lines:
                    data = line.split(' ')
                    phoneme_start_idx = int(data[0])
                    phoneme_stop_idx = int(data[1])
                    phoneme = data[2].strip()
                    
#                     ax[0].axvline(x=phoneme_start_idx, color='red')
#                     ax[1].axvline(x=phoneme_start_idx/160, color='red')
                    
                    phoneme_signal_preemphasized = signal_preemphasized[int(phoneme_start_idx):int(phoneme_stop_idx)]

                
                    if phoneme in phoneme_signals:
#                         print(phoneme_fft)
                        phoneme_signals[phoneme].append(phoneme_signal_preemphasized)
                        
                    else:
                        phoneme_signals[phoneme] = []
                        phoneme_signals[phoneme].append(phoneme_signal_preemphasized)
#                     print(phoneme_signals[phoneme])
    return fs, phoneme_signals

try:
    training_phoneme_signals
except NameError:
    print('training phoneme signals hasn\'t been loaded yet')
    try:
        with open('training_phoneme_signals.pkl', 'rb') as handle:
            training_phoneme_signals = pickle.load(handle)
    except:
        print('training_phoneme_signals.pkl not found, start loading dataset')
        fs, training_phoneme_signals = load_phoneme_dataset(training_dir)
        with open('training_phoneme_signals.pkl', 'wb') as handle:
            pickle.dump(training_phoneme_signals, handle)
            
try:
    testing_phoneme_signals
except NameError:
    print('testing phoneme signals hasn\'t been loaded yet')
    try:
        with open('testing_phoneme_signals.pkl', 'rb') as handle:
            testing_phoneme_signals = pickle.load(handle)
    except:
        print('testing_phoneme_signals.pkl not found, start loading dataset')
        fs, testing_phoneme_signals = load_phoneme_dataset(testing_dir)
        with open('testing_phoneme_signals.pkl', 'wb') as handle:
            pickle.dump(testing_phoneme_signals, handle)

training phoneme signals hasn't been loaded yet
testing phoneme signals hasn't been loaded yet


In [3]:
training_phoneme_signals.keys()

dict_keys(['h#', 'sh', 'ix', 'hv', 'eh', 'dcl', 'jh', 'ih', 'd', 'ah', 'kcl', 'k', 's', 'ux', 'q', 'en', 'gcl', 'g', 'r', 'w', 'ao', 'epi', 'dx', 'axr', 'l', 'y', 'uh', 'n', 'ae', 'm', 'oy', 'ax', 'dh', 'tcl', 'iy', 'v', 'f', 't', 'pcl', 'ow', 'hh', 'ch', 'bcl', 'b', 'aa', 'em', 'ng', 'ay', 'th', 'ax-h', 'ey', 'p', 'aw', 'er', 'nx', 'z', 'el', 'uw', 'pau', 'zh', 'eng'])

In [4]:
keywords_list = [
    ['s', 's'],
    ['i', 'iy'],
    ['i', 'ih'],
    ['d', 'd'],
    ['h', 'hh'],
    ['e', 'eh'],
    ['e', 'ay'],
    ['ch', 'ch'],
    ['k', 'k'],
    ['n', 'n'],
    ['o', 'ow'],
    ['a', 'ah'],
    ['r', 'r'],
    ['v', 'v'],
    ['z', 'z'],
    ['silence', 'h#'],
]

for key in training_phoneme_signals.keys():
    found = False
    for phoneme_pair in keywords_list:
        if phoneme_pair[1] == key:
            found = True
        if found:
            break
    
    if not found:
        keywords_list.append(['other', key])
print(keywords_list, '\n')

unique_keywords = []
for phoneme_pair in keywords_list:
    if phoneme_pair[0] not in unique_keywords:
        unique_keywords.append(phoneme_pair[0])
print(unique_keywords)

[['s', 's'], ['i', 'iy'], ['i', 'ih'], ['d', 'd'], ['h', 'hh'], ['e', 'eh'], ['e', 'ay'], ['ch', 'ch'], ['k', 'k'], ['n', 'n'], ['o', 'ow'], ['a', 'ah'], ['r', 'r'], ['v', 'v'], ['z', 'z'], ['silence', 'h#'], ['other', 'sh'], ['other', 'ix'], ['other', 'hv'], ['other', 'dcl'], ['other', 'jh'], ['other', 'kcl'], ['other', 'ux'], ['other', 'q'], ['other', 'en'], ['other', 'gcl'], ['other', 'g'], ['other', 'w'], ['other', 'ao'], ['other', 'epi'], ['other', 'dx'], ['other', 'axr'], ['other', 'l'], ['other', 'y'], ['other', 'uh'], ['other', 'ae'], ['other', 'm'], ['other', 'oy'], ['other', 'ax'], ['other', 'dh'], ['other', 'tcl'], ['other', 'f'], ['other', 't'], ['other', 'pcl'], ['other', 'bcl'], ['other', 'b'], ['other', 'aa'], ['other', 'em'], ['other', 'ng'], ['other', 'th'], ['other', 'ax-h'], ['other', 'ey'], ['other', 'p'], ['other', 'aw'], ['other', 'er'], ['other', 'nx'], ['other', 'el'], ['other', 'uw'], ['other', 'pau'], ['other', 'zh'], ['other', 'eng']] 

['s', 'i', 'd', 'h', '

In [5]:
def plot_signal_array(signal_array):
    fig, ax = plt.subplots(nrows=1, ncols=1, constrained_layout=True)
    fig.supxlabel('Time')
    fig.set_size_inches(15, 5)
    ax.plot(signal_array)
    
def plot_pcolor(signal_array):
    fig, ax = plt.subplots(nrows=1, ncols=1, constrained_layout=True)
    fig.supxlabel('Time')
    fig.set_size_inches(15, 5)
    ax.pcolor(signal_array)

In [None]:
import datetime
testing_dir = os.getcwd()
testing_dir = os.path.join(testing_dir, '.\\TIMIT\\TEST')

training_dir = os.getcwd()
training_dir = os.path.join(training_dir, '.\\TIMIT\\TRAIN')



def create_phoneme_dataset(working_dir, prev_frames, future_frames):
    signals_list = []
    labels_list = []
    dialect_folders = os.listdir(working_dir)
    for dialect_folder in tqdm(dialect_folders, leave=False):
        voice_folders = os.listdir(os.path.join(working_dir, dialect_folder))
        for voice_sample_folder in tqdm(voice_folders, leave=False):
            current_dir = os.path.join(working_dir, dialect_folder)
            current_dir = os.path.join(current_dir, voice_sample_folder)

            voice_samples = [file for file in os.listdir(current_dir) if file.endswith('.WAV')]
            for voice_sample in voice_samples:
                voice_sample_path = os.path.join(current_dir, voice_sample)
                signal, fs = sf.read(voice_sample_path)

                # Pre-emphasize signal
#                 signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
            
#                 mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.025, frame_stride=0.01,
#                                  num_filters=40, fft_length=256, low_frequency=0, high_frequency=None)

                current_signals_list = []
                current_labels_list = []
                phoneme_transcript_path = os.path.join(current_dir, voice_sample)[:-3] + 'PHN'
                f = open(phoneme_transcript_path, encoding='utf-8')
                lines = f.readlines()
                for line in lines:
                    data = line.split(' ')
                    phoneme_start_idx = int(data[0])
                    phoneme_stop_idx = int(data[1])
                    phoneme = data[2].strip()
                    
                    label = np.zeros(len(unique_keywords))
                    for phoneme_pair in keywords_list:
                        if phoneme_pair[1] == phoneme:
                            phoneme = phoneme_pair[0]
                            break
                    label[unique_keywords.index(phoneme)] = 1
                    
                    for frame in signal[int(phoneme_start_idx):int(phoneme_stop_idx)]:
                        current_signals_list.append(frame)
                        current_labels_list.append(label)
                print(len(current_signals_list))
                for i in range(len(current_signals_list)):
                    if i >= prev_frames and i < len(current_signals_list) - future_frames:
#                         if i%100:
#                             filename = f'./local_recordings/{unique_keywords[current_labels_list[i].tolist().index(1)]}-{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.wav'
#                             sf.write(filename, current_signals_list[i - prev_frames: i + future_frames + 1], 16000)
                        signals_list.append(current_signals_list[i - prev_frames: i + future_frames + 1])
                        labels_list.append(unique_keywords[current_labels_list[i].tolist().index(1)])
                        
    return fs, signals_list, labels_list

fs, x_train_signal, y_train_signal = create_phoneme_dataset(testing_dir, 30*160, 10*160)

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

63440
58000
79520
56160
60000
52960
49520
56560
53680
39120
84400
66480
70640
73520
