# Installations

Install and unzip TIMIT dataset from google drive

In [None]:
! gdown 15Nq4PdOY7h8AP54ge3EurUPKa91jvezc

In [1]:
! unzip timit.zip -d ./timit/

'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import IPython.display as ipd
import librosa
import soundfile as sf

import sklearn
from tqdm import tqdm

from typing import List, Tuple, Set
%matplotlib inline

# Dataset

In [2]:
timit_path = "./timit/"
timit_data_path = "./timit/data/"

In [3]:
df_train = pd.read_csv(os.path.join(timit_path, 'train_data.csv'))
df_test = pd.read_csv(os.path.join(timit_path, 'test_data.csv'))
df = pd.concat([df_train, df_test])
df = df[df['is_converted_audio'] == False]

In [4]:
df_train = df[df['test_or_train'] == "TRAIN"]
df_test  = df[df['test_or_train'] == "TEST"]

In [5]:
df_train

Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
1,2.0,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN,False,False,False,True,False
2,3.0,TRAIN,DR4,MMDM0,SI1311.WRD,TRAIN/DR4/MMDM0/SI1311.WRD,TRAIN\\DR4\\MMDM0\\SI1311.WRD,False,False,True,False,False
3,4.0,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN,False,False,False,True,False
4,5.0,TRAIN,DR4,MMDM0,SX321.WRD,TRAIN/DR4/MMDM0/SX321.WRD,TRAIN\\DR4\\MMDM0\\SX321.WRD,False,False,True,False,False
5,6.0,TRAIN,DR4,MMDM0,SI681.TXT,TRAIN/DR4/MMDM0/SI681.TXT,TRAIN\\DR4\\MMDM0\\SI681.TXT,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
23093,23094.0,TRAIN,DR8,MRDM0,SX245.TXT,TRAIN/DR8/MRDM0/SX245.TXT,TRAIN\\DR8\\MRDM0\\SX245.TXT,False,False,False,,True
23094,23095.0,TRAIN,DR8,MRDM0,SI1044.PHN,TRAIN/DR8/MRDM0/SI1044.PHN,TRAIN\\DR8\\MRDM0\\SI1044.PHN,False,False,False,,False
23096,23097.0,TRAIN,DR8,MRDM0,SX245.WAV,TRAIN/DR8/MRDM0/SX245.WAV,TRAIN\\DR8\\MRDM0\\SX245.WAV,False,True,False,,False
23098,23099.0,TRAIN,DR8,MRDM0,SX335.WAV,TRAIN/DR8/MRDM0/SX335.WAV,TRAIN\\DR8\\MRDM0\\SX335.WAV,False,True,False,,False


In [6]:
df_test

Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
0,1.0,TEST,DR4,MGMM0,SX139.WAV,TEST/DR4/MGMM0/SX139.WAV,TEST\\DR4\\MGMM0\\SX139.WAV,False,True,False,False,False
2,3.0,TEST,DR4,MGMM0,SX139.TXT,TEST/DR4/MGMM0/SX139.TXT,TEST\\DR4\\MGMM0\\SX139.TXT,False,False,False,False,True
3,4.0,TEST,DR4,MGMM0,SI499.WRD,TEST/DR4/MGMM0/SI499.WRD,TEST\\DR4\\MGMM0\\SI499.WRD,False,False,True,False,False
4,5.0,TEST,DR4,MGMM0,SX319.WRD,TEST/DR4/MGMM0/SX319.WRD,TEST\\DR4\\MGMM0\\SX319.WRD,False,False,True,False,False
5,6.0,TEST,DR4,MGMM0,SX319.PHN,TEST/DR4/MGMM0/SX319.PHN,TEST\\DR4\\MGMM0\\SX319.PHN,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8394,8395.0,TEST,DR8,MPAM0,SX19.WAV,TEST/DR8/MPAM0/SX19.WAV,TEST\\DR8\\MPAM0\\SX19.WAV,False,True,False,False,False
8395,8396.0,TEST,DR8,MPAM0,SX109.TXT,TEST/DR8/MPAM0/SX109.TXT,TEST\\DR8\\MPAM0\\SX109.TXT,False,False,False,False,True
8397,8398.0,TEST,DR8,MPAM0,SX289.WRD,TEST/DR8/MPAM0/SX289.WRD,TEST\\DR8\\MPAM0\\SX289.WRD,False,False,True,False,False
8398,8399.0,TEST,DR8,MPAM0,SX109.WAV,TEST/DR8/MPAM0/SX109.WAV,TEST\\DR8\\MPAM0\\SX109.WAV,False,True,False,False,False


In [None]:
phon61_map39 = {
    'iy':'iy',  'ih':'ih',  'eh':'eh',  'ae':'ae',   'ix':'ih',  'ax':'ah',  'ah':'ah',   'uw':'uw',
    'ux':'uw',  'uh':'uh',  'ao':'aa',  'aa':'aa',   'ey':'ey',  'ay':'ay',  'oy':'oy',   'aw':'aw',
    'ow':'ow',  'l':'l',    'el':'l',   'r':'r',     'y':'y',    'w':'w',    'er':'er',   'axr':'er',
    'm':'m',    'em':'m',   'n':'n',    'nx':'n',    'en':'n',   'ng':'ng',  'eng':'ng',  'ch':'ch',
    'jh':'jh',  'dh':'dh',  'b':'b',    'd':'d',     'dx':'dx',  'g':'g',    'p':'p',     't':'t',
    'k':'k',    'z':'z',    'zh':'sh',  'v':'v',     'f':'f',    'th':'th',  's':'s',     'sh':'sh',
    'hh':'hh',  'hv':'hh',  'pcl':'h#', 'tcl':'h#',  'kcl':'h#', 'qcl':'h#', 'bcl':'h#',  'dcl':'h#',
    'gcl':'h#', 'h#':'h#',  '#h':'h#',  'pau':'h#',  'epi':'h#', 'nx':'n',   'ax-h':'ah', 'q':'h#'
}

PHONETIC_KEY: str = "phonetic_file"
AUDIO_KEY:    str = "audio_file"
WORD_KEY:     str = "word_file"


def convert_phon61_to_phon39(sentence: str) -> str:
    tokens = [phon61_map39[x] for x in sentence.split()]
    return " ".join(tokens)


def read_text_file(filepath: str) -> str:
    with open(filepath) as f:
        tokens = [line.split()[-1] for line in f]
        return " ".join(tokens)


def merge_item_data(df: pd.DataFrame) -> dict:
    data = {}

    for idx, row in tqdm(df.iterrows()):
        path = row['path_from_data_dir']
        entry_id = path.split('.')[0]
        entry_id = "-".join(entry_id.split('/')[-2:])

        if entry_id not in data:
            data[entry_id] = {}

        if row['is_audio'] is True:
            data[entry_id][AUDIO_KEY] = os.path.join(timit_data_path, path)
        elif row['is_word_file'] is True:
            data[entry_id][WORD_KEY] = read_text_file(os.path.join(timit_data_path, path))
        elif row['is_phonetic_file'] is True:
            phoneme_sentence = read_text_file(os.path.join(timit_data_path, path))
            data[entry_id][PHONETIC_KEY] = convert_phon61_to_phon39(phoneme_sentence)
    return data


def transform_dataset(df: pd.DataFrame) -> dict[str]:
    merged_data = merge_item_data(df)

    phonetic_files: List[str] = []
    audio_files:    List[str] = []
    word_files:     List[str] = []
    keys_set:       Set[str]  = set([PHONETIC_KEY, AUDIO_KEY, WORD_KEY])

    for key, value in merged_data.items():
        if set(value) == keys_set:
            phonetic_files.append(value[PHONETIC_KEY])
            audio_files.append(value[AUDIO_KEY])
            word_files.append(value[WORD_KEY])

    return {
        AUDIO_KEY: audio_files,
        WORD_KEY: word_files,
        PHONETIC_KEY: phonetic_files
    }

In [8]:
train = transform_dataset(df_train)
test = transform_dataset(df_test)

18480it [00:02, 6708.01it/s]
6720it [00:01, 5233.71it/s]


In [9]:
test.keys()

dict_keys(['audio_file', 'word_file', 'phonetic_file'])