## Import Relevant Libraries

In [1]:
import os, multiprocessing
from tqdm import tqdm
from sphfile import SPHFile
from multiprocessing import Pool
import pandas as pd
from sph_to_wav import DirectoryManager, process_directory_manager_wv, process_directory_manager_et, process_directory_manager_sa, process_directory_manager_do

## Assign Directories

In [2]:
# Root of the TED-LIUM release 1 corpus. Every folder below is derived from it,
# so the notebook can be pointed at another checkout by setting the
# TEDLIUM_ROOT environment variable instead of editing nine separate paths.
dataset_root = os.environ.get(
    "TEDLIUM_ROOT",
    r"C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1",
)

# Path to Test Folder STM & SPH Folders
test_dataset_sph1 = os.path.join(dataset_root, "test", "sph")
test_dataset_stm1 = os.path.join(dataset_root, "test", "stm")

# Path to Dev Folder STM & SPH Folders
test_dataset_sph2 = os.path.join(dataset_root, "dev", "sph")
test_dataset_stm2 = os.path.join(dataset_root, "dev", "stm")

# Path to Train Folder STM & SPH Folders
training_dataset_sph = os.path.join(dataset_root, "train", "sph")
training_dataset_stm = os.path.join(dataset_root, "train", "stm")

# Paths to the wav folders where the SPH -> WAV conversions are stored
test_dataset_wav1 = os.path.join(dataset_root, "test", "wav")
test_dataset_wav2 = os.path.join(dataset_root, "dev", "wav")
# NOTE: despite its "test_" prefix this is the *train* wav folder; the name is
# kept because later cells refer to it.
test_dataset_wav3 = os.path.join(dataset_root, "train", "wav")

# Number of CPUs on Host Machine (used as the worker-pool size in later cells)
max_workers = multiprocessing.cpu_count()

## Define sph to wav converter for Train and Test Dataset

#### Convert SPH Files to WAV for the TED-LIUM 1 Test Folder

In [3]:
# Every entry of the test sph folder, as a full path. The listing is sorted
# because os.listdir returns entries in arbitrary order.
sph_files = [os.path.join(test_dataset_sph1, sph_file) for sph_file in sorted(os.listdir(test_dataset_sph1))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    # Run the Manager as a context manager so its server process is shut down
    # as soon as the cell finishes instead of lingering in the kernel.
    with multiprocessing.Manager() as manager:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(sph_file, folder_lock) for sph_file in sph_files]
        with Pool(max_workers) as pool:
            # pool.imap yields results lazily, so tqdm advances as each job
            # handled by process_directory_manager_wv completes.
            list(tqdm(pool.imap(process_directory_manager_wv, dm_jobs), total=len(dm_jobs)))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.49it/s]


#### Convert SPH Files to WAV for the TED-LIUM 1 Dev Folder

In [4]:
# Every entry of the dev sph folder, as a full path. The listing is sorted
# because os.listdir returns entries in arbitrary order.
sph_files = [os.path.join(test_dataset_sph2, sph_file) for sph_file in sorted(os.listdir(test_dataset_sph2))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    # Run the Manager as a context manager so its server process is shut down
    # as soon as the cell finishes instead of lingering in the kernel.
    with multiprocessing.Manager() as manager:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(sph_file, folder_lock) for sph_file in sph_files]
        with Pool(max_workers) as pool:
            # pool.imap yields results lazily, so tqdm advances as each job
            # handled by process_directory_manager_wv completes.
            list(tqdm(pool.imap(process_directory_manager_wv, dm_jobs), total=len(dm_jobs)))

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.69it/s]


#### Convert SPH Files to WAV for the TED-LIUM 1 Training Folder

In [5]:
# Every entry of the train sph folder, as a full path. The listing is sorted
# because os.listdir returns entries in arbitrary order.
sph_files = [os.path.join(training_dataset_sph, sph_file) for sph_file in sorted(os.listdir(training_dataset_sph))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    # Run the Manager as a context manager so its server process is shut down
    # as soon as the cell finishes instead of lingering in the kernel.
    with multiprocessing.Manager() as manager:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(sph_file, folder_lock) for sph_file in sph_files]
        with Pool(max_workers) as pool:
            # pool.imap yields results lazily, so tqdm advances as each job
            # handled by process_directory_manager_wv completes.
            list(tqdm(pool.imap(process_directory_manager_wv, dm_jobs), total=len(dm_jobs)))

100%|████████████████████████████████████████████████████████████████████████████████| 774/774 [01:07<00:00, 11.52it/s]


## Define method to convert stm to Transcript Dataframe using Multiprocessing

#### Process Transcript from Test and Dev Folder as Test dataset

In [8]:
# Full paths of the .stm transcripts in the test and dev folders. Sorting makes
# the row order of the resulting DataFrame reproducible (os.listdir order is
# arbitrary).
stm_files1 = [os.path.join(test_dataset_stm1, stm_file) for stm_file in sorted(os.listdir(test_dataset_stm1))]
stm_files2 = [os.path.join(test_dataset_stm2, stm_file) for stm_file in sorted(os.listdir(test_dataset_stm2))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    # One Manager and one Pool serve both folders; both are context-managed so
    # their helper processes are released when the work is done.
    with multiprocessing.Manager() as manager, Pool(max_workers) as pool:
        folder_lock = manager.Lock()

        # process_directory_manager_et is expected to return one DataFrame per
        # .stm file; the per-file frames are concatenated below.
        dm_jobs = [DirectoryManager(stm_file, folder_lock) for stm_file in stm_files1]
        result1 = list(tqdm(pool.imap(process_directory_manager_et, dm_jobs), total=len(dm_jobs)))

        dm_jobs = [DirectoryManager(stm_file, folder_lock) for stm_file in stm_files2]
        result2 = list(tqdm(pool.imap(process_directory_manager_et, dm_jobs), total=len(dm_jobs)))

    # Test and dev transcripts are merged into a single "test" table.
    transcript_df = pd.concat(result1 + result2, axis=0, ignore_index=True)
    transcript_df.to_parquet("test_transcripts.parquet", engine = 'pyarrow')
    # Preview the first 100 rows with all columns visible.
    with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
        print(transcript_df[:100])

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  9.29it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  8.16it/s]

      Start      End                           File  \
0     0.000   17.820    AimeeMullins_2009P_Segment1   
1    17.820   28.810    AimeeMullins_2009P_Segment2   
2    28.810   40.266    AimeeMullins_2009P_Segment3   
3    40.266   41.418    AimeeMullins_2009P_Segment4   
4    41.418   48.340    AimeeMullins_2009P_Segment5   
5    48.340   49.340    AimeeMullins_2009P_Segment6   
6    49.340   56.390    AimeeMullins_2009P_Segment7   
7    56.390   65.810    AimeeMullins_2009P_Segment8   
8    65.810   75.190    AimeeMullins_2009P_Segment9   
9    75.190   76.226   AimeeMullins_2009P_Segment10   
10   76.226   85.350   AimeeMullins_2009P_Segment11   
11   85.350   86.326   AimeeMullins_2009P_Segment12   
12   86.326   95.050   AimeeMullins_2009P_Segment13   
13   95.050  104.050   AimeeMullins_2009P_Segment14   
14  104.050  104.300   AimeeMullins_2009P_Segment15   
15  104.300  109.730   AimeeMullins_2009P_Segment16   
16  109.730  121.230   AimeeMullins_2009P_Segment17   
17  121.23




#### Process Transcript for Training data

In [10]:
# Full paths of the .stm transcripts in the train folder. Sorting makes the row
# order of the resulting DataFrame reproducible (os.listdir order is arbitrary).
stm_files = [os.path.join(training_dataset_stm, stm_file) for stm_file in sorted(os.listdir(training_dataset_stm))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    # Context-manage the Manager so its server process is shut down once the
    # transcripts have been extracted.
    with multiprocessing.Manager() as manager:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(stm_file, folder_lock) for stm_file in stm_files]
        with Pool(max_workers) as pool:
            # process_directory_manager_et is expected to return one DataFrame
            # per .stm file; the per-file frames are concatenated below.
            result = list(tqdm(pool.imap(process_directory_manager_et, dm_jobs), total=len(dm_jobs)))

    transcript_df = pd.concat(result, axis=0, ignore_index=True)
    transcript_df.to_parquet("train_transcripts.parquet", engine = 'pyarrow')
    # Preview the first 100 rows with all columns visible.
    with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
        print(transcript_df[:100])

100%|███████████████████████████████████████████████████████████████████████████████| 774/774 [00:06<00:00, 115.24it/s]


     Start     End                          File  \
0    16.13   24.16      AaronHuey_2010X_Segment1   
1    26.00   34.53      AaronHuey_2010X_Segment2   
2    48.62   58.78      AaronHuey_2010X_Segment3   
3    86.85   96.23      AaronHuey_2010X_Segment4   
4    96.97  110.59      AaronHuey_2010X_Segment5   
5   123.83  133.94      AaronHuey_2010X_Segment6   
6   134.46  142.02      AaronHuey_2010X_Segment7   
7   145.77  151.91      AaronHuey_2010X_Segment8   
8   218.35  222.27      AaronHuey_2010X_Segment9   
9   223.12  232.68     AaronHuey_2010X_Segment10   
10  233.24  240.06     AaronHuey_2010X_Segment11   
11  241.60  249.81     AaronHuey_2010X_Segment12   
12  287.68  298.71     AaronHuey_2010X_Segment13   
13  409.38  418.03     AaronHuey_2010X_Segment14   
14  429.39  434.99     AaronHuey_2010X_Segment15   
15  457.79  464.32     AaronHuey_2010X_Segment16   
16  465.45  479.00     AaronHuey_2010X_Segment17   
17  480.17  488.24     AaronHuey_2010X_Segment18   
18  488.70  

## Slice Audio Files into Segments Based on the Time Slices of the Transcript

In [11]:
# Full paths of the converted wav files in the test and dev folders (sorted,
# because os.listdir returns entries in arbitrary order).
wav_files1 = [os.path.join(test_dataset_wav1, wav_file) for wav_file in sorted(os.listdir(test_dataset_wav1))]
wav_files2 = [os.path.join(test_dataset_wav2, wav_file) for wav_file in sorted(os.listdir(test_dataset_wav2))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    transcript_df = pd.read_parquet("test_transcripts.parquet", engine = 'pyarrow')

    # One Manager and one Pool serve both folders; both are context-managed so
    # their helper processes are released when the work is done.
    with multiprocessing.Manager() as manager, Pool(max_workers) as pool:
        folder_lock = manager.Lock()
        for wav_files in (wav_files1, wav_files2):
            dm_jobs = [DirectoryManager(wav_file, folder_lock) for wav_file in wav_files]
            # Submit one task per wav file. Wrapping starmap_async(...).get()
            # in tqdm only started the bar after all work had finished; waiting
            # on the individual results lets the bar follow real progress.
            pending = [pool.apply_async(process_directory_manager_sa, (dm_job, transcript_df)) for dm_job in dm_jobs]
            for task in tqdm(pending, total=len(pending)):
                task.get()  # re-raises any exception from the worker

100%|██████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 6410.86it/s]


In [12]:
# Full paths of the converted wav files in the train folder (sorted, because
# os.listdir returns entries in arbitrary order).
wav_files3 = [os.path.join(test_dataset_wav3, wav_file) for wav_file in sorted(os.listdir(test_dataset_wav3))]

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    transcript_df = pd.read_parquet("train_transcripts.parquet", engine = 'pyarrow')

    # Context-manage the Manager and the Pool so their helper processes are
    # released when the work is done.
    with multiprocessing.Manager() as manager, Pool(max_workers) as pool:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(wav_file, folder_lock) for wav_file in wav_files3]
        # Submit one task per wav file. Wrapping starmap_async(...).get() in
        # tqdm only started the bar after all work had finished; waiting on the
        # individual results lets the bar follow real progress.
        pending = [pool.apply_async(process_directory_manager_sa, (dm_job, transcript_df)) for dm_job in dm_jobs]
        for task in tqdm(pending, total=len(pending)):
            task.get()  # re-raises any exception from the worker

100%|████████████████████████████████████████████████████████████████████████████████████████| 774/774 [00:00<?, ?it/s]


### Delete Redundant Audio Segments Based on Their Transcript Entries

In [78]:
# Transcript label of rows that must be excluded from the data set.
IGNORED_TRANSCRIPT = 'ignore_time_segment_in_scoring'
# Segments smaller than this many KB mostly hold poorly audible or inaudible
# words and are therefore treated as noise.
MIN_SEGMENT_KB = 100

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    transcript_df = pd.read_parquet("train_transcripts.parquet", engine = 'pyarrow')

    outliers = []              # segment paths queued for process_directory_manager_do
    missing_filenames = set()  # segments that have no wav file on disk
    wav_dir = os.path.join(os.path.dirname(test_dataset_wav3), 'wav_segmented')
    for file_name, transcript in tqdm(zip(transcript_df['File'], transcript_df['Transcript']), total=len(transcript_df)):
        wav_path = os.path.join(wav_dir, f'{file_name}.wav')
        has_audio = os.path.exists(wav_path)
        if not has_audio:
            # Remember rows without audio (e.g. removed by an earlier run of
            # this cell) so they are dropped from the transcript as well;
            # otherwise a re-run would silently write them back.
            missing_filenames.add(file_name)
        # Outliers are rows whose transcript is the ignore marker, or whose
        # segment file exists but is smaller than MIN_SEGMENT_KB.
        if transcript == IGNORED_TRANSCRIPT or (has_audio and os.path.getsize(wav_path) / 1024 < MIN_SEGMENT_KB):
            print(f'Added Path: {wav_path}')
            outliers.append(wav_path)

    # Context-manage the Manager so its server process is shut down afterwards.
    with multiprocessing.Manager() as manager:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(wav_file, folder_lock) for wav_file in outliers]
        if dm_jobs:  # do not spin up a worker pool when there is nothing to process
            with Pool(max_workers) as pool:
                list(tqdm(pool.imap(process_directory_manager_do, dm_jobs), total=len(dm_jobs)))

    outlier_filenames = [os.path.splitext(os.path.basename(wav_filepath))[0] for wav_filepath in outliers]
    excluded_filenames = set(outlier_filenames) | missing_filenames

    b_len = len(transcript_df)
    transcript_df = transcript_df[(transcript_df['Transcript'] != IGNORED_TRANSCRIPT) & (~transcript_df['File'].isin(list(excluded_filenames)))][['File','Transcript']]
    a_len = len(transcript_df)
    print(f'No of Rows Removed: {b_len - a_len}')
    transcript_df.reset_index(drop=True).to_parquet("train_ft_transcript.parquet", engine = 'pyarrow')

56803it [00:37, 1505.63it/s]
0it [00:00, ?it/s]


No of Rows Removed: 0


In [79]:
# Transcript label of rows that must be excluded from the data set.
IGNORED_TRANSCRIPT = 'ignore_time_segment_in_scoring'
# Segments smaller than this many KB mostly hold poorly audible or inaudible
# words and are therefore treated as noise.
MIN_SEGMENT_KB = 100

# Set up the worker pool and hand every job the same shared lock
if __name__ == "__main__":
    transcript_df = pd.read_parquet("test_transcripts.parquet", engine = 'pyarrow')

    outliers = []  # segment paths queued for process_directory_manager_do
    # Start from every segment name and discard those found in either folder;
    # what remains has no wav file on disk (e.g. removed by an earlier run of
    # this cell) and is dropped from the transcript too, so a re-run cannot
    # silently write those rows back.
    missing_filenames = set(transcript_df['File'])

    # The test transcript covers both the test and the dev talks, so the same
    # scan is applied to each segment folder in turn (test first, then dev).
    for source_wav_dir in (test_dataset_wav1, test_dataset_wav2):
        wav_dir = os.path.join(os.path.dirname(source_wav_dir), 'wav_segmented')
        for file_name, transcript in tqdm(zip(transcript_df['File'], transcript_df['Transcript']), total=len(transcript_df)):
            wav_path = os.path.join(wav_dir, f'{file_name}.wav')
            has_audio = os.path.exists(wav_path)
            if has_audio:
                missing_filenames.discard(file_name)
            # Outliers are rows whose transcript is the ignore marker, or whose
            # segment file exists but is smaller than MIN_SEGMENT_KB.
            # NOTE(review): ignore-marker rows are queued for both folders even
            # when the file is absent, exactly as before - confirm that
            # process_directory_manager_do tolerates missing files.
            if transcript == IGNORED_TRANSCRIPT or (has_audio and os.path.getsize(wav_path) / 1024 < MIN_SEGMENT_KB):
                print(f'Added Path: {wav_path}')
                outliers.append(wav_path)

    # Context-manage the Manager so its server process is shut down afterwards.
    with multiprocessing.Manager() as manager:
        folder_lock = manager.Lock()
        dm_jobs = [DirectoryManager(wav_file, folder_lock) for wav_file in outliers]
        if dm_jobs:  # do not spin up a worker pool when there is nothing to process
            with Pool(max_workers) as pool:
                list(tqdm(pool.imap(process_directory_manager_do, dm_jobs), total=len(dm_jobs)))

    outlier_filenames = [os.path.splitext(os.path.basename(wav_filepath))[0] for wav_filepath in outliers]
    excluded_filenames = set(outlier_filenames) | missing_filenames

    b_len = len(transcript_df)
    transcript_df = transcript_df[(transcript_df['Transcript'] != IGNORED_TRANSCRIPT) & (~transcript_df['File'].isin(list(excluded_filenames)))][['File','Transcript']]
    a_len = len(transcript_df)
    print(f'No of Rows Removed: {b_len - a_len}')

    transcript_df.reset_index(drop=True).to_parquet("test_ft_transcript.parquet", engine = 'pyarrow')

292it [00:00, 1314.72it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment1.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment4.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment6.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment10.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment12.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment15.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AimeeMullins_2009P_Segment23.wav
Added Path: C:\Users\Dell\Docu

869it [00:00, 2354.17it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment148.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment150.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment152.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment154.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment163.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment165.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\BillGates_2010_Segment174.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speec

1109it [00:00, 1930.37it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment133.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment135.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment140.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment145.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment151.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment156.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\DanielKahneman_2010_Segment163.wav
Added Path: C

1315it [00:00, 1625.05it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\JaneMcGonigal_2010_Segment63.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\JaneMcGonigal_2010_Segment101.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\JaneMcGonigal_2010_Segment113.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\MichaelSpecter_2010_Segment1.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\MichaelSpecter_2010_Segment15.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\MichaelSpecter_2010_Segment17.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\MichaelSpecter_2010_Segment26.wav
Added Path: C:\Users\D

2060it [00:01, 1978.31it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\TomWujec_2010U_Segment43.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AlGore_2009_Segment1.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AlGore_2009_Segment6.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AlGore_2009_Segment10.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AlGore_2009_Segment25.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AlGore_2009_Segment27.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\test\wav_segmented\AlGore_2009_Segment29.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLI


404it [00:00, 4014.36it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment1.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment4.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment6.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment10.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment12.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment15.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AimeeMullins_2009P_Segment23.wav
Added Path: C:\Users\Dell\Documents\G

1113it [00:00, 2711.58it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment348.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment350.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment352.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment354.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment356.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment358.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DanBarber_2010_Segment360.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recog

1394it [00:00, 2416.97it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\JamesCameron_2010_Segment104.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\JaneMcGonigal_2010_Segment1.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\JaneMcGonigal_2010_Segment15.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\JaneMcGonigal_2010_Segment63.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\JaneMcGonigal_2010_Segment101.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\JaneMcGonigal_2010_Segment113.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\MichaelSpecter_2010_Segment1.wav
Added Path: C:\Users\Dell\Documen

1643it [00:00, 2033.96it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AlGore_2009_Segment52.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AlGore_2009_Segment54.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AlGore_2009_Segment56.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AlGore_2009_Segment58.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\AlGore_2009_Segment62.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\BarrySchwartz_2005G_Segment1.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\BarrySchwartz_2005G_Segment18.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Datase

2060it [00:00, 2120.84it/s]

Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment16.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment19.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment21.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment23.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment25.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment27.wav
Added Path: C:\Users\Dell\Documents\GitHub\Speech Recognition\Dataset\TEDLIUM_release1\dev\wav_segmented\DavidMerrill_2009_Segment29.wav
Added Path: C:\Users\Dell\Documents\GitHu


796it [00:07, 113.14it/s]


No of Rows Removed: 398
