In [3]:
%matplotlib inline

# Split long audio into special duration

This module can use for processing long audio split. You need modify the splice_
duration that you want for how long each audio clip, and modify output directory
you want and input directory you have.


In [4]:
# -*- coding: utf-8 -*-
""" Split long audio into special duration

This module can use for processing long audio split. You need modify the splice_
duration that you want for how long each audio clip, and modify output directory
you want and input directory you have.

################################################################################
# Author: Weikun Han <weikunhan@gmail.com>
# Crate Date: 02/20/2018        
# Update:
# Reference: https://github.com/jhetherly/EnglishSpeechUpsampler
################################################################################
"""

import os
import tqdm
import sox

# Please modify input path  to locate you file
DATASETS_ROOT_DIR = './datasets'
OUTPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 'TEDLIUM_5S')
NOISE_OUTPUT_DIR = os.path.join(DATASETS_ROOT_DIR, 
                               'TEDLIUM_noise_sample_5S')

# Please modify setting for splice duration
splice_duration = 5

# List sub-folder for datasets
input_folder = ['TEDLIUM', 'TEDLIUM_noise_sample']

# Check location to save datasets
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
if not os.path.exists(NOISE_OUTPUT_DIR):
    os.makedirs(NOISE_OUTPUT_DIR)

print('Will send spliced audio to {}'.format(OUTPUT_DIR))
print('Will send noise spliced audio to {}'.format(NOISE_OUTPUT_DIR))

# Loop over all files within the TEDLIUM directory first and 
# loop over all files within the TEDLIUM_noise_sample directory second
for directory in input_folder:
    input_tmp_path = os.path.join(DATASETS_ROOT_DIR, directory)
    
    if directory is 'TEDLIUM':
        output_tmp_path = OUTPUT_DIR
    elif directory is 'TEDLIUM_noise_sample':
        output_tmp_path = NOISE_OUTPUT_DIR
    
    for filename in os.listdir(input_tmp_path):
        input_filename = os.path.join(input_tmp_path, filename)

        # Check if path is an existing regular file
        if not os.path.isfile(input_filename):
            continue
        filename_base = os.path.splitext(filename)[0]

        # This is the total audio track duration
        duration = sox.file_info.duration(input_filename)

        # Computer number of iterations for splicing
        n_iterations = int(duration / splice_duration)

        # Computer number of digits need for name the new file
        digits_number = len(str(int(duration)))

        # Create number format depends on number digits
        format_number = '{{:0{}d}}'.format(digits_number)

        # Create final file format
        filename_template = '{{}}_{}-{}.wav'.format(format_number, format_number)

        print('On file {}'.format(filename_base))

        for i in tqdm.trange(n_iterations):

            # create trasnformer
            splice = sox.Transformer()
            begin = int(i * splice_duration)
            end = int(begin + splice_duration)
            output_filename = filename_template.format(filename_base,
                                                       begin, 
                                                       end)
            output_filename = os.path.join(output_tmp_path, output_filename)
            splice.trim(begin, end)
            splice.build(input_filename, output_filename)

 14%|█▍        | 24/173 [00:00<00:00, 239.15it/s]

Will send spliced audio to ./datasets/TEDLIUM_5S
Will send noise spliced audio to ./datasets/TEDLIUM_noise_sample_5S
On file AaronHuey_2010X


100%|██████████| 173/173 [00:00<00:00, 204.27it/s]
 33%|███▎      | 20/60 [00:00<00:00, 191.59it/s]

On file AbigailWashburn_2012U


100%|██████████| 60/60 [00:00<00:00, 196.83it/s]
 10%|█         | 21/201 [00:00<00:00, 200.90it/s]

On file AaronKoblin_2011


100%|██████████| 201/201 [00:01<00:00, 192.73it/s]
 10%|█         | 21/204 [00:00<00:00, 209.79it/s]

On file AbrahamVerghese_2011G


100%|██████████| 204/204 [00:00<00:00, 246.89it/s]
 30%|███       | 30/100 [00:00<00:00, 295.19it/s]

On file 911Mothers_2010W


100%|██████████| 100/100 [00:00<00:00, 286.42it/s]
 36%|███▌      | 27/76 [00:00<00:00, 262.81it/s]

On file AaronOConnell_2011


100%|██████████| 76/76 [00:00<00:00, 274.12it/s]
 38%|███▊      | 29/76 [00:00<00:00, 281.02it/s]

On file AaronOConnell_2011_noise_sample


100%|██████████| 76/76 [00:00<00:00, 286.74it/s]
 48%|████▊     | 29/60 [00:00<00:00, 285.62it/s]

On file AbigailWashburn_2012U_noise_sample


100%|██████████| 60/60 [00:00<00:00, 286.51it/s]
 13%|█▎        | 26/204 [00:00<00:00, 251.13it/s]

On file AbrahamVerghese_2011G_noise_sample


100%|██████████| 204/204 [00:00<00:00, 274.22it/s]
 32%|███▏      | 32/100 [00:00<00:00, 312.73it/s]

On file 911Mothers_2010W_noise_sample


100%|██████████| 100/100 [00:00<00:00, 315.53it/s]
 16%|█▌        | 32/201 [00:00<00:00, 313.57it/s]

On file AaronKoblin_2011_noise_sample


100%|██████████| 201/201 [00:00<00:00, 282.66it/s]
 14%|█▍        | 25/173 [00:00<00:00, 244.05it/s]

On file AaronHuey_2010X_noise_sample


100%|██████████| 173/173 [00:00<00:00, 215.65it/s]
