In [None]:
!pip install django
!pip install install sox

In [None]:
from __future__ import print_function

import argparse
import io
import json
import os
import re
import shutil
import subprocess
import tarfile
import tempfile
from multiprocessing import Pool
from pathlib import Path
from typing import Optional

import sox
from django.urls import path
from six.moves import urllib
from tqdm import tqdm

In [None]:
def add_data_opts(parser):
    """Register the data-preparation command-line options on ``parser``.

    Adds ``--manifest-dir``, ``--min-duration`` and ``--max-duration`` to a
    "General Data Options" group, and ``--num-workers`` / ``--sample-rate``
    directly to the parser.

    Args:
        parser: the ``argparse.ArgumentParser`` to extend.

    Returns:
        The same parser instance, so calls can be chained.
    """
    data_opts = parser.add_argument_group("General Data Options")
    data_opts.add_argument('--manifest-dir', default='./', type=str,
                           help='Output directory for manifests')
    # Durations are parsed as float so sub-second bounds (e.g. 0.5) are accepted;
    # the defaults are unchanged.
    data_opts.add_argument('--min-duration', default=1, type=float,
                           help='Prunes training samples shorter than the min duration (given in seconds, default 1)')
    data_opts.add_argument('--max-duration', default=15, type=float,
                           help='Prunes training samples longer than the max duration (given in seconds, default 15)')
    parser.add_argument('--num-workers', default=4, type=int, help='Number of workers for processing data.')
    parser.add_argument('--sample-rate', default=16000, type=int, help='Sample rate')
    return parser
def create_manifest(
        data_path: str,
        output_name: str,
        manifest_path: str,
        num_workers: int,
        min_duration: Optional[float] = None,
        max_duration: Optional[float] = None,
        file_extension: str = "wav"):
    """Write a JSON manifest describing the audio files found under ``data_path``.

    Audio files are discovered recursively by extension, passed through
    ``order_and_prune_files`` and then listed relative to ``data_path``. Each
    entry is paired with a transcript path that has the same stem, a ``.txt``
    suffix, and lives in a ``txt`` directory that is a sibling of the audio
    file's parent directory.

    Args:
        data_path: root directory to scan; stored in the manifest as an absolute path.
        output_name: file name of the manifest.
        manifest_path: directory the manifest is written to (created if missing).
        num_workers: forwarded to ``order_and_prune_files``.
        min_duration: forwarded to ``order_and_prune_files``.
        max_duration: forwarded to ``order_and_prune_files``.
        file_extension: audio file extension to look for, without the dot.
    """
    data_path = os.path.abspath(data_path)
    audio_files = order_and_prune_files(
        file_paths=list(Path(data_path).rglob(f"*.{file_extension}")),
        min_duration=min_duration,
        max_duration=max_duration,
        num_workers=num_workers
    )

    manifest_file = Path(manifest_path) / output_name
    manifest_file.parent.mkdir(exist_ok=True, parents=True)

    samples = []
    for audio_file in tqdm(audio_files, total=len(audio_files)):
        relative_audio = audio_file.relative_to(data_path)
        transcript_name = relative_audio.with_suffix(".txt").name
        # ".../wav/x.wav" maps to ".../txt/x.txt"
        relative_transcript = relative_audio.parent.with_name("txt") / transcript_name
        samples.append({
            'wav_path': relative_audio.as_posix(),
            'transcript_path': relative_transcript.as_posix()
        })

    manifest_file.write_text(
        json.dumps({'root_path': data_path, 'samples': samples}),
        encoding='utf8'
    )

def _duration_file_path(path):
    """Return ``(path, duration)`` where duration comes from ``sox.file_info.duration``.

    Defined at module level so it can be handed to ``Pool.imap``.
    """
    duration = sox.file_info.duration(path)
    return path, duration


def order_and_prune_files(
        file_paths,
        min_duration,
        max_duration,
        num_workers):
    """Measure the duration of every file and drop those outside the given bounds.

    Durations are gathered in parallel with a process pool. A bound that is
    ``None`` or ``0`` is treated as "no limit" on that side, so a single bound
    can be supplied on its own. Files whose duration could not be determined
    are skipped, because they can neither be compared against the bounds nor
    counted in the total.

    Note: despite the function name and the "Sorting manifests..." message, the
    result is not sorted; the input order is preserved.

    Args:
        file_paths: sequence of audio file paths.
        min_duration: lower bound in seconds (inclusive), or ``None``/``0`` for none.
        max_duration: upper bound in seconds (inclusive), or ``None``/``0`` for none.
        num_workers: number of worker processes used to query durations.

    Returns:
        List of the paths that were kept, in their original order.
    """
    print("Gathering durations...")
    with Pool(processes=num_workers) as p:
        duration_file_paths = list(tqdm(p.imap(_duration_file_path, file_paths), total=len(file_paths)))
    print("Sorting manifests...")

    # NOTE(review): per the pysox docs, sox.file_info.duration returns None when
    # the duration is unavailable; such entries would break the comparison and sum below.
    measured = [(path, duration) for path, duration in duration_file_paths if duration is not None]
    unmeasured = len(duration_file_paths) - len(measured)
    if unmeasured:
        print(f"Skipping {unmeasured} file(s) whose duration could not be determined")
    duration_file_paths = measured

    # Each bound is applied independently; previously pruning only ran when
    # both bounds were truthy, so e.g. min_duration=0 silently disabled it.
    if min_duration or max_duration:
        lower = min_duration or 0
        upper = max_duration or float("inf")
        print("Pruning manifests between %g and %g seconds" % (lower, upper))
        duration_file_paths = [(path, duration) for path, duration in duration_file_paths if
                               lower <= duration <= upper]

    total_duration = sum(duration for _, duration in duration_file_paths)
    print(f"Total duration of split: {total_duration:.4f}s")
    return [path for path, _ in duration_file_paths]  # Remove durations

In [None]:
# Base URL of the VoxForge download directory; the path points at the English
# 16kHz_16bit archives. Archive file names are appended to it to form download URLs.
VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/en/Trunk/Audio/Main/16kHz_16bit/'

# Command-line style configuration: the shared data options plus the download target.
parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.')
parser = add_data_opts(parser)
parser.add_argument("--target-dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.")
# parse_known_args() collects unrecognised arguments in `unknown` instead of exiting on them.
# NOTE(review): presumably used instead of parse_args() because the notebook kernel
# is launched with its own argv entries — confirm.
args, unknown = parser.parse_known_args()

In [None]:
def _get_recordings_dir(sample_dir, recording_name):
    wav_dir = os.path.join(sample_dir, recording_name, "wav")
    if os.path.exists(wav_dir):
        return "wav", wav_dir
    flac_dir = os.path.join(sample_dir, recording_name, "flac")
    if os.path.exists(flac_dir):
        return "flac", flac_dir
    raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name))

In [None]:
def _extract_archive(content, dest_dir):
    """Unpack in-memory tar archive bytes into ``dest_dir``.

    Every member path is checked against ``dest_dir`` before anything is
    written, so a crafted archive cannot place files outside of it.

    Raises:
        ValueError: if a member would be extracted outside ``dest_dir``.
    """
    dest_root = os.path.realpath(dest_dir)
    with tarfile.open(fileobj=io.BytesIO(content)) as tar:
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, member_path]) != dest_root:
                raise ValueError("Refusing to extract {!r} outside of {}".format(member.name, dest_root))
        if hasattr(tarfile, "data_filter"):
            # Extraction filters (where this Python provides them) additionally
            # reject unsafe links and special files.
            tar.extractall(dest_root, filter="data")
        else:
            tar.extractall(dest_root)


def _read_prompts(prompt_file):
    """Parse a PROMPTS file into ``{first_token: rest_of_line}``, skipping blank lines."""
    transcriptions = {}
    with open(prompt_file, encoding="utf-8") as handle:
        for line in handle:
            tokens = line.split()
            if tokens:
                transcriptions[tokens[0]] = " ".join(tokens[1:])
    return transcriptions


def prepare_sample(recording_name, url, target_folder):
    """
    Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder.

    The archive at ``url`` is unpacked into a temporary directory that is
    removed again even when processing fails. For every audio file that has an
    entry in the archive's ``etc/PROMPTS`` file, the transcript is written to
    ``<target_folder>/txt/<recording_name>_<id>.txt`` and the audio to
    ``<target_folder>/wav/<recording_name>_<id>.wav``. Nothing is downloaded if
    ``wav`` already contains files for ``recording_name``.

    Args:
        recording_name: name of the archive without extension; also the name of
            the top-level directory expected inside the archive.
        url: location of the ``.tgz`` archive.
        target_folder: output directory; ``wav`` and ``txt`` are created inside it.

    Raises:
        Exception: if the archive contains neither a ``wav`` nor a ``flac`` directory.
        ValueError: if the archive contains a member that would escape the
            extraction directory.
    """
    wav_dir = os.path.join(target_folder, "wav")
    txt_dir = os.path.join(target_folder, "txt")
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(txt_dir, exist_ok=True)

    # check if sample is processed: output files are named "<recording_name>_<id>.wav"
    processed = {'_'.join(wav_file.split('_')[:-1]) for wav_file in os.listdir(wav_dir)}
    if recording_name in processed:
        return

    request = urllib.request.Request(url)
    with urllib.request.urlopen(request) as response:
        content = response.read()

    # TemporaryDirectory guarantees clean-up on every exit path.
    with tempfile.TemporaryDirectory() as dirpath:
        _extract_archive(content, dirpath)

        recordings_type, recordings_dir = _get_recordings_dir(dirpath, recording_name)
        tgz_prompt_file = os.path.join(dirpath, recording_name, "etc", "PROMPTS")
        if not os.path.exists(tgz_prompt_file):
            return

        transcriptions = _read_prompts(tgz_prompt_file)
        for wav_file in os.listdir(recordings_dir):
            recording_id = wav_file.split('.{}'.format(recordings_type))[0]
            transcription_key = recording_name + "/mfc/" + recording_id
            if transcription_key not in transcriptions:
                continue
            utterance = transcriptions[transcription_key]

            target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
            target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))

            with open(target_txt_file, "wb") as file:
                file.write(utterance.encode('utf-8'))

            original_wav_file = os.path.join(recordings_dir, wav_file)
            if recordings_type == "wav":
                shutil.copyfile(original_wav_file, target_wav_file)
            else:
                # A byte copy would leave FLAC data behind a ".wav" name, so
                # transcode to an actual WAV file instead.
                sox.Transformer().build(original_wav_file, target_wav_file)

In [None]:
if __name__ == '__main__':
    # Download every archive listed on the VoxForge index page into target_dir.
    target_dir = args.target_dir
    # Read from the options but not used by the statements in this block.
    sample_rate = args.sample_rate

    os.makedirs(target_dir, exist_ok=True)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    # The context manager closes the connection once the index page is read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # Raw string avoids invalid escape sequences; [^"]* keeps each match inside
    # a single href attribute.
    all_files = re.findall(r'href="([^"]*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        # Strip only the trailing ".tgz" to obtain the recording name.
        prepare_sample(f[:-len(".tgz")], VOXFORGE_URL_16kHz + f, target_dir)
   

In [None]:
# NOTE(review): wildcard import — it can rebind names defined earlier in the
# notebook, and nothing in this cell visibly requires it; confirm it is needed.
from fastai.imports import *

# Scan the downloaded dataset and write the training manifest.
print('Creating manifests...')
create_manifest(
    data_path=target_dir,  # set in the __main__-guarded download cell, which must have run first
    output_name='voxforge_train_manifest.json',
    manifest_path=args.manifest_dir,
    min_duration=args.min_duration,
    max_duration=args.max_duration,
    num_workers=args.num_workers
)