# Get the LibriSpeech Clean Dataset

For this project, we use ~500 hours of 'clean' speech from the LibriSpeech corpus for training. This includes the following subsets:
1. train-clean-360
2. train-clean-100
3. dev-clean
4. test-clean

These were downloaded and merged into a single folder, saved here as the `training-clean` folder, which contains 1252 first-level folders. Each of these folders contains the audio files for a particular speaker. The audio files are in .flac format.

We then create a vocabulary from the text transcripts.

In [3]:
import sys
import glob
import os
import pandas as pd
import numpy as np
import librosa
import collections
import matplotlib.pyplot as plt
import random

# Make the local `utils` module importable: take sys.path[0], drop everything
# from the last "/training-clean" onward, and add the remaining prefix to the
# import path. If sys.path[0] does not contain "/training-clean", the result is
# an empty string.
sys.path.append("/training-clean".join(sys.path[0].split("/training-clean")[:-1]))
from utils import *

In [4]:
def read_txt(file_path):
    """Read a transcript file and return its lines as lists of lower-cased words.

    Each line is lower-cased, stripped and split on whitespace. The first
    token of every line is discarded (presumably the utterance id -- confirm
    against the transcript format) and the remaining tokens are kept.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line; each entry is the list of words on
        that line without the leading token.

    Raises:
        AssertionError: If a line has fewer than two whitespace-separated
            tokens (the offending line is the assertion message).
    """
    with open(file_path, 'r') as f:
        raw_lines = f.readlines()

    transcripts = []
    for raw in raw_lines:
        tokens = raw.lower().strip().split()
        # Every line must carry a leading token plus at least one word.
        assert len(tokens) >= 2, raw
        transcripts.append(tokens[1:])
    return transcripts

def create_word_counter(root_folder):
    """Count word occurrences over all transcript files under ``root_folder``.

    Transcript files are located with the glob pattern
    ``root_folder + '*/*/*.txt'``; ``root_folder`` is concatenated directly,
    so it should end with a path separator. Files are processed in sorted
    path order and parsed with ``read_txt``.

    Args:
        root_folder: Folder prefix that the glob pattern is appended to.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences across all transcripts.

    Raises:
        AssertionError: If the number of matched files is not 2866.
    """
    transcript_paths = sorted(glob.glob(root_folder + '*/*/*.txt'))
    # Sanity check: the merged dataset is expected to hold exactly 2866 files.
    assert len(transcript_paths) == 2866

    # Parse every file first, then tally the words line by line.
    all_lines = []
    for path in transcript_paths:
        all_lines += read_txt(path)

    counter = collections.Counter()
    for words in all_lines:
        counter.update(words)
    return counter

# Get alignment files

Download the `.txt` alignment files openly available on GitHub. These files contain word-level timestamps for each audio file, from which the start and end times of each word are derived. We use these files to create a vocabulary of words.

In [5]:
def handle_single_alignment_file(p1):
    """Parse one alignment file into a list of per-utterance dictionaries.

    Every non-blank line is expected to hold three whitespace-separated
    fields: a key, a quoted comma-separated word list and a quoted
    comma-separated list of timestamps, one timestamp per word slot. Each
    timestamp is used as the end time of its slot; the start time is the
    previous slot's timestamp (0 for the first slot). Empty word slots are
    skipped but still advance the running start time.

    Args:
        p1: Path of the alignment ``.txt`` file.

    Returns:
        A list of dicts, one per line, with the keys ``"key"``,
        ``"word_array"`` (lower-cased non-empty words), ``"time_array"``
        (``(start_time, end_time)`` tuples aligned with ``word_array``) and
        ``"duration"`` (the last timestamp on the line).

    Raises:
        ValueError: If a line does not contain the three expected fields or
            a timestamp cannot be converted to ``float``.
        AssertionError: If the number of timestamps differs from the number
            of word slots.
    """
    obj_list = []
    with open(p1, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            # The key is the first field and the timestamps the last; the
            # original unpacked the characters of the line instead of its
            # fields.
            key, rest = line.split(maxsplit=1)
            text, time_str = rest.rsplit(maxsplit=1)

            # Drop the surrounding quote characters before splitting on commas.
            text_array = text[1:-1].lower().split(",")
            time_array = [float(c) for c in time_str[1:-1].split(",")]
            assert len(time_array) == len(text_array), line

            ans_word_array = []
            ans_time_array = []
            start_time = 0
            for word, end_time in zip(text_array, time_array):
                if word:
                    ans_word_array.append(word)
                    ans_time_array.append((start_time, end_time))
                # The next slot starts where this one ends, even if it was empty.
                start_time = end_time

            obj_list.append({
                "key": key,
                "word_array": ans_word_array,
                "time_array": ans_time_array,
                "duration": time_array[-1],
            })
    return obj_list

def convert_alignment_files(alignment_folder):
    """Parse every alignment file of the four dataset subsets.

    For each subset name the pattern
    ``alignment_folder + name + '/*/*.txt'`` is globbed;
    ``alignment_folder`` is concatenated directly, so it should end with a
    path separator. Each matched file is parsed with
    ``handle_single_alignment_file``.

    Args:
        alignment_folder: Folder prefix containing one sub-folder per subset.

    Returns:
        A flat list with the parsed objects of all alignment files.

    Raises:
        AssertionError: If the number of alignment files is not 2866 or the
            total number of parsed objects is not 137833.
    """
    # Step 1: collect all the alignment files, subset by subset.
    names = ['test-clean', 'train-clean-360', 'train-clean-100', 'dev-clean']
    txt_file_arr = [
        path
        for name in names
        for path in glob.glob(alignment_folder + name + '/*/*.txt')
    ]
    assert len(txt_file_arr) == 2866

    # Step 2: parse each file and concatenate the results.
    all_objs = []
    for path in txt_file_arr:
        all_objs += handle_single_alignment_file(path)
    assert len(all_objs) == 137833
    return all_objs

def create_word2keys(all_objs):
    """Build a mapping from each word to the keys of the objects containing it.

    Args:
        all_objs: Iterable of dicts that each provide a ``"key"`` entry and a
            ``"word_array"`` entry (an iterable of words).

    Returns:
        A ``collections.defaultdict(list)`` mapping every word to the list of
        keys it appears under. A key is appended once per occurrence, so a
        word repeated within one object yields repeated keys.
    """
    index = collections.defaultdict(list)
    for entry in all_objs:
        entry_key = entry["key"]
        for token in entry["word_array"]:
            index[token].append(entry_key)
    return index