In [1]:
import os
from joblib import Parallel, delayed
from os.path import join
from pathlib import Path
import re
import shutil
import subprocess
import num2words
import pydub
from pydub import AudioSegment

In [2]:
import logging

# Logger setup
# `logger` is the root logger; basicConfig sets the root level to INFO and
# formats every record as "LEVEL: source-file: message".
logger = logging.getLogger()
logging.basicConfig(level="INFO", format="%(levelname)s: %(filename)s: %(message)s")

## List files

``` python
dv_set = Dataset(path, dev_split, tokenizer, 1)
tr_set = Dataset(path, train_split, tokenizer, bucket_size, ascending=ascending)
```

``` python
class LibriDataset(Dataset):
    """Dataset pairing ``*.flac`` files with their tokenizer-encoded transcripts.

    Files are collected recursively from every split directory under ``path``
    and stored sorted by encoded-transcript length. ``__getitem__`` returns a
    single (file, text) pair, or a bucket of pairs when ``bucket_size > 1``.
    """
    def __init__(self, path, split, tokenizer, bucket_size, ascending=False):
        """Collect files, read and encode their text, then sort by text length.

        Args:
            path: root directory that contains the split directories.
            split: iterable of split directory names located under ``path``.
            tokenizer: object whose ``encode(txt)`` result supports ``len``.
            bucket_size: number of items returned per ``__getitem__`` call
                when greater than 1.
            ascending: sort shortest-first when True; the default (False)
                sorts longest-first.
        """
        # Setup
        self.path = path
        self.bucket_size = bucket_size

        # List all wave files
        file_list = []
        for s in split:
            split_list = list(Path(join(path, s)).rglob("*.flac"))
            # Fail early if a split directory yields no audio at all.
            assert len(split_list) > 0, "No data found @ {}".format(join(path,s))
            file_list += split_list
        # Read text
        # `read_text` is not defined in this snippet; presumably it looks up the
        # transcript belonging to one audio file -- TODO confirm.
        text = Parallel(n_jobs=READ_FILE_THREADS)(
            delayed(read_text)(str(f)) for f in file_list)
        #text = Parallel(n_jobs=-1)(delayed(tokenizer.encode)(txt) for txt in text)
        text = [tokenizer.encode(txt) for txt in text]

        # Sort dataset by text length
        # (length of the encoded text; reverse=not ascending => longest first by default)
        #file_len = Parallel(n_jobs=READ_FILE_THREADS)(delayed(getsize)(f) for f in file_list)
        self.file_list, self.text = zip(*[(f_name, txt)
                                          for f_name, txt in sorted(zip(file_list, text), reverse=not ascending, key=lambda x:len(x[1]))])

    def __getitem__(self, index):
        """Return one (file, text) pair, or a list of ``bucket_size`` pairs starting at ``index``."""
        if self.bucket_size > 1:
            # Return a bucket
            # Clamp the start so a bucket requested near the end does not run past the list.
            index = min(len(self.file_list)-self.bucket_size, index)
            return [(f_path, txt) for f_path, txt in
                    zip(self.file_list[index:index+self.bucket_size], self.text[index:index+self.bucket_size])]
        else:
            return self.file_list[index], self.text[index]

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.file_list)
```

```
file data/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac
src_file data/LibriSpeech/dev-clean/1272/128104/1272-128104.trans.txt
idx 1272-128104-0000
MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL
```

## Ted Srt

### From `scraper/data` to `data/TedSrt`

In [3]:
# Extension of the source audio files; also passed to pydub as the load/export format.
AUDIO_EXTENSION = 'mp3'
# Worker count handed to joblib.Parallel in the reference LibriDataset snippet.
READ_FILE_THREADS = 1
# Output root and split name: results are written under join(path, split).
path = 'data/TedSrt'
split = 'train'

# Input root: every entry in this directory is processed as one talk folder.
src_path = 'scraper/data'

In [4]:
def clean_text(text):
    '''
    Normalise one caption line into a plain label string.

    Lowercases the text, trims newlines from both ends, replaces every
    character that is neither a word character nor whitespace with a space,
    spells out four-digit tokens as years and any remaining all-digit tokens
    as numbers, collapses repeated spaces and finally turns hyphens into
    spaces.
    '''
    lowered = text.lower().strip('\n')
    no_symbols = re.sub(r'[^\w\s]', ' ', lowered)

    # Pass 1: a four-digit token is read as a year.
    year_pass = []
    for token in no_symbols.split():
        if token.isdigit() and len(token) == 4:
            token = num2words.num2words(token, to='year')
        year_pass.append(token)

    # Pass 2: re-split, then spell out whatever is still purely digits.
    number_pass = []
    for token in ' '.join(year_pass).split():
        if token.isdigit():
            token = num2words.num2words(token)
        number_pass.append(token)

    collapsed = re.sub(' +', ' ', ' '.join(number_pass))
    # Hyphens can only come from the spelled-out numbers at this point,
    # because the symbol pass above already removed them from the input.
    return collapsed.replace('-', ' ')

def to_ms(string):
    '''
    Convert a timestamp string '00:00:00,000' to integer milliseconds
    (used as a slice bound when cutting audio).
    '''
    # Removing the comma fuses seconds and milliseconds into a single
    # 'SSmmm' field, which already reads as a millisecond count.
    hours, minutes, sec_and_ms = string.replace(',', '').split(':')
    return int(sec_and_ms) + int(hours) * 3600 * 1000 + int(minutes) * 60 * 1000

def txt_to_trans(txt_file, file_name, text_processing=clean_text):
    '''
    Convert an SRT-style txt file to transcript lines ready to be read into Dataset.

    A cue is recognised as a line holding only digits (the cue index), followed
    by a line containing two 'HH:MM:SS,mmm' timestamps, followed by a text line.

    txt_file: path of the subtitle text file
    file_name: prefix used for every transcript line
    text_processing: callable applied to each cue's text line

    Returns (transcript, time_slices):
      transcript  - list of 'file_name-idx text_label' strings
      time_slices - list of (idx, (start_ms, end_ms)) tuples, same order

    Incomplete cues (file ends before the text line, or fewer than two
    timestamps on the timing line) are skipped instead of raising IndexError.

    NOTE(review): only the first text line of each cue is used; if the source
    files can contain multi-line captions the remaining lines are dropped.
    '''
    # Context manager closes the handle even if reading raises.
    with open(txt_file, 'r') as file:
        lines = file.readlines()

    transcript = []
    time_slices = []

    # Raw strings: '\d' in a normal string literal is an invalid escape sequence.
    index_pattern = re.compile(r'^[\d]+$')
    time_pattern = re.compile(r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}')

    for i, line in enumerate(lines):
        # The first line may start with a UTF-8 byte-order mark.
        idx_match = index_pattern.search(line.strip('\ufeff'))
        if not idx_match:
            continue

        # A cue needs both a timing line (i+1) and a text line (i+2).
        if i + 2 >= len(lines):
            logging.warning(f"Incomplete cue at line {i + 1} of {txt_file}, skipping")
            continue

        time_frame = time_pattern.findall(lines[i + 1])
        if len(time_frame) < 2:
            # Digit-only line that is not followed by a start/end timing pair.
            continue

        idx = idx_match[0]
        start, end = to_ms(time_frame[0]), to_ms(time_frame[1])
        time_slices.append((idx, (start, end)))

        text = text_processing(lines[i + 2])
        transcript.append(f"{file_name}-{idx} {text}")

    return transcript, time_slices

def save_trans(transcript, output_path):
    '''
    Write the transcript lines to output_path, one per line.

    transcript: iterable of already formatted transcript lines (no newline)
    output_path: destination file; missing parent directories are created

    The file is always (re)written. Previously the write was nested inside the
    "directory does not exist" branch, so nothing was saved when the directory
    was already present.
    '''
    out_dir = os.path.dirname(output_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        f.writelines(f"{line}\n" for line in transcript)


In [41]:
def check_ms_accuracy(time_slices, occurrence_threshold=20):
    '''
    Flag subtitle timings whose start times suspiciously often end in ',820'.

    time_slices: list of (idx, (start_ms, end_ms)) tuples
    occurrence_threshold: maximum number of such start times still tolerated

    Returns the number of start times with a millisecond part of exactly 820
    when it exceeds occurrence_threshold, otherwise 0. An empty list gives 0.
    '''
    # Counted once; the start time is element [1][0] of every entry.
    occurrence_820 = sum(1 for elem in time_slices if elem[1][0] % 1000 == 820)
    if occurrence_820 > occurrence_threshold:
        return occurrence_820
    return 0

def check_intro_timing(audio_duration, audio_srt_duration, max_diff=7):
    '''
    Flag audio that runs noticeably longer than its subtitles.

    audio_duration: length of the audio, in seconds
    audio_srt_duration: end time of the last subtitle, in seconds
    max_diff: largest tolerated surplus of audio over subtitles, in seconds

    Returns the difference (audio minus subtitles) when it is greater than
    max_diff, otherwise 0. Audio shorter than the subtitles is never flagged.
    '''
    duration_diff = audio_duration - audio_srt_duration
    if duration_diff > max_diff:
        return duration_diff
    return 0

In [42]:
# Start from an empty split directory so output from an earlier run cannot linger.
shutil.rmtree(join(path, split), ignore_errors=True)
# os.listdir returns entries in arbitrary order; sorting keeps the
# folder -> numeric id mapping identical from run to run.
folder_list = sorted(os.listdir(src_path))
for idx, curr_folder in enumerate(folder_list):
    file_name = str(idx) #save the transcript as num, can be changed to folder name
    output_path = join(path, split, file_name)
    txt_output_path = join(output_path, file_name + '.trans.txt')
    folder_path = Path(join(src_path, curr_folder))

    logging.info(f"{idx}. Creating transcript for {curr_folder}...")
    txt_matches = list(folder_path.rglob('*.txt'))
    audio_matches = list(folder_path.rglob('*.' + AUDIO_EXTENSION))
    # Stray files and incomplete downloads have no transcript/audio pair.
    if not txt_matches or not audio_matches:
        logging.warning(f"{idx}. Missing transcript or audio in {curr_folder}. Skipping.")
        continue
    transcript, time_slices = txt_to_trans(txt_matches[0], file_name)
    # Without cues there is nothing to slice, and time_slices[-1] below would fail.
    if not time_slices:
        logging.warning(f"{idx}. No subtitle cues found for {curr_folder}. Skipping.")
        continue

    # check whether srt is accurate, remove those not accurate to milliseconds (all time that starts with __,820ms)
    # This check only needs the cues, so it runs before the audio is decoded.
    ms_not_accurate = check_ms_accuracy(time_slices)
    if ms_not_accurate:
        logging.warning(f"{idx}. Likely srt not accurate with {ms_not_accurate} 820s. Deleting foler {output_path}")
        shutil.rmtree(output_path, ignore_errors=True)
        continue

    logging.info(f"{idx}. Slicing audio for {curr_folder}...")
    audio_file = AudioSegment.from_file(audio_matches[0], AUDIO_EXTENSION)

    # check whether srt matches audio, remove those without taking water drop intro into account
    audio_duration = audio_file.duration_seconds
    # End time of the last cue, converted from milliseconds to seconds.
    audio_srt_duration = time_slices[-1][-1][-1] / 1000
    intro_not_matched = check_intro_timing(audio_duration, audio_srt_duration)
    if intro_not_matched:
        logging.warning(f"{idx}. Likely srt not matching with time slices. Deleting foler {output_path}")
        shutil.rmtree(output_path, ignore_errors=True)
        continue

    # writing output
    save_trans(transcript, txt_output_path)
    # Separate loop names so the folder counter `idx` is not overwritten.
    for slice_idx, (start_ms, end_ms) in time_slices:
        audio_slice = audio_file[start_ms:end_ms]
        audio_output_path = join(output_path, f"{file_name}-{slice_idx}.{AUDIO_EXTENSION}")
        audio_slice.export(audio_output_path, format=AUDIO_EXTENSION)

INFO: 2397158020.py: 0. Creating transcript for alan_kay_a_powerful_idea_about_ideas...
INFO: 2397158020.py: 0. Slicing audio for alan_kay_a_powerful_idea_about_ideas...
INFO: 2397158020.py: 1. Creating transcript for alan_russell_the_potential_of_regenerative_medicine...
INFO: 2397158020.py: 1. Slicing audio for alan_russell_the_potential_of_regenerative_medicine...
INFO: 2397158020.py: 2. Creating transcript for alison_jackson_an_unusual_glimpse_at_celebrity...
INFO: 2397158020.py: 2. Slicing audio for alison_jackson_an_unusual_glimpse_at_celebrity...
INFO: 2397158020.py: 3. Creating transcript for allison_hunt_how_to_get_a_new_hip...
INFO: 2397158020.py: 3. Slicing audio for allison_hunt_how_to_get_a_new_hip...
INFO: 2397158020.py: 4. Creating transcript for anand_agarawala_rethink_the_desktop_with_bumptop...
INFO: 2397158020.py: 4. Slicing audio for anand_agarawala_rethink_the_desktop_with_bumptop...
INFO: 2397158020.py: 5. Creating transcript for arthur_benjamin_a_performance_of_m

In [None]:
# Always raises AssertionError("breakpoint"). Presumably a deliberate stop so that
# "Run All" does not continue into the scratch cells below -- confirm before removing.
assert False, "breakpoint"

In [None]:
# Scratch cell: load one audio file, passing AUDIO_EXTENSION as the format.
# This rebinds `audio_file`, the name also used inside the conversion loop.
audio_file = AudioSegment.from_file('data/audio.mp3', AUDIO_EXTENSION)

In [9]:
# Display the length of the loaded audio in seconds.
audio_file.duration_seconds

814.2

## Video to WAV

In [None]:
def video_to_wav(video_file, output_dir=''):
    """ calls ffmpeg to convert *.mp4 to *.wav if wav not exists

    video_file: path of the input video
    output_dir: directory for the wav file; when empty the wav is written
        next to the video

    The wav keeps the video's base name with the extension swapped for
    '.wav'. ffmpeg is invoked with an argument list rather than a shell
    string, so paths containing spaces or shell metacharacters are passed
    through unchanged and cannot be interpreted as shell syntax.
    """
    # Swap only the real extension; str.replace would also rewrite any
    # '.mp4' that occurs earlier in the path.
    wav_file = os.path.splitext(video_file)[0] + '.wav'
    if output_dir:
        # Keep only the file name so the output really lands in output_dir,
        # including when video_file carries its own directory part.
        wav_file = os.path.join(output_dir, os.path.basename(wav_file))

    # check if wav file exists
    if not os.path.isfile(wav_file):
        command = [
            'ffmpeg', '-hide_banner', '-loglevel', 'error',
            '-i', video_file,
            '-ac', '1',       # audio channels
            '-ar', '20000',   # audio sampling rate
            '-vn',            # disable video
            wav_file,
        ]
        subprocess.call(command)
    
def dir_to_wav(video_dir):
    """ convert *.mp4 in directory to *.wav

    video_dir: directory whose direct children are scanned for '.mp4' files

    Each match is handed to video_to_wav as a path inside video_dir, so the
    conversion also works when video_dir is not the current working directory.
    """
    for file in os.listdir(video_dir):
        if file.endswith('.mp4'):
            # os.listdir yields bare names; prefix the directory.
            video_to_wav(os.path.join(video_dir, file))
            
# Convert every .mp4 found directly in the current working directory.
video_dir = '.'
dir_to_wav(video_dir)

## SRT

In [None]:
def clean_text(text):
    '''
    Clean a caption line before it is stored as a label.

    NOTE: this redefines the clean_text declared earlier in the notebook with
    the same behaviour: lowercase, trim newlines from the ends, symbols to
    spaces, four-digit tokens to year words, other digit tokens to number
    words, repeated spaces collapsed, hyphens to spaces.
    '''
    def spell_year(word):
        # Four-digit tokens are read as years.
        if word.isdigit() and len(word) == 4:
            return num2words.num2words(word, to='year')
        return word

    def spell_number(word):
        # Any token that is still all digits becomes a number in words.
        return num2words.num2words(word) if word.isdigit() else word

    cleaned = re.sub(r'[^\w\s]', ' ', text.lower().strip('\n'))
    cleaned = ' '.join(map(spell_year, cleaned.split()))
    cleaned = ' '.join(map(spell_number, cleaned.split()))
    cleaned = re.sub(' +', ' ', cleaned)
    return cleaned.replace('-', ' ')

def to_ms(string):
    '''
    Convert a 'HH:MM:SS,mmm' timestamp to integer milliseconds.

    NOTE: this redefines the to_ms declared earlier in the notebook with the
    same behaviour.
    '''
    fields = string.replace(',', '').split(':')
    hour, minute, second_and_ms = fields
    # With the comma removed, the last field is 'SSmmm', i.e. already milliseconds.
    total = int(second_and_ms)
    total += int(hour) * 3600 * 1000
    total += int(minute) * 60 * 1000
    return total

In [None]:
def srt_to_txt(srt_file, srt_dir=''):
    '''
    Parse an .srt file into transcript lines and audio slice boundaries.

    srt_file: file name of the subtitle file; without its '.srt' suffix it is
        the prefix of every transcript line
    srt_dir: directory that contains srt_file

    Returns (transcript, time_slices):
      transcript  - list of '<name>-<number> <cleaned text>' strings
      time_slices - list of (number, (start_ms, end_ms)) tuples, same order

    Incomplete cues (file ends before the text line, or fewer than two
    timestamps on the timing line) are skipped instead of raising IndexError.

    NOTE(review): only the first text line of each cue is used.
    '''
    filepath = os.path.join(srt_dir, srt_file)
    # Context manager closes the handle even if reading raises.
    with open(filepath, 'r') as file:
        lines = file.readlines()

    # str.strip('.srt') removes any of the characters '.', 's', 'r', 't' from
    # BOTH ends (e.g. 'start.srt' -> 'a'); cut the suffix explicitly instead.
    suffix = '.srt'
    name = srt_file[:-len(suffix)] if srt_file.endswith(suffix) else srt_file

    transcript = []
    time_slices = []

    # Raw strings: '\d' in a normal string literal is an invalid escape sequence.
    number_pattern = re.compile(r'^[\d]+$')
    time_pattern = re.compile(r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}')

    for i, line in enumerate(lines):
        # The first line may start with a UTF-8 byte-order mark.
        number_match = number_pattern.search(line.strip('\ufeff'))
        if not number_match:
            continue
        # A cue needs both a timing line (i+1) and a text line (i+2).
        if i + 2 >= len(lines):
            continue
        time = time_pattern.findall(lines[i + 1])
        if len(time) < 2:
            continue

        number = number_match[0]
        start, end = to_ms(time[0]), to_ms(time[1])
        time_slices.append((number, (start, end)))

        text = clean_text(lines[i + 2])
        transcript.append(f"{name}-{number} {text}")

    return transcript, time_slices
    
# Parse one example subtitle file; `transcript` and `time_slices` are used by the cells below.
srt_file = 'TanyaBoucicaut_WYSRTheirEyesWereWatchingGod_2021E.srt'
srt_dir = 'data'
transcript, time_slices = srt_to_txt(srt_file, srt_dir)




In [None]:
wavfile = 'data/TanyaBoucicaut_WYSRTheirEyesWereWatchingGod_2021E-600k.wav'
wav = AudioSegment.from_wav(wavfile)

# Output prefix = input path without its extension. os.path.splitext is used
# because str.strip('.wav') removes any of the characters '.', 'w', 'a', 'v'
# from both ends of the string rather than the '.wav' suffix.
wav_stem = os.path.splitext(wavfile)[0]

# One exported file per subtitle cue, cut at the cue's start/end milliseconds.
for num, (start_ms, end_ms) in time_slices:
    wav_slice = wav[start_ms:end_ms]
    wav_slice.export(f"{wav_stem}-{num}.wav", format="wav")

In [None]:
# Scratch cell: loads an mp3 file with the "mp3" format.
# NOTE(review): the name says mp4 but the file and format are mp3 -- confirm intent.
mp4_version = AudioSegment.from_file("data/audio.mp3", "mp3")

In [None]:
# Dump the transcript to a text file, one entry per line.
with open('your_file.txt', 'w') as f:
    f.writelines("%s\n" % item for item in transcript)