In [1]:
import os
import re
import subprocess
import num2words
import pydub
from pydub import AudioSegment

## Convert video to WAV

In [47]:
def video_to_wav(video_file, output_dir=''):
    """Extract the audio of a video into a mono, 20 kHz WAV file with ffmpeg.

    The WAV path is ``video_file`` with its extension replaced by ``.wav``,
    joined onto ``output_dir``. If a file already exists at that path,
    ffmpeg is not invoked.

    Args:
        video_file: Path of the input video (typically ``*.mp4``).
        output_dir: Directory prefix for the WAV path. The default ``''``
            leaves the WAV path next to the video path.

    Returns:
        None. The ffmpeg exit status is not inspected.
    """
    # splitext swaps only the real extension; str.replace('.mp4', '.wav')
    # would also rewrite a '.mp4' that appears elsewhere in the path.
    wav_file = os.path.splitext(video_file)[0] + '.wav'
    wav_file = os.path.join(output_dir, wav_file)

    if not os.path.isfile(wav_file):
        # Pass an argument list and no shell: file names containing spaces,
        # quotes or shell metacharacters reach ffmpeg verbatim instead of
        # being split or executed by the shell.
        command = [
            'ffmpeg',
            '-hide_banner', '-loglevel', 'error',  # global options first
            '-i', video_file,
            '-ac', '1',      # audio channels
            '-ar', '20000',  # audio sampling rate
            '-vn',           # disable video
            wav_file,
        ]
        subprocess.call(command)
    
def dir_to_wav(video_dir):
    """Convert every ``*.mp4`` file directly inside ``video_dir`` to ``*.wav``.

    Args:
        video_dir: Directory whose entries are scanned (not recursive).
    """
    # sorted() only makes the processing order deterministic.
    for file in sorted(os.listdir(video_dir)):
        if file.endswith('.mp4'):
            # os.listdir yields bare names; without joining the directory the
            # path would be resolved against the current working directory.
            video_to_wav(os.path.join(video_dir, file))
            
# Convert the .mp4 files found in the current working directory.
video_dir = '.'
dir_to_wav(video_dir)

## SRT subtitles to transcript and time slices

In [35]:
def clean_text(text):
    """Normalize a subtitle line to lowercase, punctuation-free words.

    Steps: lowercase, replace every character that is neither a word
    character nor whitespace with a space, spell out numeric tokens
    (four-digit tokens are read as years, all others as cardinals), then
    strip punctuation again and collapse whitespace.

    Args:
        text: Raw subtitle text.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    text = text.lower().strip('\n')
    text = re.sub(r'[^\w\s]', ' ', text)

    words = []
    for token in text.split():
        # isdecimal() rather than isdigit(): characters such as '²' count as
        # digits but cannot be parsed as a number.
        if not token.isdecimal():
            words.append(token)
        elif len(token) == 4:
            words.append(num2words.num2words(token, to='year'))  # year to words
        else:
            words.append(num2words.num2words(token))  # num to words
    text = ' '.join(words)

    # The spelled-out numbers bring their own punctuation ("twenty-one",
    # "twelve thousand, three hundred"), so strip punctuation once more.
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(text.split())  # remove redundant spaces

def to_ms(string):
    """Convert an ``HH:MM:SS,mmm`` timestamp string to integer milliseconds.

    Removing the comma fuses seconds and milliseconds into one integer
    (``"03,456"`` -> ``3456``), which is already a millisecond count; hours
    and minutes are then scaled to milliseconds and added.
    """
    hours, minutes, sec_and_millis = string.replace(',', '').split(':')
    return int(sec_and_millis) + int(hours) * 3600 * 1000 + int(minutes) * 60 * 1000

In [None]:
def srt_to_txt(srt_file, srt_dir=''):
    """Parse an SRT subtitle file into transcript lines and time slices.

    A cue is recognised as a line holding only digits, followed by a line
    containing at least two ``HH:MM:SS,mmm`` timestamps. All text lines up to
    the next blank line belong to the cue and are joined with spaces before
    being passed to ``clean_text``. Incomplete cues (no timestamp line, or
    fewer than two timestamps) are skipped.

    Args:
        srt_file: File name of the subtitle file; its name without the
            extension prefixes every transcript line.
        srt_dir: Directory containing ``srt_file``.

    Returns:
        A tuple ``(transcript, time_slices)`` where ``transcript`` is a list
        of ``"<file stem>-<cue number> <cleaned text>"`` strings and
        ``time_slices`` is a list of ``(cue number, (start, end))`` with the
        cue number as a string and start/end as returned by ``to_ms``.
    """
    filepath = os.path.join(srt_dir, srt_file)
    # utf-8-sig drops a leading byte-order mark if one is present.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    # splitext removes only the extension; str.strip('.srt') would strip any
    # of the characters '.', 's', 'r', 't' from both ends of the name.
    file_stem = os.path.splitext(srt_file)[0]
    timestamp_re = re.compile(r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}')

    transcript = []
    time_slices = []

    index = 0
    while index < len(lines):
        cue_number = re.fullmatch(r'\d+', lines[index].lstrip('\ufeff').strip())
        timestamps = []
        if cue_number and index + 1 < len(lines):
            timestamps = timestamp_re.findall(lines[index + 1])
        if len(timestamps) < 2:
            index += 1
            continue

        number = cue_number[0]
        start, end = to_ms(timestamps[0]), to_ms(timestamps[1])
        time_slices.append((number, (start, end)))

        # Collect every text line of the cue; the blank separator (or the end
        # of the file) terminates it.
        index += 2
        text_lines = []
        while index < len(lines) and lines[index].strip():
            text_lines.append(lines[index].strip())
            index += 1

        text = clean_text(' '.join(text_lines))
        transcript.append(f"{file_stem}-{number} {text}")

    return transcript, time_slices
    
# Parse one subtitle file from the data directory into transcript lines and
# per-cue (start, end) time slices used by the cells below.
srt_file = 'TanyaBoucicaut_WYSRTheirEyesWereWatchingGod_2021E.srt'
srt_dir = 'data'
transcript, time_slices = srt_to_txt(srt_file, srt_dir)




In [39]:
# Cut the full recording into one WAV file per subtitle cue.
wavfile = 'data/TanyaBoucicaut_WYSRTheirEyesWereWatchingGod_2021E-600k.wav'
wav = AudioSegment.from_wav(wavfile)

# os.path.splitext removes only the final extension. str.strip('.wav') would
# instead strip any of the characters '.', 'w', 'a', 'v' from both ends of the
# path, which mangles names that start or end with those letters.
wav_stem = os.path.splitext(wavfile)[0]

for num, time_slice in time_slices:
    start_ms, end_ms = time_slice
    wav_slice = wav[start_ms:end_ms]
    wav_slice.export(f"{wav_stem}-{num}.wav", format="wav")

In [41]:
# Save the transcript, one entry per line.
with open('your_file.txt', 'w') as f:
    f.writelines("%s\n" % entry for entry in transcript)