In [None]:
import subprocess
import os
import whisper
import re

In [None]:
# Load the "base" Whisper checkpoint once; videoToText() reads this global `model`.
model = whisper.load_model(name="base")

In [None]:
def audioToText(model: "whisper.Whisper", audio) -> str:
    """Transcribe the first 30-second window of an audio file with Whisper.

    Args:
        model: A loaded Whisper model (e.g. the result of ``whisper.load_model``).
        audio: Audio file to transcribe; handed straight to ``whisper.load_audio``.

    Returns:
        The text decoded from the padded/trimmed audio window.
    """
    # Whisper decodes fixed-size windows, so pad or trim the waveform to fit 30 seconds.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Build the log-Mel spectrogram with the number of mel bins this checkpoint
    # was trained with (not every checkpoint uses the library default), then
    # move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples, n_mels=model.dims.n_mels).to(model.device)

    # No separate model.detect_language() pass here: its result was never used,
    # and whisper.decode() detects the language itself when the options leave
    # `language` unset.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    return result.text

In [None]:
def _runFfmpeg(arguments: list) -> bool:
    """Run ``ffmpeg`` with the given arguments; return True when it exits with status 0."""
    # The command is passed as an argument list WITHOUT shell=True: combining a
    # list with shell=True makes POSIX shells run a bare `ffmpeg` and drop every
    # argument. "-y" overwrites outputs of a previous run instead of prompting.
    completed = subprocess.run(["ffmpeg", "-y", *arguments])
    if completed.returncode != 0:
        print(f"ffmpeg failed (exit code {completed.returncode}): {arguments}")
        return False
    return True


def videoToText(input_video_path: str = 'raw_video', output_audio_path: str = 'mp3', output_text_path: str = "text") -> None:
    """Convert every ``.mp4`` in a folder to 30-second ``.mp3`` segments and transcribe them.

    Step 1 extracts ``<name>.mp3`` from each video and splits it into
    ``<name>_000.mp3``, ``<name>_001.mp3``, ... inside ``output_audio_path``.
    Step 2 transcribes every ``.mp3`` found in ``output_audio_path`` (the
    unsegmented file included) and writes ``<name>.txt`` files into
    ``output_text_path``.

    Uses the notebook-level ``model`` and ``audioToText`` defined in other cells.

    Args:
        input_video_path: Folder containing the source ``.mp4`` files.
        output_audio_path: Folder receiving the ``.mp3`` files (created if missing).
        output_text_path: Folder receiving the ``.txt`` transcripts (created if missing).

    Raises:
        FileNotFoundError: If ``input_video_path`` does not exist or the
            ``ffmpeg`` executable cannot be found.
    """
    os.makedirs(output_audio_path, exist_ok=True)
    os.makedirs(output_text_path, exist_ok=True)

    # 1) iterate through all the mp4 files in the folder and segment them into 30 seconds audios
    for file in os.listdir(input_video_path):
        if not file.endswith(".mp4"):
            continue
        # splitext keeps dots inside the name ("my.clip.mp4" -> "my.clip")
        file_name = os.path.splitext(file)[0]
        video_path = os.path.abspath(os.path.join(input_video_path, file))
        audio_path = os.path.join(output_audio_path, file_name + ".mp3")
        segment_pattern = os.path.join(output_audio_path, file_name + "_%03d.mp3")

        # convert the video to audio; skip segmentation when the extraction failed
        if not _runFfmpeg(["-i", video_path, audio_path]):
            continue
        # segment the audio without re-encoding
        _runFfmpeg(["-i", audio_path, "-f", "segment", "-segment_time", "30", "-c", "copy", segment_pattern])

    # 2) iterate through all the mp3 files in the folder and convert them to text
    for file in os.listdir(output_audio_path):
        if not file.endswith(".mp3"):
            continue
        file_name = os.path.splitext(file)[0]
        audio_path = os.path.abspath(os.path.join(output_audio_path, file))
        # convert the audio to text
        text = audioToText(model, audio_path)
        # write the text to a file
        with open(os.path.join(output_text_path, file_name + ".txt"), "w") as f:
            f.write(text)

In [None]:
def getFileName(file_path: str = 'raw_video') -> list:
    """Return the base names (without extension) of the ``.mp4`` files in a folder.

    Args:
        file_path: Folder to scan.

    Returns:
        Sorted list of file names with the ``.mp4`` extension removed. Dots
        inside a name are preserved ("my.clip.mp4" -> "my.clip").
    """
    # os.listdir order is arbitrary; sorting keeps the result reproducible.
    return sorted(
        os.path.splitext(file)[0]
        for file in os.listdir(file_path)
        if file.endswith(".mp4")
    )

In [None]:
def recordFileName(file_path: str = 'raw_video', record_path: str = 'record.txt') -> None:
    """Write the base name of every ``.mp4`` in ``file_path`` to a record file.

    Args:
        file_path: Folder scanned by ``getFileName``.
        record_path: File to (over)write, one name per line. Defaults to
            ``record.txt`` in the current working directory.
    """
    recorded_names = getFileName(file_path)
    with open(record_path, "w") as f:
        f.writelines(name + "\n" for name in recorded_names)

In [None]:
def combineTextFiles(fileNameRecords: str = 'record.txt', input_text_path: str = "text", output_text_path: str = "result_text") -> None:
    """Concatenate the per-segment transcripts of each recorded video into one file.

    For every name listed in ``fileNameRecords`` the files
    ``<name>_NNN.txt`` found in ``input_text_path`` are joined in segment order
    (each followed by a newline) and written to ``<output_text_path>/<name>.txt``.

    Args:
        fileNameRecords: Text file with one video base name per line.
        input_text_path: Folder holding the per-segment ``.txt`` files.
        output_text_path: Folder receiving the combined files (created if missing).
    """
    # read the record file to learn which videos to group
    with open(fileNameRecords, "r") as record_file:
        recorded_names = record_file.read().splitlines()

    os.makedirs(output_text_path, exist_ok=True)
    available_files = os.listdir(input_text_path)

    for file_name in recorded_names:
        if not file_name:
            # a blank line in the record file names no video
            continue
        # Escape the name so characters such as "+" or "(" are matched literally,
        # and require the whole file name to match so "x_000.txt.bak" is ignored.
        # Three or more digits: the segment counter grows past 999 for long videos.
        segment_pattern = re.compile(re.escape(file_name) + r"_(\d{3,})\.txt")
        segments = []
        for candidate in available_files:
            matched = segment_pattern.fullmatch(candidate)
            if matched:
                segments.append((int(matched.group(1)), candidate))
        # order by segment number so 1000 follows 999
        segments.sort()
        # concatenate the text files
        with open(os.path.join(output_text_path, file_name + ".txt"), "w") as combined:
            for _, segment_file in segments:
                with open(os.path.join(input_text_path, segment_file), "r") as part:
                    combined.write(part.read() + "\n")

In [None]:
# Pipeline driver: the three stages hand data to each other through folders on
# disk, so the paths are spelled out here to make that hand-off visible.

# Stage 1: remember the base name of every video for the regex grouping in stage 3.
recordFileName(file_path='raw_video')

# Stage 2: extract audio, cut it into 30-second segments, and transcribe each one.
videoToText(input_video_path='raw_video', output_audio_path='mp3', output_text_path="text")

# Stage 3: merge the per-segment transcripts into one text file per recorded video.
combineTextFiles(fileNameRecords='record.txt', input_text_path="text", output_text_path="result_text")