In [58]:
import os
import torch
import pandas as pd
import datetime
import soundfile as sf
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [112]:
def parse_time(t):
    # start_time, end_time = map(int, t.split("_"))
    # dur = end_time - start_time
    # return start_time, dur
    start_time, end_time = map(int, t.split("_"))
    start_time /= 1000
    start_hour = start_time//3600%24
    start_min = start_time//60%60
    start_sec = start_time%60
    start = "%02d:%02d:%06.3f"%(start_hour, start_min, start_sec)
    
    end_time /= 1000
    end_hour = end_time//3600%24
    end_min = end_time//60%60
    end_sec = end_time%60
    end = "%02d:%02d:%06.3f"%(end_hour, end_min, end_sec)

    return f"{start} --> {end}".replace(".", ",")

    
def trans2flac(audio_path, audio_name, output_path, format):
    audio_type = audio_name.split(".")[-1][1:]
    exp_file = audio_name.split(".")[0]
    audio = AudioSegment.from_file(os.path.join(audio_path, audio_name), format=audio_type)
    audio.export(f'{os.path.join(output_path, exp_file)}.{format}', format = str(format))
    return f"{exp_file}.{format}"

def speech2text(path):
    # load audio
    audio_input, sample_rate = sf.read(path)
    # pad input values and return pt tensor
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(DEVICE)
    # retrieve logits & take argmax
    pred = model(input_values)
    logits = pred.logits
    predicted_ids = torch.argmax(logits, dim=-1)[0]

    # transcribe
    transcription = processor.decode(predicted_ids)

    return transcription

In [113]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
audio_name = "Interstellar"
base = os.path.abspath(os.path.join(os.path.abspath("."), ".."))
AUDIO_FOLDER = os.path.join(base, "Data", "outputs", audio_name, "Preprocess", "Split")
CLEAN_SPEECH_PATH = os.path.join(base, "Data", "Clean splits", audio_name)
TRANSCIPTION_PATH = os.path.join(base, "Data", "Transciption")
print(AUDIO_FOLDER)
if not os.path.exists(CLEAN_SPEECH_PATH):
    os.mkdir(CLEAN_SPEECH_PATH)

e:\Graduate\2021-2022 Term 2\AIPI540\Individual Project\Data\outputs\Interstellar\Preprocess\Split


In [114]:
# load pretrained model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(DEVICE)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [115]:
AUDIO_NAMES = []
for root, dir, files in os.walk(AUDIO_FOLDER):
    AUDIO_NAMES += files
AUDIO_TIME = list([parse_time(time.split(".")[0]) for time in AUDIO_NAMES])
CLEAN_SPEECH = [trans2flac(AUDIO_FOLDER, audio_name, CLEAN_SPEECH_PATH, "flac") for audio_name in AUDIO_NAMES]
AUDIO_PATH = [os.path.join(CLEAN_SPEECH_PATH, audio_name) for audio_name in CLEAN_SPEECH]

In [116]:
subtitles = pd.DataFrame(columns=["Time" , "Path", "Text"])
subtitles["Time"] = AUDIO_TIME
subtitles["Path"] = AUDIO_PATH
subtitles["Text"] = subtitles["Path"].apply(lambda x: speech2text(x))

In [117]:
subtitles.head(10)

Unnamed: 0,Time,Path,Text
0,"00:00:00,000 --> 00:00:04,040",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,DON'T YOU KNOW WHO WE ARE NO PROFESSOR I DON
1,"00:00:10,690 --> 00:00:12,257",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,NATTER
2,"00:00:12,257 --> 00:00:14,534",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,SAME MATTER YOU FLEWFOR
3,"00:00:14,534 --> 00:00:18,853",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,RUUH
4,"00:00:18,853 --> 00:00:25,061",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,HE
5,"00:00:25,061 --> 00:00:31,116",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,I HEARD THEY SHUT YOU DOWN FER O REFUSING TO D...
6,"00:00:31,116 --> 00:00:37,647",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,FUR WHEN THEY REALIZED THAT KILLING OUT TOF PE...
7,"00:00:37,647 --> 00:00:42,729",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,WHI I BECAUSE PUBLIC OPINION WOULDN'T ALLOW SP...
8,"00:00:04,040 --> 00:00:06,557",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,AND OW MY FATHER FFOR MISTER GAN
9,"00:00:42,729 --> 00:00:48,183",e:\Graduate\2021-2022 Term 2\AIPI540\Individua...,NOT WHELE YOUR STRAGGLIN TOWOLT ON PA


In [118]:
filename = f"{audio_name}.srt"
# save = subtitles.drop(["Path"], axis = 1)
# save.loc[save["Text"]=="", ["Text"]] = " "
# save.to_csv(os.path.join(TRANSCIPTION_PATH, filename), index=False)
f = open(os.path.join(TRANSCIPTION_PATH, filename), "w")
for idx, row in subtitles.iterrows():
    f.write(f'{idx+1}\n{row["Time"]}\n{row["Text"]}\n\n')
f.close()