In [28]:
#!pip install spleeter-gpu
import os
import gc
import pydub
import wave as we
import numpy as np
import matplotlib.pyplot as plt 
import subprocess
from pydub import AudioSegment
# from spleeter.separator import Separator

In [29]:
def extract_audio(videos_file_path, file_name, export_path, format = "wav"):
    """Extract a mono audio track from a video file by calling ffmpeg.

    ffmpeg is invoked with video disabled (``-vn``), a single channel
    (``-ac 1``), the module-level ``SAMPLE_RATE`` (``-ar``) and overwrite
    enabled (``-y``).

    Args:
        videos_file_path: Directory that contains the video.
        file_name: Video file name. Only the text before the first "." is
            used as the audio name, so "a.b.mp4" becomes "a.<format>".
        export_path: Directory the audio file is written to.
        format: ffmpeg output format; also used as the file extension.

    Returns:
        The audio file name (``<name>.<format>``), without any directory.
        The name is returned even if ffmpeg fails: failures are printed,
        not raised.
    """
    # Derive the names before the try block so the return value is always bound.
    audio_name = file_name.split(".")[0]
    audio_file = f"{audio_name}.{format}"
    video_path = os.path.join(videos_file_path, file_name)
    audio_path = os.path.join(export_path, audio_file)

    try:
        # An argument list needs no manual quoting and, unlike a single command
        # string without shell=True, also works on POSIX systems.
        cmd = [
            "ffmpeg", "-i", video_path,
            "-f", format,
            "-vn", "-ac", "1", "-ar", str(SAMPLE_RATE),
            "-y", audio_path,
        ]
        return_code = subprocess.call(cmd)
        if return_code != 0:
            print("Error: ", f"ffmpeg exited with code {return_code}")
    except Exception as ex:
        # Best effort: report the problem and still hand back the expected name.
        print("Error: ", ex)

    return audio_file

def normalize(audio_path, audio_file, output_path, format = "wav"):
    """Loudness-normalize an audio file with the ``ffmpeg-normalize`` CLI.

    The result is written to ``<output_path>/<name>-normalized.<format>``,
    where ``<name>`` is the text of ``audio_file`` before its first ".".
    The module-level ``SAMPLE_RATE`` is passed as ``--sample-rate``.

    Args:
        audio_path: Directory that contains the input file.
        audio_file: Input file name.
        output_path: Directory the normalized file is written to.
        format: Extension used for the output file name.

    Returns:
        None. Failures are printed, not raised.
    """
    try:
        audio_name = audio_file.split(".")[0]
        input_path = os.path.join(audio_path, audio_file)
        normalized_path = os.path.join(output_path, f"{audio_name}-normalized.{format}")
        # An argument list needs no manual quoting and, unlike a single command
        # string without shell=True, also works on POSIX systems.
        cmd = [
            "ffmpeg-normalize", input_path,
            "-o", normalized_path,
            "--sample-rate", str(SAMPLE_RATE),
        ]
        return_code = subprocess.call(cmd)
        if return_code != 0:
            print("Error: ", f"ffmpeg-normalize exited with code {return_code}")
    except Exception as ex:
        # Best effort: report the problem and carry on.
        print("Error: ", ex)

def vocal_spearation(audio_path, output_path):
    """Run the external ``inference.py`` script on an audio file.

    ``inference.py`` is given as a relative path, so it is resolved against
    the current working directory. The script is called with ``--gpu 0`` and
    ``-B 4``.
    NOTE(review): callers in this notebook expect the script to write
    ``<stem>_Vocals.<ext>`` into ``output_path`` -- confirm against the script.

    Args:
        audio_path: Full path of the input audio file.
        output_path: Directory passed to the script as ``--output``.

    Returns:
        None. Failures are printed, not raised.
    """
    try:
        # An argument list needs no manual quoting and, unlike a single command
        # string without shell=True, also works on POSIX systems.
        cmd = [
            "python", "inference.py",
            "--input", audio_path,
            "--output", output_path,
            "--gpu", "0",
            "-B", "4",
        ]
        return_code = subprocess.call(cmd)
        if return_code != 0:
            print("Error: ", f"inference.py exited with code {return_code}")
    except Exception as ex:
        # Best effort: report the problem and carry on.
        print("Error: ", ex)

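# Disabled alternative: Spleeter-based vocal separation, kept for reference only.
# Re-enabling it also requires the commented-out `spleeter` import in the first cell.
# def vocal_separation(audio_path, audio_name, output_path = None):
#     if output_path is None:
#         output_path = audio_path

#     separator = Separator('spleeter:2stems')
#     separator.separate_to_file(os.path.join(audio_path, audio_name), output_path)
# vocal_separation(DATA_PATH, audio, OUTPUT_PATH)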

def split_audio(audio_path, audio_name, save_path, min_silence_len = 1000, silence_thresh = -16, keep_silence = 100):
    """Cut an audio file into segments at silences and plot the split points.

    The audio is gain-adjusted to ``silence_thresh`` dBFS with
    ``match_target_amplitude`` and non-silent ranges are detected on that
    adjusted copy; the exported segments are sliced from the unmodified audio.
    Segments are written to ``<audio_path>/<min_silence_len> <silence_thresh>/``
    and named ``<start>_<end>.<ext>``. A plot of the waveform with the detected
    ranges is saved as ``<save_path>/<min_silence_len> <silence_thresh>.png``.

    Args:
        audio_path: Directory containing the audio file; also the parent of
            the segment folder.
        audio_name: Audio file name; its extension is used as the pydub format.
        save_path: Existing directory the plot image is written to.
        min_silence_len: Minimum silence length passed to pydub's detector.
        silence_thresh: Silence threshold passed to pydub's detector, and also
            the gain target used before detection.
        keep_silence: Amount subtracted from each range start (same unit as the
            detected ranges, i.e. pydub milliseconds).

    Note:
        The plot is produced through ``wavread``, which uses the ``wave``
        module -- presumably only WAV input can be plotted; verify before
        passing other formats.
    """
    # read audio; the extension without its dot doubles as the pydub format
    audio_type = os.path.splitext(audio_name)[-1][1:]
    source_file = os.path.join(audio_path, audio_name)
    audio = AudioSegment.from_file(source_file, format = audio_type)

    # gain-adjusted copy used only for silence detection
    normalized_audio = match_target_amplitude(audio, silence_thresh)

    # create folder to store the result segments if it does not exist
    folder = os.path.join(audio_path, f"{min_silence_len} {silence_thresh}")
    os.makedirs(folder, exist_ok = True)

    # detect the [start, end] ranges that are not silent
    not_silence_ranges = pydub.silence.detect_nonsilent(
        normalized_audio,
        min_silence_len = min_silence_len,
        silence_thresh = silence_thresh,
        seek_step = 1,
    )

    # cut each slice from the original audio and save it
    last_idx = len(not_silence_ranges) - 1
    for idx, (range_start, range_end) in enumerate(not_silence_ranges):
        current_start_pos = max(0, range_start - keep_silence)
        if idx == last_idx:
            current_end_pos = round(range_end)
        else:
            # Non-final segments run up to where the next segment begins, so the
            # pause between two ranges stays attached to the earlier segment.
            current_end_pos = not_silence_ranges[idx + 1][0] - keep_silence

        segment = audio[current_start_pos:current_end_pos]

        # segment is too small
        if len(segment) <= 500:
            continue

        # name the segment using its time span
        file_name = f"{current_start_pos}_{current_end_pos}.{audio_type}"
        segment.export(os.path.join(folder, file_name), format = audio_type)

    # release the decoded audio before building the (large) plot
    del audio, normalized_audio

    # plot the splits on the graph; os.path.join keeps the path portable
    # instead of hard-coding a Windows "\\" separator
    figure_path = os.path.join(save_path, f"{min_silence_len} {silence_thresh}.png")
    create_plot(source_file, not_silence_ranges, figure_path)

In [30]:
#util functions

def time_trans(t):
    """Format a duration given in seconds as ``HH-MM-SS,mmm``.

    The value is first rounded to whole milliseconds and then split into
    fields, so minutes and seconds always stay within 0-59 and the
    millisecond field within 0-999 (a fraction that rounds up carries into
    the seconds).

    Args:
        t: Non-negative duration in seconds (int or float).

    Returns:
        The formatted string, e.g. ``"01-01-01,000"`` for 3661.
    """
    total_ms = int(round(t * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return "%02d-%02d-%02d,%03d" % (h, m, s, ms)

#adjust target amplitude
def match_target_amplitude(sound, target_dBFS):
    """Return ``sound`` with gain applied so its loudness equals ``target_dBFS``.

    Args:
        sound: Object exposing ``dBFS`` and ``apply_gain(gain)`` (a pydub
            ``AudioSegment`` in this notebook).
        target_dBFS: Desired loudness in dBFS.

    Returns:
        The gain-adjusted sound. A sound whose ``dBFS`` is negative infinity
        (pydub reports this for a completely silent segment) is returned
        unchanged, because the required gain would be infinite.
    """
    current_dBFS = sound.dBFS
    if current_dBFS == float("-inf"):
        # No finite gain can lift pure silence to the target level.
        return sound
    return sound.apply_gain(target_dBFS - current_dBFS)
    
def wavread(path):
    """Read a PCM WAV file and return the first channel with a time axis.

    Args:
        path: Path of the WAV file.

    Returns:
        A tuple ``(data, time, nchannels)``:
        ``data`` is a 1-D array with the samples of the first channel,
        ``time`` is a 1-D array of the same length holding each sample's
        offset in seconds, and ``nchannels`` is the channel count of the file.

    Raises:
        ValueError: If the sample width is not 1, 2 or 4 bytes.
    """
    # The context manager closes the file even if reading fails.
    with we.open(path, "rb") as wavfile:
        nchannels, sampwidth, framesra, frameswav = wavfile.getparams()[:4]
        datawav = wavfile.readframes(frameswav)

    # WAV stores 8-bit samples unsigned and wider samples as little-endian ints.
    sample_dtypes = {1: np.uint8, 2: "<i2", 4: "<i4"}
    if sampwidth not in sample_dtypes:
        raise ValueError(f"unsupported sample width: {sampwidth} bytes")
    datause = np.frombuffer(datawav, dtype = sample_dtypes[sampwidth])

    if nchannels > 1:
        # Samples are interleaved per frame; keep only the first channel.
        datause = datause.reshape(-1, nchannels)[:, 0]

    # Build the time axis from the samples actually read so both arrays
    # always have the same length.
    time = np.arange(0, datause.shape[0]) * (1.0/framesra)
    return datause,time,nchannels

def create_plot(path, not_silence_ranges, save_path):
    """Plot a WAV waveform with the detected range boundaries and save it.

    The waveform is drawn in green. Every value of ``not_silence_ranges`` is
    divided by 1000 and drawn as a vertical line: range starts in red, range
    ends in blue.

    Args:
        path: Path of the WAV file to plot (read with ``wavread``).
        not_silence_ranges: Sequence of ``[start, end]`` pairs. They are divided
            by 1000 to match the seconds-based time axis, i.e. they are
            presumably milliseconds as produced by pydub.
        save_path: File path the figure is written to.

    Side effects:
        Closes all open matplotlib figures and triggers a garbage collection,
        also when plotting or saving fails.
    """
    wavdata,wavtime,_ = wavread(path)

    try:
        # plot the sound wave
        plt.figure(figsize=(120, 20))
        plt.plot(wavtime,wavdata,color = 'green')
        plt.xlabel('Time')
        plt.ylabel('Amplitude')

        # plot the split points: flattening [[start, end], ...] puts starts on
        # even positions and ends on odd positions
        split_points = np.array(not_silence_ranges).flatten()/1000
        for position, split_point in enumerate(split_points):
            line_color = "red" if position % 2 == 0 else "blue"
            plt.axvline(split_point, color = line_color)

        plt.savefig(save_path)
    finally:
        # Always release the very large figure and the sample arrays, even if
        # plotting or saving raised.
        plt.clf()
        plt.close("all")
        del wavdata, wavtime
        gc.collect()
    

def test_normalize_first(audio_file):
    """Run the pipeline in the order normalize -> separate vocals -> split.

    Works on ``AUDIO_PATH/audio_file``. Intermediate audio goes to
    ``AUDIO_PATH/Normalized`` and plots to ``FIG_PATH/Normalized``. Splitting
    is repeated for every combination of ``MIN_SILENCE_LEN`` and
    ``SILENCE_THRESH``.

    Args:
        audio_file: Audio file name inside ``AUDIO_PATH``; must have an
            extension.

    Raises:
        ValueError: If ``audio_file`` has no extension.
    """
    # The stem is the text before the first "." so it matches the output name
    # that normalize() derives; the format is the real (last) extension.
    audio_name = audio_file.split('.')[0]
    audio_format = os.path.splitext(audio_file)[1][1:]
    if not audio_format:
        raise ValueError(f"audio file name has no extension: {audio_file!r}")

    nor_sep_folder = os.path.join(AUDIO_PATH, "Normalized")
    nor_sep_fig = os.path.join(FIG_PATH, "Normalized")
    # makedirs also creates missing parent directories and tolerates reruns.
    os.makedirs(nor_sep_folder, exist_ok = True)
    os.makedirs(nor_sep_fig, exist_ok = True)

    print("Normalizing...")
    normalize(AUDIO_PATH, audio_file, nor_sep_folder, audio_format)
    print("Done")

    print("Separating...")
    normalized_name = f"{audio_name}-normalized.{audio_format}"
    audio_path = os.path.join(nor_sep_folder, normalized_name)
    vocal_spearation(audio_path, nor_sep_folder)
    print("Done")

    print("Spliting")
    # NOTE(review): assumes the separation step writes "<stem>_Vocals.<ext>"
    # into the output folder -- confirm against inference.py.
    normalized_vocal_name = f"{audio_name}-normalized_Vocals.{audio_format}"
    for min_silence in MIN_SILENCE_LEN:
        print("Spliting with minimum silence: ", min_silence)
        for sil_thresh in SILENCE_THRESH:
            print("silence threshold: ", sil_thresh)
            split_audio(nor_sep_folder, normalized_vocal_name, nor_sep_fig, min_silence_len = min_silence, silence_thresh = sil_thresh)
    print("Done")

def test_voc_first(audio_file):
    """Run the pipeline in the order separate vocals -> normalize -> split.

    Works on ``AUDIO_PATH/audio_file``. Intermediate audio goes to
    ``AUDIO_PATH/Vocal`` and plots to ``FIG_PATH/Vocal``. Splitting is
    repeated for every combination of ``MIN_SILENCE_LEN`` and
    ``SILENCE_THRESH``.

    Args:
        audio_file: Audio file name inside ``AUDIO_PATH``; must have an
            extension.

    Raises:
        ValueError: If ``audio_file`` has no extension.
    """
    # The stem is the text before the first "." so it matches the output name
    # that normalize() derives; the format is the real (last) extension.
    audio_name = audio_file.split('.')[0]
    audio_format = os.path.splitext(audio_file)[1][1:]
    if not audio_format:
        raise ValueError(f"audio file name has no extension: {audio_file!r}")

    sep_nor_folder = os.path.join(AUDIO_PATH, "Vocal")
    sep_nor_fig = os.path.join(FIG_PATH, "Vocal")
    # makedirs also creates missing parent directories and tolerates reruns.
    os.makedirs(sep_nor_folder, exist_ok = True)
    os.makedirs(sep_nor_fig, exist_ok = True)
    audio_path = os.path.join(AUDIO_PATH, audio_file)

    print("Separating...")
    vocal_spearation(audio_path, sep_nor_folder)
    print("Done")

    print("Normalizing...")
    # NOTE(review): assumes the separation step writes "<stem>_Vocals.<ext>"
    # into the output folder -- confirm against inference.py.
    voc_file = f"{audio_name}_Vocals.{audio_format}"
    normalize(sep_nor_folder, voc_file, sep_nor_folder, audio_format)
    voc_normalized_name = f"{audio_name}_Vocals-normalized.{audio_format}"
    print("Done")

    print("Spliting")
    for min_silence in MIN_SILENCE_LEN:
        print("Spliting with minimum silence: ", min_silence)
        for sil_thresh in SILENCE_THRESH:
            print("silence threshold: ", sil_thresh)
            split_audio(sep_nor_folder, voc_normalized_name, sep_nor_fig, min_silence_len = min_silence, silence_thresh = sil_thresh)
    print("Done")
    

In [35]:
# --- Pipeline configuration -------------------------------------------------
# Machine-specific locations (raw strings keep the Windows separators readable).
DATA_PATH = r'E:\Graduate\2021-2022 Term 2\AIPI540\Individual Project\Data\videos'    # input videos
AUDIO_PATH = r'E:\Graduate\2021-2022 Term 2\AIPI540\Individual Project\Data\outputs'  # extracted / processed audio
FIG_PATH = r'E:\Graduate\2021-2022 Term 2\AIPI540\Individual Project\imgs\split'      # split-point plots

# Input selection.
video_name = "return1.mp4"
audio_format = "wav"

# Sample rate handed to ffmpeg and ffmpeg-normalize.
SAMPLE_RATE = 16000

# Parameter grids: every combination is passed to split_audio().
MIN_SILENCE_LEN = [1500]
SILENCE_THRESH = [-70]

def test():
    """Run the pipeline end to end on the configured video.

    Extracts the audio of ``video_name`` from ``DATA_PATH`` into
    ``AUDIO_PATH`` and then runs the "vocal separation first" variant on it.
    """
    print("Extracting audio..")
    extracted_file = extract_audio(
        DATA_PATH,
        video_name,
        AUDIO_PATH,
        audio_format,
    )
    print("Done")

    # Alternative ordering, currently disabled (normalize first, then separate):
    # print("Testing normalized first..")
    # test_normalize_first(audio_name)
    # print("Test Done")

    # Active ordering: separate the vocals first, then normalize.
    print("Testing vocal separation first..")
    test_voc_first(extracted_file)
    print("Test Done")

In [36]:
# Execute the pipeline defined by test() with the configuration set above.
test()

Extracting audio..
Done
Testing vocal separation first..
Separating...
Done
Normalizing...
Done
Spliting
Spliting with minimum silence:  1500
silence threshold:  -70
Done
Test Done
