# In [None]:
import ffmpeg
import librosa
import librosa.display
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
from pydub import AudioSegment
import subprocess
import tempfile

# Description: Displays the waveform of an audio signal in a new 16x4 figure
# Input: File Location (used only for the title), numpy.ndarray (audio), sampling rate
# Return: None
# Uses: Librosa, Matplotlib
def displayWaveplot(file, audio, sr):
    # librosa 0.10 removed display.waveplot in favour of display.waveshow.
    # Keep using waveplot where it still exists so older installs render
    # exactly as before, and fall back to waveshow otherwise.
    draw_wave = getattr(librosa.display, 'waveplot', None)
    if draw_wave is None:
        draw_wave = librosa.display.waveshow

    plt.figure(figsize=(16, 4))
    plt.title('Waveplot for {}'.format(file))
    draw_wave(audio, sr=sr)
    plt.tight_layout()

# Description: Plots the dB-scaled STFT magnitude of a signal in a new 16x4 figure
# Input: File Location (used only for the title), numpy.ndarray (audio), sampling rate
# Return: None
# Uses: Librosa, Matplotlib
def displaySpectrogram(file, audio, sr):
    # Magnitude of the complex STFT, converted to decibels for display
    magnitude = abs(librosa.stft(audio))
    decibels = librosa.amplitude_to_db(magnitude)

    heading = 'Spectrogram for {}'.format(file)
    plt.figure(figsize=(16, 4))
    plt.title(heading)
    librosa.display.specshow(decibels, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()
    plt.tight_layout()

# Description: Displays the chromagram of a signal in the top half of a new 16x8 figure
# Input: File Location (used only for the title), numpy.ndarray (audio), sampling rate
# Return: None
# Uses: Librosa, Matplotlib (reads the module-level hop_size and n_fft settings)
def displayChromagram(file, audio, sr):
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr, tuning=0, norm=2,
                                         hop_length=hop_size, n_fft=n_fft)

    plt.figure(figsize=(16, 8))
    plt.subplot(2, 1, 1)
    plt.title('Chroma Representation of {}'.format(file))
    # sr must be forwarded together with hop_length: specshow derives the
    # time axis from both, and otherwise assumes its own default sampling
    # rate, which mislabels the time axis for signals loaded at another rate.
    librosa.display.specshow(chroma, sr=sr, hop_length=hop_size,
                             x_axis='time', y_axis='chroma', cmap='gray_r')
    plt.colorbar()
    plt.tight_layout()

# Description: Displays the DTW warping path between two signals as red
#              connectors drawn between their stacked waveplots
# Input: numpy.ndarray (audio track of the video), numpy.ndarray (audio),
#        sampling rate, hop length in samples
# Return: None
# Uses: Librosa, Matplotlib, NumPy (reads the module-level n_fft setting)
def displayWarpingPath(video, audio, sr, hop_size):
    # Build the chroma features with the hop_size argument itself so that the
    # frame -> seconds conversion below always matches the analysis hop.
    video_chroma = librosa.feature.chroma_stft(y=video, sr=sr, tuning=0, norm=2,
                                               hop_length=hop_size, n_fft=n_fft)
    audio_chroma = librosa.feature.chroma_stft(y=audio, sr=sr, tuning=0, norm=2,
                                               hop_length=hop_size, n_fft=n_fft)

    # Align: each row of wp is (video frame, audio frame) because video is X
    _, wp = librosa.sequence.dtw(X=video_chroma, Y=audio_chroma, metric='cosine')

    # librosa 0.10 removed display.waveplot in favour of display.waveshow;
    # prefer waveplot when present so older installs render as before.
    draw_wave = getattr(librosa.display, 'waveplot', None)
    if draw_wave is None:
        draw_wave = librosa.display.waveshow

    fig, (ax_video, ax_audio) = plt.subplots(2, 1, figsize=(16, 8))

    # Top axes: audio track of the video (time axis is the default)
    draw_wave(video, sr=sr, ax=ax_video)
    ax_video.set(title='Video Waveplot')

    # Bottom axes: audio to be synchronised
    draw_wave(audio, sr=sr, ax=ax_audio)
    ax_audio.set(title='Audio Waveplot')

    # Settle the layout BEFORE converting data coordinates to figure
    # coordinates; the layout is not touched again afterwards so the
    # connectors stay anchored to the axes.
    plt.tight_layout()

    trans_figure = fig.transFigure.inverted()
    arrows = 100
    # Evenly spaced samples along the path. A platform int is used instead of
    # int16 so paths longer than 32767 frames do not overflow the index.
    points_idx = np.round(np.linspace(0, wp.shape[0] - 1, arrows)).astype(int)

    connectors = []
    for video_time, audio_time in wp[points_idx] * hop_size / sr:
        # Figure-space position of the matched instants on each axes
        start = trans_figure.transform(ax_video.transData.transform([video_time, 0]))
        end = trans_figure.transform(ax_audio.transData.transform([audio_time, 0]))

        # One red connector per sampled index pair
        connectors.append(matplotlib.lines.Line2D((start[0], end[0]),
                                                  (start[1], end[1]),
                                                  transform=fig.transFigure,
                                                  color='r'))

    fig.lines = connectors

# Description: Computes the chromagram used as the alignment feature
# Input: numpy.ndarray (audio), sampling rate
# Return: numpy.ndarray (Normalized energy for each chroma bin at each frame)
# Uses: Librosa (reads the module-level hop_size and n_fft settings)
def defineChromagram(audio, sr):
    # Framing parameters come from the script-wide configuration
    framing = dict(hop_length=hop_size, n_fft=n_fft)
    return librosa.feature.chroma_stft(y=audio, sr=sr, tuning=0, norm=2, **framing)

# Description: Runs the ffmpeg command line tool to combine the video stream
#              of one file with an audio file (video copied, audio encoded
#              as AAC at 160k); an existing output file is overwritten
# Input: Location of audio file, Location of Video File, Save Location
# Return: None
# Raises: subprocess.CalledProcessError if ffmpeg exits with a non-zero status
# Uses: ffmpeg (command line), subprocess
def combine(audio_file, video_file, save_location):
    cmd = [
        'ffmpeg', '-y',
        '-i', video_file,
        '-i', audio_file,
        '-map', '0:v',
        '-map', '1:a',
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-b:a', '160k',
        save_location,
    ]
    # check=True: a failed mux must not pass silently, otherwise the caller
    # goes on to report success without a valid output file.
    subprocess.run(cmd, check=True)

# Description: Runs the ffmpeg command line tool to extract the audio of a
#              video into save_location; an existing file is overwritten
# Input: Location of Video File, Save Location
# Return: None
# Raises: subprocess.CalledProcessError if ffmpeg exits with a non-zero status
# Uses: ffmpeg (command line), subprocess
def extract(video_file, save_location):
    cmd = ['ffmpeg', '-y', '-loglevel', 'quiet', '-i', video_file, save_location]
    # ffmpeg output is silenced above, so the exit status is the only failure
    # signal; check=True surfaces it instead of leaving an unusable file for
    # the caller to trip over later.
    subprocess.run(cmd, check=True)

# Vars (analysis settings shared by the functions above and the script below)
n_fft = 4410 # FFT window size passed to chroma_stft
hop_size = 512 # hop length passed to chroma_stft / used to convert frames to seconds
sampling_rate = 44100 # sampling rate both inputs are loaded at
duration_limit = 120 # maximum duration of audio-video clips used to synchronize in seconds

# CHANGE THESE!
audio_file = 'PATH TO AUDIO FILE'
video_file = 'PATH TO VIDEO FILE'
# NOTE: this placeholder is overwritten by a temporary .wav path when the
# video is loaded below, so its value here is never used.
video_audio_file = 'SAME AS video_file but .wav'
synced_audio = 'PATH TO SAVE SYNCED AUDIO . . . ex. synced_audio.wav'
save_file = 'PATH TO OUTPUT SYNCED AND COMBINED .MP4 FILE'

##############---------LOAD INPUT---------##############

# Load audio file
audio, _ = librosa.load(audio_file, sr=sampling_rate, mono=True, duration=duration_limit)

# Load video file: its audio track is extracted to a temporary .wav file,
# decoded, and the temporary file is then removed
handle, video_audio_file = tempfile.mkstemp(suffix='.wav')
os.close(handle)
try:
    extract(video_file, video_audio_file)
    video, _ = librosa.load(video_audio_file, sr=sampling_rate, mono=True, duration=duration_limit)
finally:
    # Clean up even when extraction or decoding fails, so no temp file leaks
    os.unlink(video_audio_file)

##############---------FEATURE EXTRACTION---------##############

# Chroma features for the audio first, then for the video's audio track
audio_chroma, video_chroma = [
    defineChromagram(signal, sampling_rate) for signal in (audio, video)
]

##############---------RQA---------##############

# Recurrence quantification analysis (RQA) on the audio/video cross-similarity
# matrix. The unpacking below treats the first path column as the video frame
# and the second as the audio frame.
xsim = librosa.segment.cross_similarity(audio_chroma, video_chroma, mode='affinity', metric='cosine')
# gap_onset / gap_extend are passed by keyword: newer librosa releases accept
# them as keyword-only arguments. np.inf forbids gaps in the matched path.
L_score, L_path = librosa.sequence.rqa(xsim, gap_onset=np.inf, gap_extend=np.inf, backtrack=True)

# Convert every matched frame pair to seconds and record the audio-minus-video
# offset for each pair
audio_times = []
video_times = []
diff_times = []
for v, a in L_path * hop_size / sampling_rate:
    A = float(a)
    V = float(v)
    audio_times.append(A)
    video_times.append(V)
    diff_times.append(A - V)
    
##############---------PLOTS---------##############
# The video plots are titled with the source video path: video_audio_file is
# only a randomly named temporary file that has already been deleted.
for label, signal in ((video_file, video), (audio_file, audio)):
    displayWaveplot(label, signal, sampling_rate)
    displaySpectrogram(label, signal, sampling_rate)
    displayChromagram(label, signal, sampling_rate)
displayWarpingPath(video, audio, sampling_rate, hop_size)
    
##############---------SYNC PROCESS---------##############

diff_times = np.array(diff_times)
if diff_times.size == 0:
    # Without any matched frames the offset would be nan and fail obscurely below
    raise ValueError('No alignment path was found between the audio and the video')

# Find mean of time differences
mean = np.average(diff_times)
std = np.std(diff_times)
# Keep only offsets within half a standard deviation of the mean. When the
# offset is constant (std == 0) the strict comparison rejects every sample,
# so fall back to all offsets instead of averaging an empty list (nan).
inliers = [d for d in diff_times if np.abs(d - mean) < (0.5 * std)]
diff_times = inliers if inliers else list(diff_times)
diff = np.average(diff_times)

# Setting move option
# True = move audio left (trim)...False = move audio right (pad with silence)
move = bool(diff > 0)

# Sync
# NOTE: from here on `audio` is the pydub segment, no longer the numpy signal
audio = AudioSegment.from_wav(audio_file)
shift_ms = abs(float(diff)) * 1000

if move:
    # Trim diff seconds from beginning
    final = audio[shift_ms:]
else:
    # Add diff seconds of silence to beginning, generated at the audio's own
    # frame rate so that concatenation does not resample the recording
    silence = AudioSegment.silent(duration=shift_ms, frame_rate=audio.frame_rate)
    final = silence + audio

# Export synced audio
final.export(synced_audio, format='wav')

##############---------COMBINE PROCESS---------##############
# Mux the shifted audio with the original video stream, then report completion
combine(audio_file=synced_audio, video_file=video_file, save_location=save_file)
print("Successfully synced and combined")