In [None]:
# Initial installation/preparation steps. Only need to be run once per environment.

#!pip install librosa boto3 requests tqdm opencv-python torch scikit-learn matplotlib "moviepy<2"
#!git clone https://github.com/huggingface/pytorch-pretrained-BigGAN.git # Get biggan repo

# On a GPU machine, install the CUDA-enabled PyTorch build instead of the default package.

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import sys
from tqdm import tqdm
import cv2 as cv
import librosa
from sklearn.decomposition import PCA, KernelPCA
sys.path.append("./pytorch-pretrained-BigGAN")
from pytorch_pretrained_biggan import (BigGAN, one_hot_from_names, truncated_noise_sample)

# --- Generator -------------------------------------------------------------
# Load the pretrained BigGAN-deep checkpoint and move it to the GPU.
model = BigGAN.from_pretrained('biggan-deep-512')
model.to('cuda')

# --- Generator inputs ------------------------------------------------------
# The video morphs through these classes in the order listed.
truncation = 0.5
class_labels = ['soap bubble', "eagle", "goldfish"]
class_vector = one_hot_from_names(
    class_labels,
    batch_size=len(class_labels),
)
# A single latent sample (batch_size=1); per the original note its size is 128.
noise_vector = truncated_noise_sample(
    truncation=truncation,
    batch_size=1,
)

# --- Audio -----------------------------------------------------------------
# The waveform and its sampling rate drive the latent walk in later cells.
filename = "riddle.wav"
sound_data, sampling_rate = librosa.load(filename)

In [None]:
# Slice the audio into one chunk per video frame (30 frames per second) and
# compute the magnitude spectrum of every chunk in decibels.
seconds = len(sound_data) / sampling_rate
video_frame_count = int(seconds * 30)
samples_per_frame = int(sampling_rate / 30)

# Start offsets of every complete chunk. The "+ 1" on the stop value keeps the
# last full chunk when the audio length is an exact multiple of
# samples_per_frame; without it the spectra could end up one row short of
# video_frame_count and the frame-generation loop would index past the end.
chunk_starts = range(0, sound_data.shape[0] - samples_per_frame + 1, samples_per_frame)

frequency_samples = []
for start in chunk_starts:
    spectrum = np.fft.rfft(sound_data[start:start + samples_per_frame])
    spectrum_db = 20 * np.log10(np.abs(spectrum))
    # Empty bins give log10(0) = -inf; clamp non-finite values to fixed levels.
    frequency_samples.append(np.nan_to_num(spectrum_db, nan=0, posinf=50, neginf=-50))
frequency_samples = np.array(frequency_samples)

# Scale by the loudest bin found anywhere in the clip.
max_db = np.max(frequency_samples)
frequency_samples /= max_db
print(max_db)
frequency_samples.shape, video_frame_count

In [None]:
# Rescale the spectra to [0, 1] using the global minimum and maximum.
min_frequency = frequency_samples.min()
max_frequencies = frequency_samples.max()
normed_frequencies = (frequency_samples - min_frequency) / (max_frequencies - min_frequency)

# Project every frame's spectrum onto 128 kernel-PCA components
# (presumably chosen to match the size of the generator's noise vector).
pca = KernelPCA(n_components=128, kernel="rbf", gamma=0.1)
transformed = pca.fit_transform(normed_frequencies)

# Min-max scale the projection globally, centre each component around zero
# over time, then double the amplitude.
t_min, t_max = transformed.min(), transformed.max()
transformed = (transformed - t_min) / (t_max - t_min)
transformed = 2 * (transformed - transformed.mean(axis=0))

low_frequencies = transformed

In [None]:
# Generate one image per video frame: the class vector is interpolated linearly
# between consecutive labels while the noise vector is pushed around by the
# audio-derived offsets in low_frequencies.
if len(class_labels) < 2:
    raise ValueError("At least two class labels are needed to interpolate between classes.")

# Number of frames spent on each transition between two neighbouring classes.
frames_per_class = video_frame_count // (len(class_labels) - 1)
classes_from = class_vector[:-1]
classes_to = class_vector[1:]

# Work on a private copy so that re-running this cell always starts from the
# same initial noise instead of continuing from where the previous run ended.
walk_noise = np.array(noise_vector, dtype=np.float32)
# Highest valid row of low_frequencies; used to clamp the lookup below.
last_audio_row = len(low_frequencies) - 1

i = 0
images = []
with tqdm(total=frames_per_class * len(classes_from), desc="Generating frames") as progress:
    for class_from, class_to in zip(classes_from, classes_to):
        interpolations = np.linspace(class_from, class_to, frames_per_class)
        for interpolation in interpolations:
            # Reuse the final spectrum if there are fewer spectra than frames
            # rather than failing with an IndexError on the last frames.
            audio_offset = low_frequencies[min(i, last_audio_row)]
            # Nudge the noise by the audio offset, keep it inside [-1, 1],
            # then damp it so that it decays when the audio is quiet.
            walk_noise += audio_offset * 0.9
            np.clip(walk_noise, -1, 1, out=walk_noise)
            walk_noise *= 0.9
            i += 1

            torch_noise = torch.from_numpy(np.float32(walk_noise)).to('cuda')
            torch_class = torch.from_numpy(np.array([interpolation])).to('cuda')
            with torch.no_grad():
                output = model(torch_noise, torch_class, truncation)
            # Keep the raw channel-first tensor on the CPU; scaling to a
            # displayable range happens when the video is written.
            images.append(output.to("cpu")[0])
            progress.update(1)
    

In [None]:
# Convert the generated tensors into RGB frames scaled to [0, 1]. Every frame
# is min-max normalised on its own.
rgb_images = []
for img in images:
    # Channel-first tensor -> channel-last array, as expected by OpenCV.
    img = img.permute(1, 2, 0).numpy()
    max_val = np.max(img)
    min_val = np.min(img)
    value_range = max_val - min_val
    if value_range > 0:
        img = (img - min_val) / value_range
    else:
        # A constant frame would divide by zero; emit a black frame instead.
        img = np.zeros_like(img)
    rgb_images.append(img)

# OpenCV expects the frame size as (width, height) and silently drops frames of
# any other size, so derive it from the data (512x512 if there are no frames).
if rgb_images:
    frame_size = (rgb_images[0].shape[1], rgb_images[0].shape[0])
else:
    frame_size = (512, 512)

video_writer = cv.VideoWriter('sound_walk.avi', cv.VideoWriter_fourcc(*'MJPG'), 30, frame_size)
try:
    # VideoWriter does not raise when the file or codec cannot be opened.
    if not video_writer.isOpened():
        raise RuntimeError("Could not open 'sound_walk.avi' for writing (is the MJPG codec available?).")
    for img in rgb_images:
        bgr = cv.cvtColor(img, cv.COLOR_RGB2BGR)
        video_writer.write((bgr * 255).astype(np.uint8))
finally:
    # Always finalise the file, even if writing a frame fails.
    video_writer.release()

In [None]:
# Attach the source audio to the silent video written by the previous cell.
# Inputs are the files this notebook actually uses: 'sound_walk.avi' (written
# by the video cell) and the audio file named by `filename`.

from moviepy.editor import AudioFileClip, CompositeAudioClip, VideoFileClip

videoclip = VideoFileClip("sound_walk.avi")
audioclip = AudioFileClip(filename)
try:
    new_audioclip = CompositeAudioClip([audioclip])
    videoclip.audio = new_audioclip
    videoclip.write_videofile("sound_walk.mp4")
finally:
    # Release the reader processes and file handles held by both clips.
    videoclip.close()
    audioclip.close()