In [None]:
# Initial installation/preparation steps. Only need to be run once per environment.
!pip install librosa boto3 requests tqdm opencv-python torch nltk pytorch_pretrained_biggan tensorflow-addons ftfy ffmpeg-python
!git clone https://github.com/huggingface/pytorch-pretrained-BigGAN.git # Get biggan repo

# If used on a GPU environment, make sure to not install default pytorch but pytorch for cuda.

In [None]:
# Clone the BigGAN repository into the working directory; the import cell
# appends ./pytorch-pretrained-BigGAN to sys.path.
!git clone https://github.com/huggingface/pytorch-pretrained-BigGAN.git # Get the BigGAN repo

In [None]:
# Clone the Stable Diffusion (TensorFlow) repository; a later cell moves its
# contents up into /content (presumably so that `stable_diffusion_tf` becomes importable — TODO confirm).
!git clone https://github.com/PLEXATIC/stable-diffusion-tensorflow-digcrea # Get Stable Diffusion

In [None]:
# Move the contents of the cloned Stable Diffusion repository up into the working directory.
import os
import shutil

source = '/content/stable-diffusion-tensorflow-digcrea'
destination = '/content'


def move_contents(src_dir, dst_dir):
    """Move every entry of ``src_dir`` into ``dst_dir`` and return the moved names.

    The operation is safe to repeat:

    * a missing ``src_dir`` is treated as "nothing to do" and returns ``[]``;
    * an entry that already exists in ``dst_dir`` is kept as it is and the
      duplicate in ``src_dir`` is discarded. Without this, ``shutil.move``
      would place a directory *inside* the same-named existing directory.

    After the call ``src_dir`` (if it exists) is empty, so it can be removed
    with ``os.rmdir``.
    """
    if not os.path.isdir(src_dir):
        return []

    moved = []
    for name in os.listdir(src_dir):
        src_path = os.path.join(src_dir, name)
        dst_path = os.path.join(dst_dir, name)
        if os.path.lexists(dst_path):
            # Already moved by an earlier run: drop the fresh duplicate.
            if os.path.isdir(src_path) and not os.path.islink(src_path):
                shutil.rmtree(src_path)
            else:
                os.remove(src_path)
            continue
        shutil.move(src_path, dst_path)
        moved.append(name)
    return moved


move_contents(source, destination)

In [None]:
# Delete the now-empty clone directory. The check lets this cell be re-run
# after the directory has already been removed.
if os.path.isdir(source):
    os.rmdir(source)

In [None]:
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import sys
import nltk
from tqdm import tqdm
import cv2 as cv
import librosa
from sklearn.decomposition import PCA, KernelPCA
sys.path.append("./pytorch-pretrained-BigGAN")
from pytorch_pretrained_biggan import (BigGAN, one_hot_from_names, truncated_noise_sample)
from stable_diffusion_tf.stable_diffusion import StableDiffusion
from PIL import Image
import math

# Download the WordNet corpora through NLTK (presumably required by
# one_hot_from_names to resolve the class label names — TODO confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Tunable parameters for the audio analysis and the image generation.
truncation = 0.35 # Truncation value passed to truncated_noise_sample and to the BigGAN model.
extra_detail = 0.6 # Higher value = more detail. Valid range: [0, 1).
max_frequency_level = 11000 # All frequencies higher than this will not be considered.
low_frequency_skip = 16 # Skip the first n hertz.
frequency_band_growth_rate = 1.015 # Factor by which the width of each successive frequency band grows.
smoothing_factor = 0.1 # How much the noise will be smoothed. 0 = no smoothing, 1 = full smoothing
iterations = 2 # How many times to apply the smoothing algorithm. Higher value = more smoothing
debug = False # Whether or not to display the weights for the weighted sum at each timestep.

# Free and royalty-free music from pixabay.com
class_labels = ['soap bubble', "mushroom"] # List of labels from ImageNet. See https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a for a full list.
filename = "sound.wav" # Audio file that drives the visualisation.

In [None]:
# Load the pretrained BigGAN generator and place it on the GPU
# (Module.to returns the module itself, so the call can be chained).
model = BigGAN.from_pretrained('biggan-deep-512').to('cuda')
# One class vector per requested label.
class_vector = one_hot_from_names(class_labels, batch_size=len(class_labels))
# Noise shape is 128

# Read the audio at its native sampling rate (sr=None) and keep only the
# first 10 seconds of samples.
sound_data, sampling_rate = librosa.load(filename, sr=None)
sound_data = sound_data[:10 * sampling_rate]

In [None]:
# Timing of the analysis: the video runs at 30 frames per second.
seconds = len(sound_data) / sampling_rate
# Number of video frames needed to cover the audio (math.ceil already returns an int).
video_frame_count = math.ceil(30 * seconds)
# Hop between consecutive analysis windows: one video frame worth of samples.
step_size = math.ceil(sampling_rate / 30)
# Each analysis window spans one second of audio.
samples_per_frame = sampling_rate

# Weights for the repeated softmax passes in the frequency analysis.
softmax_rep_factors = [1]

def softmax(x):
    """Return the softmax of ``x`` taken along its first axis.

    The maximum is subtracted before exponentiating, which leaves the result
    mathematically unchanged but prevents ``np.exp`` from overflowing to
    ``inf`` (and the quotient from becoming ``nan``) for large inputs.

    Parameters
    ----------
    x : array_like
        Input scores; for the 1-D arrays used in this notebook the result
        sums to 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x``. An empty input is returned as an
        empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)
    exponentials = np.exp(x - np.max(x, axis=0))
    return exponentials / np.sum(exponentials, axis=0)

# Frequency analysis: for every video frame, turn a window of audio into one
# weight per frequency band.
smoothing_level = 1

# The band boundaries depend only on the configuration, not on the audio
# frame, so they are computed once instead of once per frame.
max_freq = min(max_frequency_level, sampling_rate//2)
spectrum_bins = samples_per_frame//2 + 1  # length of np.fft.rfft output for samples_per_frame input samples
band_edges = []
sum_range = 10
precise_sum_range = 1.0
sum_start = low_frequency_skip
while sum_range*frequency_band_growth_rate + sum_start < max_freq:
    # The band width grows geometrically; the float keeps the exact width,
    # the int is the number of spectrum bins actually used.
    precise_sum_range *= frequency_band_growth_rate
    sum_range = int(precise_sum_range)
    new_index = sum_start + sum_range
    # Only bands that cover more than one spectrum bin contribute a value
    # (the bounds are clamped the same way slicing the spectrum would be).
    if min(new_index, spectrum_bins) - min(sum_start, spectrum_bins) > 1:
        band_edges.append((sum_start, new_index))
    sum_start = new_index

frequency_samples = []
for frame_start in range(0, sound_data.shape[0]-(samples_per_frame+smoothing_level), step_size):
    spectrum = np.fft.rfft(sound_data[frame_start:frame_start+samples_per_frame])
    level_db = 20*np.log10(np.abs(spectrum))

    # Weighted sum of repeated softmax passes; after each pass the current
    # maximum is pushed far down so that the next pass favours other bins.
    bin_weights = np.zeros_like(level_db)
    softmax_xdb = level_db.copy()
    for softmax_factor in softmax_rep_factors:
        softmax_xdb = softmax(softmax_xdb)
        bin_weights += softmax_xdb * softmax_factor
        softmax_xdb[np.argmax(softmax_xdb)] = -10e3
    # An exponent below 1 lifts small weights, i.e. brings out more detail.
    bin_weights = bin_weights ** (1-extra_detail)

    frequency_sums = [np.mean(bin_weights[band_start:band_end]) for band_start, band_end in band_edges]
    frequency_samples.append(np.nan_to_num(np.array(frequency_sums), nan=0, posinf=50, neginf=-50))

if not frequency_samples:
    # Without this check the failure only surfaces later as an IndexError on
    # frequency_samples.shape[1].
    raise ValueError("The audio is too short: it must be longer than one analysis window (samples_per_frame samples).")
frequency_samples = np.array(frequency_samples)

# Smoothing the noise
# One noise vector is drawn per frequency band (columns of frequency_samples).
noise_vector = truncated_noise_sample(truncation=truncation, batch_size=frequency_samples.shape[1])

# Order the vectors as a nearest-neighbour chain: starting from the first
# vector, repeatedly pick the remaining vector closest to the previous pick.
# Distance is 1 - correlation. Taking argmin over the raw correlation would
# select the *least* similar vector, which contradicts "closest".
reference_noise = noise_vector[0]
original_noises = list(noise_vector)
sorted_noises = []
while original_noises:
    distances = [1.0 - np.corrcoef(reference_noise, candidate)[0, 1] for candidate in original_noises]
    closest_noise_index = int(np.argmin(distances))
    reference_noise = original_noises.pop(closest_noise_index)
    sorted_noises.append(reference_noise)
noise_vector = sorted_noises

# Blend every vector with its (already smoothed) predecessor; repeating the
# pass increases the amount of smoothing.
for _ in range(iterations):
    for i in range(1, len(noise_vector)):
        noise_vector[i] = noise_vector[i-1] * smoothing_factor + noise_vector[i] * (1-smoothing_factor)

In [None]:
## Actually generate biggan video
# For every analysed audio frame: interpolate between consecutive class
# vectors, build the noise vector as a weighted sum of the per-band noise
# vectors, and run BigGAN. The raw output tensors are collected in `images`.
if len(class_vector) > 1:
    classes_from = class_vector[:-1]
    classes_to = class_vector[1:]
else:
    # A single class has nothing to morph into: keep that class for every
    # frame instead of dividing by zero below.
    classes_from = classes_to = class_vector
frames_per_class = video_frame_count//len(classes_from)

i = 0
images = []
with tqdm(total=len(frequency_samples)) as progress:
    for class_from, class_to in zip(classes_from, classes_to):
        interpolations = np.linspace(class_from, class_to, frames_per_class)
        for interpolation in interpolations:
            # There can be fewer analysed audio frames than video frames.
            if i >= len(frequency_samples):
                break
            interpolation_factors = frequency_samples[i]

            # Take weighted sum of noise vectors using interpolation factors
            final_noise = np.zeros_like(noise_vector[0])
            for band_noise, interpolation_factor in zip(noise_vector, interpolation_factors):
                final_noise += band_noise * interpolation_factor
            noise_vec = np.clip(final_noise, -1, 1)
            i += 1

            torch_noise = torch.from_numpy(np.float32(noise_vec)).unsqueeze(0).to('cuda')
            torch_class = torch.from_numpy(np.array([interpolation])).to('cuda')
            with torch.no_grad():
                output = model(torch_noise, torch_class, truncation)
            # Keep the image on the CPU; it is normalised to [0, 1] when the
            # RGB frames are built.
            images.append(output[0].to("cpu"))
            progress.update(1)

In [None]:
# Convert every generated tensor into a 512x512 channel-last image whose
# values are min-max scaled to [0, 1].
rgb_images = []
for tensor in images:
    pixels = tensor.permute(1, 2, 0).numpy()
    darkest = np.min(pixels)
    brightest = np.max(pixels)
    scaled = (pixels - darkest) / (brightest - darkest)
    rgb_images.append(cv.resize(scaled, (512, 512)))

In [None]:
# Stable Diffusion generator for 512x512 images.
# jit_compile=True can be tried as well (different performance profile).
generator = StableDiffusion(img_height=512, img_width=512, jit_compile=False)

In [None]:
# Create the video: every `interpolation_steps`-th BigGAN frame is restyled
# with Stable Diffusion, and the frames in between are cross-faded.
video_writer = cv.VideoWriter('sound_walk.avi', cv.VideoWriter_fourcc(*'MJPG'), 30, (512, 512))
sd_img = []

interpolation_steps = 5
interpolation_factors = np.linspace(0, 1, interpolation_steps)
previous_img = None
try:
    for img in tqdm(rgb_images[::interpolation_steps]):
        # rgb_images holds values in [0, 1]; scale to [0, 255] for the generator.
        img = img * 255.0
        #create stable diffusion img
        img = generator.generate(
            seed=42,
            prompt="Photorealistic, epic, focused, sharp, cinematic lighting, 4k, 8k, octane rendering, legendary, fantasy, trippy, LSD",
            num_steps=10,
            unconditional_guidance_scale=7.5,
            temperature=0.0,
            batch_size=1,
            input_image=img,
            input_image_strength=0.5
        )[0]
        # NOTE(review): assumes generate() returns a uint8 HxWx3 RGB array with
        # values in 0..255 (Image.fromarray needs that for colour images) — confirm.
        display(Image.fromarray(img))
        img = cv.cvtColor(img, cv.COLOR_RGB2BGR)

        if previous_img is not None and interpolation_steps > 1:
            # Cross-fade from the previous key frame to the current one. The
            # last factor is 1.0, so the key frame itself is written twice;
            # this keeps `interpolation_steps` video frames per key frame.
            for f in interpolation_factors[1:]:
                interpolated_image = (f*img + (1-f)*previous_img).astype(np.uint8)
                video_writer.write(interpolated_image)
        previous_img = img.copy()
        # The frame is already uint8 in 0..255 and is written as it is.
        # `-(img * 255).astype(np.uint8)` gave the same pixels only because
        # uint8 arithmetic wraps around twice.
        video_writer.write(img)
        # Keep the BGR key frame itself; `img * 255` on uint8 data stored
        # wrapped-around values.
        sd_img.append(img)
except KeyboardInterrupt:
    # Stopping early is fine: the frames written so far remain a valid video.
    print("interrupted")
except Exception as error:
    # Keep going so the partial video is finalised, but report what went wrong.
    print(f"interrupted or failed: {error!r}")
finally:
    video_writer.release()

In [None]:
import ffmpeg
import os
# Combine the generated video with the audio track that was analysed.
input_video = ffmpeg.input('sound_walk.avi')
# Use the configured audio file so that changing `filename` in the
# configuration cell also changes the soundtrack of the result.
input_audio = ffmpeg.input(filename)
result_name = "final_result_n.mp4"
# NOTE(review): the analysis only uses the first 10 seconds of audio while the
# whole file is muxed here, so the audio may run longer than the video — confirm
# whether the audio should be trimmed.
# overwrite_output=True replaces a result left over from a previous run.
ffmpeg.concat(input_video, input_audio, v=1, a=1).output(result_name).run(overwrite_output=True)