In [None]:
# Initial installation/preparation steps. Only need to be run once per environment.
!pip install librosa boto3 requests tqdm opencv-python torch nltk pytorch_pretrained_biggan tensorflow-addons ftfy
!git clone https://github.com/huggingface/pytorch-pretrained-BigGAN.git # Get biggan repo

# If used on a GPU environment, make sure to not install default pytorch but pytorch for cuda.

In [None]:
!git clone https://github.com/huggingface/pytorch-pretrained-BigGAN.git # Get biggan repo

In [None]:
# Fetch the stable-diffusion-tensorflow repository; its contents are moved into the
# working directory by the next cell so that `stable_diffusion_tf` can be imported.
!git clone https://github.com/divamgupta/stable-diffusion-tensorflow #get Stable diffusion

In [None]:
# Move the contents of the cloned repository up into the destination folder.
import os
import shutil

source = '/content/stable-diffusion-tensorflow'
destination = '/content'

# Gather all entries to move. A later cell deletes `source`, so when this cell is
# re-run afterwards there is nothing left to move (instead of raising FileNotFoundError).
allfiles = os.listdir(source) if os.path.isdir(source) else []

# Move every entry to the destination folder, keeping its name.
for f in allfiles:
    shutil.move(os.path.join(source, f), os.path.join(destination, f))


In [None]:
# Delete the old (now empty) folder; skipped when it is already gone so the cell can be re-run.
if os.path.isdir(source):
    os.rmdir(source)

In [None]:
import torch
import matplotlib.pyplot as plt
import numpy as np
import sys
import nltk
from tqdm import tqdm
import cv2 as cv
import librosa
from sklearn.decomposition import PCA, KernelPCA
sys.path.append("./pytorch-pretrained-BigGAN")
from pytorch_pretrained_biggan import (BigGAN, one_hot_from_names, truncated_noise_sample)
from stable_diffusion_tf.stable_diffusion import StableDiffusion
from PIL import Image

# Pretrained BigGAN generator; `truncation` is also handed to the model at generation time.
model = BigGAN.from_pretrained('biggan-deep-512')
truncation = 0.5

# Fetch the NLTK corpora before the class names are turned into one-hot vectors
# (presumably one_hot_from_names looks the names up through WordNet - confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')
class_labels = ['soap bubble', "eagle", "goldfish"]
class_vector = one_hot_from_names(class_labels, batch_size=len(class_labels))

# Single starting noise vector, sampled with a wider truncation than the one used for generation.
noise_vector = truncated_noise_sample(truncation=(truncation * 3), batch_size=1)
print(noise_vector.shape)  # one row (batch_size=1) of 128 noise values

# Use the GPU when there is one, otherwise stay on the CPU instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Audio track that drives the animation.
filename = "sound.mp3"
sound_data, sampling_rate = librosa.load(filename)

In [None]:
# Video frame rate: every video frame gets its own slice of audio and its own spectrum.
FPS = 30

seconds = len(sound_data)/sampling_rate
video_frame_count = int(seconds*FPS)
samples_per_frame = int(sampling_rate/FPS)

frequency_samples = []
# The `+1` keeps the last complete window. Without it the final window is dropped whenever
# the audio length is an exact multiple of samples_per_frame, which leaves fewer spectra
# than video frames.
for i in range(0, sound_data.shape[0]-samples_per_frame+1, samples_per_frame):
    X = np.fft.rfft(sound_data[i:samples_per_frame+i])
    # log10(0) yields -inf for silent bins; that is expected and mapped to -50 just below
    with np.errstate(divide='ignore'):
        Xdb = 20*np.log10(np.abs(X))
    frequency_samples.append(np.nan_to_num(Xdb, nan=0, posinf=50, neginf=-50))
frequency_samples = np.array(frequency_samples)

# Scale by the loudest value found in any window
max_db = np.max(frequency_samples)
frequency_samples /= max_db
print(max_db)
frequency_samples.shape, video_frame_count

In [None]:
# Min-max rescale the spectra to [0, 1] before the kernel PCA.
min_frequency = frequency_samples.min()
max_frequencies = frequency_samples.max()
normed_frequencies = (frequency_samples - min_frequency) / (max_frequencies - min_frequency)

# Project every window's spectrum onto 128 components.
pca = KernelPCA(n_components=128, kernel="rbf", gamma=0.1)
transformed = pca.fit_transform(normed_frequencies)

# Global min-max rescale, then centre every component on zero and double the amplitude.
t_min = transformed.min()
t_max = transformed.max()
transformed = (transformed - t_min) / (t_max - t_min)
transformed = (transformed - transformed.mean(axis=0)) * 2

low_frequencies = transformed

In [None]:
# Number of frames spent morphing between each pair of consecutive classes. Capped by the
# number of audio spectra so that the walk can never index past the end of low_frequencies.
usable_frames = min(video_frame_count, len(low_frequencies))
frames_per_class = usable_frames//(len(class_labels)-1)
classes_from = class_vector[:-1]
classes_to = class_vector[1:]

# Run on whatever device the model already lives on instead of assuming CUDA.
model_device = next(model.parameters()).device

# Walk on a copy so that re-running this cell starts from the same initial noise.
walk_noise = noise_vector.copy()

i = 0
images = []
with tqdm(total=frames_per_class*len(classes_from)) as progress:
    for class_from, class_to in zip(classes_from, classes_to):
        # Linear blend from one class vector to the next
        interpolations = np.linspace(class_from, class_to, frames_per_class)
        for interpolation in interpolations:
            # Push the noise along the current audio features, keep it in [-1, 1], then damp it
            walk_noise += low_frequencies[i]*0.9
            walk_noise = np.clip(walk_noise, -1, 1)
            walk_noise *= 0.9
            i += 1
            torch_noise = torch.from_numpy(np.float32(walk_noise)).to(model_device)
            torch_class = torch.from_numpy(np.asarray([interpolation], dtype=np.float32)).to(model_device)
            with torch.no_grad():
                output = model(torch_noise, torch_class, truncation)
            # Keep the single generated image on the CPU; it is rescaled for display in a later cell
            images.append(output[0].to("cpu"))
            progress.update(1)
    

In [None]:
def _to_unit_rgb(chw_tensor):
    """Reorder a tensor's axes with permute(1, 2, 0), min-max scale it to [0, 1] and resize to 512x512."""
    hwc = chw_tensor.permute(1, 2, 0).numpy()
    lowest, highest = np.min(hwc), np.max(hwc)
    # scale img to 512x512
    return cv.resize((hwc - lowest) / (highest - lowest), (512, 512))


# One rescaled array per generated image, in generation order
rgb_images = [_to_unit_rgb(frame) for frame in images]

In [None]:
# Stable Diffusion pipeline configured for 512x512 images.
# jit_compile=True is worth trying as well (different performance profile).
generator = StableDiffusion(img_height=512, img_width=512, jit_compile=False)

In [None]:
from tqdm import tqdm

In [None]:
# Create the video: every BigGAN frame is re-rendered by Stable Diffusion and appended to the file.
MAX_FRAMES = 100  # only this many leading frames are processed
video_writer = cv.VideoWriter('sound_walk.avi', cv.VideoWriter_fourcc(*'MJPG'), 30, (512, 512))
sd_img = []

try:
    for img in tqdm(rgb_images[:MAX_FRAMES]):
        # rgb_images holds floats in [0, 1]; the generator is given 0-255 values and the
        # channels stay in RGB order (converting to BGR here fed it channel-swapped pictures).
        sd_batch = generator.generate(
            seed=42,
            prompt="LSD",
            num_steps=3,
            unconditional_guidance_scale=0,
            temperature=0.5,
            batch_size=1,
            input_image=img * 255,
            input_image_strength=0.999
        )
        # The result is a batch; with batch_size=1 the picture is sd_batch[0].
        # NOTE(review): assumes generate() returns uint8 RGB in 0-255 (Image.fromarray below
        # needs exactly that), so multiplying by 255 again would overflow - confirm.
        frame = np.asarray(sd_batch[0])
        sd_img.append(sd_batch)

        # The writer takes one single frame in OpenCV's BGR channel order
        video_writer.write(cv.cvtColor(frame, cv.COLOR_RGB2BGR))

        pil_img = Image.fromarray(frame)
        display(pil_img)
finally:
    # Always finalise the file, even when generation fails part-way through
    video_writer.release()

In [None]:
# Combine the silent video with the original audio track.
from moviepy.editor import AudioFileClip, CompositeAudioClip, VideoFileClip

videoclip = VideoFileClip("sound_walk.avi")
audioclip = AudioFileClip("sound.mp3")
try:
    # The video may cover only part of the track, so cut the audio to the shorter duration
    clip_length = min(videoclip.duration, audioclip.duration)
    new_audioclip = CompositeAudioClip([audioclip.subclip(0, clip_length)])
    videoclip.audio = new_audioclip
    videoclip.write_videofile("sound_walk_new.mp4")
finally:
    # Close both clips so the files they opened are released
    videoclip.close()
    audioclip.close()