# StyleGan2 Audio Reactive

This notebook generates a StyleGAN2 video that reacts to the separated audio tracks (stems) of a song.

Author : [@sioudayassin](https://twitter.com/SioudaYassin)

The latent generation code is from [@robertluxemburg](https://twitter.com/robertluxemburg)



License : [CC BY-NC 4.0 ](https://creativecommons.org/licenses/by-nc/4.0/deed.fr)


**Main Libs used :**
- [StyleGAN2](https://github.com/NVlabs/stylegan2)
- [Spleeter](https://github.com/deezer/spleeter) 



# **READ CAREFULLY BEFORE LAUNCHING**

To use the generator you have to host the model files yourself on Google Drive.

Once they are uploaded, edit the cell **Double click me to add the path to your model files** and set the Google Drive path of each model file.



Here are the download links for the models used in this Colab notebook:


**Flickr Faces**

- Repo : https://github.com/justinpinkney/awesome-pretrained-stylegan2

- Download : http://d36zk2xti64re0.cloudfront.net/stylegan2/networks/stylegan2-ffhq-config-e.pkl

**Cats**

- Repo : https://github.com/justinpinkney/awesome-pretrained-stylegan2

- Download : http://d36zk2xti64re0.cloudfront.net/stylegan2/networks/stylegan2-cat-config-f.pkl



**Abstract Art**

- Repo : https://github.com/justinpinkney/awesome-pretrained-stylegan2

- Download : https://drive.google.com/uc?id=1YzZemZAp7BVW701_BZ7uabJWJJaS2g7v


**Fursona**
- Repo : https://github.com/justinpinkney/awesome-pretrained-stylegan2
- Download : https://thisfursonadoesnotexist.com/model/network-e621-r-512-3194880.pkl


**Painting Faces**
- Repo : https://github.com/justinpinkney/awesome-pretrained-stylegan2#faces-FFHQ-config-f-512x512
- Download : https://drive.google.com/uc?id=1H-MYFZqngF1R0whm4bc3fEoX7VvOWaDl


**Ukiyoe faces**
- Repo : https://github.com/justinpinkney/awesome-pretrained-stylegan2#faces-FFHQ-config-f-512x512
- Download : https://drive.google.com/uc?id=1_QysUKfed1-_x9e5off2WWJKp1yUcidu






# Audio Reactive Style GAN 2




In [None]:
#@title #**Setup stylegan requirements** { display-mode: "form" }
%tensorflow_version 1.x
import tensorflow as tf
!pip install -U kora
from kora.drive import upload_public

# Download the code
!git clone https://github.com/NVlabs/stylegan2.git
%cd stylegan2
!pip install wget
!mkdir model
!nvcc test_nvcc.cu -o test_nvcc -run
#!pip install spotdl==3.3.1

!curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
!chmod a+rx /usr/local/bin/youtube-dl


!apt install ffmpeg


print('Tensorflow version: {}'.format(tf.__version__) )
!nvidia-smi -L
print('GPU Identified at: {}'.format(tf.test.gpu_device_name()))

!cd /content/stylegan2

!mkdir data


#https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
!wget https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy -O data/mouth_ratio.npy
!wget https://rolux.org/media/stylegan2/vectors/eye_open.npy -O data/eye_open.npy
!wget https://rolux.org/media/stylegan2/vectors/pitch.npy -O data/pitch.npy
!wget https://rolux.org/media/stylegan2/vectors/gender.npy -O data/gender.npy
!wget https://rolux.org/media/stylegan2/vectors/smile.npy -O data/smil.e.npy
!wget https://rolux.org/media/stylegan2/vectors/age.npy -O data/age.npy



# Mount your google drive account


In [None]:
# Mount Google Drive under /content/drive so the model paths configured in
# the next cell can be read.
from google.colab import drive
drive.mount('/content/drive')

## Set the model file paths

In [None]:
#@title #**Double click me to add the path to your model files** 
# Registry mapping each selectable model name to the Google Drive path of its
# network pickle. Replace the placeholder paths with your own files.
# NOTE(review): the 'flikr_faces' entry points at an eurorack_modules pickle -
# confirm this is the intended file.
_PLACEHOLDER_MODEL_PATH = '/content/drive/My Drive/PathToMyModel'

models = {
    'flikr_faces': '/content/drive/My Drive/Colab Notebooks/stylegan/models/eurorack_modules/eurorack_modules_v2_1024.pkl',
    'abstract_art': _PLACEHOLDER_MODEL_PATH,
    'figure_drawing': _PLACEHOLDER_MODEL_PATH,
    'floor_plans': _PLACEHOLDER_MODEL_PATH,
    'fursona': _PLACEHOLDER_MODEL_PATH,
    'cats': _PLACEHOLDER_MODEL_PATH,
    'celebrities': _PLACEHOLDER_MODEL_PATH,
    'painting_face': _PLACEHOLDER_MODEL_PATH,
    'ukiyoe_faces': _PLACEHOLDER_MODEL_PATH,
}




In [None]:
#@title #**Model selector** { run: "auto", display-mode: "form" }
# Download the model of choice
import argparse
import numpy as np
import PIL.Image
import dnnlib
import dnnlib.tflib as tflib
import re
import sys
from io import BytesIO
import IPython.display
import numpy as np
from math import ceil
from PIL import Image, ImageDraw
import imageio
import wget
import pretrained_networks
import os
import random
model_selector = "flikr_faces"  #@param ['ukiyoe_faces','painting_face','celebrities','cats','flikr_faces','abstract_art','figure_drawing','floor_plans','fursona']

print(model_selector)

# Look up the pickle path configured for the chosen model and load it.
network_pkl = models[model_selector]
print('Loading networks from "%s"...' % network_pkl)
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)

# Noise inputs of the synthesis network; they are pinned below so the
# preview is reproducible for a given seed.
noise_vars = [
    var
    for name, var in Gs.components.synthesis.vars.items()
    if name.startswith('noise')
]

# Generator call options shared by the preview below.
truncation_psi = 0.5
Gs_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
    truncation_psi=truncation_psi,
)

# Render one preview image from a randomly drawn seed.
seed = random.randint(0, 5000000)
rnd = np.random.RandomState(seed)
z = rnd.randn(1, *Gs.input_shape[1:])  # [minibatch, component]
tflib.set_vars({var: rnd.randn(*var.shape.as_list()) for var in noise_vars})  # [height, width]
images = Gs.run(z, None, **Gs_kwargs)  # [minibatch, height, width, channel]

# Save the preview under "<seed>.png" and show it inline.
preview_png = str(seed) + '.png'
PIL.Image.fromarray(images[0], 'RGB').save(preview_png)
isp = IPython.display.display(IPython.display.Image(preview_png))


## Download music through YouTube

In [None]:
#@title #**Music url** {display-mode: "form" }
import os
musciUrl =  'https://youtu.be/i23NEQEFpgQ' #@param {type: "string"}
os.environ['MUSIC_URL'] = musciUrl

#!spotdl -s $MUSIC_URL 
!youtube-dl --extract-audio --audio-format mp3 $MUSIC_URL --output "/content/stylegan2/data/input.%(ext)s"

'''url = upload_public('/content/stylegan2/data/input.mp3')'''
# then display it
from IPython.display import HTML

#HTML(f""" {url} """)

In [None]:
#@title #**For test purpose time range audio preview** { display-mode: "form" }
#@markdown shorten video rendering to 10-20 seconds to reduce the video rendering wait time .

start_timecode = '00:00:40'  #@param {type: "string"}
end_timecode = '00:01:00'  #@param {type: "string"}

os.environ['START_TIMECODE'] = start_timecode
os.environ['END_TIMECODE'] = end_timecode


!ffmpeg -i /content/stylegan2/data/input.mp3 -ss $START_TIMECODE -to $END_TIMECODE -c copy /content/stylegan2/data/input_preview.mp3
!rm /content/stylegan2/data/input.mp3
!mv /content/stylegan2/data/input_preview.mp3 /content/stylegan2/data/input.mp3

In [None]:
#@title #**Install Spleeter** { display-mode: "form" }
# Spleeter is the command-line tool used by the next cell to split the song
# into separate tracks.
!pip install spleeter

In [None]:
#@title #**Split the audio into tracks with Spleeter** { display-mode: "form" }
!rm /content/stylegan2/data/input.wav
!spleeter separate  -o /content/stylegan2/data/ -p spleeter:5stems /content/stylegan2/data/input.mp3
!mv /content/stylegan2/data/input/* /content/stylegan2/data
!ffmpeg -i /content/stylegan2/data/input.mp3 /content/stylegan2/data/input.wav

In [None]:
#@title #**Audio Processing** { display-mode: "form" }
# git clone https://github.com/NVlabs/stylegan2
import os
import numpy as np
from scipy.interpolate import interp1d
from scipy.io import wavfile
import matplotlib.pyplot as plt
import PIL.Image
import moviepy.editor

import dnnlib
import dnnlib.tflib as tflib
import pretrained_networks

import collections 

# Per-track loudness envelopes, one value per video frame, keyed by the WAV
# file's base name (e.g. 'bass', 'drums', 'input').
audio = {}
fps = 24


def _frame_envelope(signal, rate, fps):
    """Reduce raw WAV samples to one normalised loudness value per frame.

    Args:
        signal: samples as returned by ``scipy.io.wavfile.read``; either 1-D
            (mono) or 2-D with one column per channel.
        rate: sample rate in Hz.
        fps: video frames per second.

    Returns:
        ``(envelope, duration, frames)`` where ``envelope`` is a float array
        of length ``frames`` scaled so its maximum is 1 (left at all zeros for
        a silent track), ``duration`` is the length in seconds and ``frames``
        the number of video frames.

    Raises:
        ValueError: if ``signal`` contains no samples.
    """
    signal = np.asarray(signal, dtype=np.float64)
    if signal.shape[0] == 0:
        raise ValueError('audio track contains no samples')
    # Mix down to mono; a mono file is already 1-D and has no channel axis.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)
    signal = np.abs(signal)

    duration = signal.shape[0] / rate
    frames = int(np.ceil(duration * fps))
    samples_per_frame = signal.shape[0] / frames

    envelope = np.zeros(frames, dtype=signal.dtype)
    for frame in range(frames):
        start = int(round(frame * samples_per_frame))
        stop = int(round((frame + 1) * samples_per_frame))
        envelope[frame] = np.mean(signal[start:stop], axis=0)

    # A silent track has a zero peak; dividing by it would fill the envelope
    # with NaN and poison every latent computed from it.
    peak = envelope.max()
    if peak > 0:
        envelope /= peak
    return envelope, duration, frames


# Sorted so that `seed`, `duration` and `frames` (taken from the last track
# processed) do not depend on the arbitrary order of os.listdir.
for wav_name in sorted(f for f in os.listdir('data') if f.endswith('.wav')):
    wav_filename = f'data/{wav_name}'
    track_name = os.path.basename(wav_filename).split(".")[0]
    print(track_name)
    rate, signal = wavfile.read(wav_filename)
    # The sample count doubles as the random seed for the latent walks.
    seed = signal.shape[0]
    audio[track_name], duration, frames = _frame_envelope(signal, rate, fps)


# Save one plot per track; figures are left open so the notebook can still
# display them.
for track in sorted(audio.keys()):
    plt.figure(figsize=(8, 3))
    plt.title(track)
    plt.plot(audio[track])
    plt.savefig(f'data/{track}.png')

#network_pkl = '/content/drive/My Drive/Colab Notebooks/stylegan/models/StyleGAN2_microscopev1.pkl'
#_G, _D, Gs = pretrained_networks.load_networks(network_pkl)

# Options for running the full generator.
Gs_kwargs = dnnlib.EasyDict()
Gs_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
Gs_kwargs.randomize_noise = False
# Options for running the synthesis network directly.
Gs_syn_kwargs = dnnlib.EasyDict()
Gs_syn_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
Gs_syn_kwargs.randomize_noise = False
Gs_syn_kwargs.minibatch_size = 4
noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')]
# Average latent of the loaded generator (`Gs` comes from the model selector cell).
w_avg = Gs.get_var('dlatent_avg')



## Check the separate audio tracks

In [None]:
#@title #**Listen to the input track** { display-mode: "form" }

# `Audio` imported here is reused by the following "Listen to ..." cells.
from IPython.display import Audio

# Audio object for the full mix converted to WAV by the split cell.
Audio('/content/stylegan2/data/input.wav')



In [None]:
#@title #**Listen to the bass track** { display-mode: "form" }
# Relies on `Audio` imported in the "Listen to the input track" cell.
Audio('/content/stylegan2/data/bass.wav')



In [None]:
#@title #**Listen to the drums track** { display-mode: "form" }
# Relies on `Audio` imported in the "Listen to the input track" cell.
Audio('/content/stylegan2/data/drums.wav')



In [None]:
#@title #**Listen to the piano track** { display-mode: "form" }
# Relies on `Audio` imported in the "Listen to the input track" cell.
Audio('/content/stylegan2/data/piano.wav')


In [None]:
#@title #**Listen to the vocals track** { display-mode: "form" }
# Relies on `Audio` imported in the "Listen to the input track" cell.
Audio('/content/stylegan2/data/vocals.wav')

In [None]:
#@title #**Listen to the other track** { display-mode: "form" }

# Relies on `Audio` imported in the "Listen to the input track" cell.
Audio('/content/stylegan2/data/other.wav')

In [None]:
#@title #**NOT STATIC Video Generation - multiple channel -  Faces (master)** { display-mode: "form" }
#@markdown if your model is based on faces you can control the facial features as parameters, can also be used with models not containing faces 
# Form parameters read by render_frame() below. Each Enable* flag switches one
# latent offset on or off and the matching *Factor scales it.
# base_index
# psi
#psiValue = "vocals" #@param ['bass', 'drums','piano','vocals','other','microscopev1','cats', 'toplessgan_female', 'dicks_v3', 'fursona', 'family', 'modules_v2_faces', 'eye_makeup']
EnableMouth_open = True #@param {type:"boolean"}
mouth_openFactor = 1.2  #@param {type: "slider", min: 0.2, max: 2, step : 0.2}
EnableEye_open = True #@param {type:"boolean"}
eye_openFactor = 1.4  #@param {type: "slider", min: 0.2, max: 2, step : 0.2}
EnablePitch = True #@param {type:"boolean"}
pitchFactor = 1.4  #@param {type: "slider", min: 0.2, max: 2, step : 0.2}
EnableSmile = False #@param {type:"boolean"}
#@markdown (not audio reactive - fixed in timeline parameter)
smileFactor = 0.6  #@param {type: "slider", min: 0.2, max: 2, step : 0.2}

EnableAge = False #@param {type:"boolean"}
ageFactor = 0.6  #@param {type: "slider", min: 0.2, max: 2, step : 0.2}


# NOTE(review): render_frame() assigns its own local `psi`, so this slider
# value is not read by it; no other use is visible in this notebook.
psi = 0.1  #@param {type: "slider", min: 0.1, max: 2, step : 0.2}


# Remove the outputs of a previous run before rendering again.
!rm "/content/stylegan2/data/outputPreview.mp4"
!rm "/content/stylegan2/data/output.mp4"

def get_ws(n, frames, seed):
    """Return a smooth, looping sequence of ``frames`` 512-d latent vectors.

    ``n`` random key vectors are drawn from ``RandomState(seed)`` and joined
    by per-component quadratic interpolation. The result is cached in
    ``data/ws_{n}_{frames}_{seed}.npy`` and loaded from there on later calls.

    Args:
        n: number of random key vectors to interpolate between.
        frames: number of vectors in the returned sequence.
        seed: seed for the key vectors (also part of the cache file name).

    Returns:
        Array of shape ``(frames, 512)``.
    """
    latent_dim = 512
    filename = f'data/ws_{n}_{frames}_{seed}.npy'
    if os.path.exists(filename):
        return np.load(filename)

    src_ws = np.random.RandomState(seed).randn(n, latent_dim)

    # The key vectors are repeated three times and only the middle repetition
    # is kept, so the curve is interpolated across the wrap-around point
    # instead of being extrapolated at the ends.
    # Both sample grids are the same for every component, so they are built
    # once rather than inside the loop.
    x = np.linspace(0, 3 * frames, 3 * len(src_ws), endpoint=False)
    x_ = np.linspace(0, 3 * frames, 3 * frames, endpoint=False)

    ws = np.empty((frames, latent_dim))
    for i in range(latent_dim):
        y = np.tile(src_ws[:, i], 3)
        y_ = interp1d(x, y, kind='quadratic', fill_value='extrapolate')(x_)
        ws[:, i] = y_[frames:2 * frames]

    # Create the cache directory if needed so that saving cannot fail on it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, ws)
    return ws

def mix_styles(wa, wb, ivs):
    """Blend selected rows of ``wb`` into a copy of ``wa``.

    Args:
        wa: base array; it is copied, not modified.
        wb: array blended in; indexed with the same indices as ``wa``.
        ivs: iterable of ``(index, weight)`` pairs. For each pair the rows
            selected by ``index`` become ``wa * (1 - weight) + wb * weight``.

    Returns:
        The blended copy of ``wa``.
    """
    blended = np.copy(wa)
    for rows, weight in ivs:
        keep = 1 - weight
        blended[rows] = wa[rows] * keep + wb[rows] * weight
    return blended

def normalize_vector(v):
    """Rescale ``v`` using the statistics of the global ``w_avg``.

    ``v`` is multiplied by ``std(w_avg) / std(v)``, then ``mean(w_avg)`` is
    added and ``mean(v)`` (the mean of the unscaled input) is subtracted.
    """
    target_std = np.std(w_avg)
    target_mean = np.mean(w_avg)
    rescaled = v * target_std / np.std(v)
    return rescaled + target_mean - np.mean(v)

def render_frame(t):
    """Render the video frame for time ``t`` (seconds) as an RGB array.

    Reads the per-frame envelopes in ``audio`` and the module-level settings
    (``fps``, ``frames``, ``base_ws``, ``mix_ws``, ``w_avg``, the Enable*/
    *Factor form values and the direction vectors), and advances the global
    ``base_index`` on every call, so results depend on call order.

    Args:
        t: time in seconds, as passed by moviepy's ``VideoClip``.

    Returns:
        ``size`` x ``size`` image as a numpy array.
    """
    global base_index
    # Plain int(): the `np.int` alias is deprecated and removed in newer NumPy.
    frame = int(np.clip(np.round(t * fps), 0, frames - 1))

    # The drums drive how fast the walk through the base latents advances.
    base_index += base_speed * audio['drums'][frame]**2
    base_w = base_ws[int(round(base_index)) % len(base_ws)]
    base_w = np.tile(base_w, (18, 1))

    # The bass moves the base latent between 0.5 and 1.0 of its distance
    # from the average latent.
    psi = 0.5 + audio['bass'][frame] / 2
    base_w = w_avg + (base_w - w_avg) * psi

    mix_w = np.tile(mix_ws[frame], (18, 1))
    mix_w = w_avg + (mix_w - w_avg) * 0.75

    # Blend mix_w into base_w per group of rows, weighted by track loudness.
    # NOTE(review): zip() stops at the shorter input, so with three ranges the
    # fourth value ('other') is never used - confirm this is intended.
    ranges = [range(0, 4), range(4, 8), range(8, 18)]
    values = [audio[track][frame] for track in ['vocals', 'bass', 'drums', 'other']]
    w = mix_styles(base_w, mix_w, zip(ranges, values))

    # Optional audio-driven offsets along the loaded direction vectors.
    if EnableMouth_open:
        w += mouth_open * audio['vocals'][frame] * mouth_openFactor
    if EnablePitch:
        w += pitch * audio['drums'][frame] * pitchFactor * audio['bass'][frame]
    if EnableEye_open:
        w += eye_open * audio['bass'][frame] * eye_openFactor * audio['drums'][frame]
    #w += gender+0.2
    if EnableSmile:
        # NOTE(review): this adds the scalar smileFactor to every latent
        # component; no smile direction vector is loaded in this notebook.
        w += smileFactor
    if EnableAge:
        w += age * audio['bass'][frame] * ageFactor

    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
    return np.array(image)
    
# Output frame size in pixels (frames are resized to size x size).
size = 512
# One random key latent per second of audio.
seconds = int(np.ceil(duration))
# The base latent walk has `resolution` times as many steps as video frames.
resolution = 10
base_frames = resolution * frames
base_ws = get_ws(seconds, base_frames, seed)
# Scale the per-frame step so the summed drum energy covers one pass through
# base_ws. A silent drum track has zero energy: the walk then stays put
# instead of dividing by zero.
drum_energy = np.sum(audio['drums']**2)
base_speed = base_frames / drum_energy if drum_energy > 0 else 0
#base_speed = 0
base_index = 0
mix_ws = get_ws(seconds, frames, seed + 1)

# Direction vectors downloaded by the setup cell, negated and rescaled with
# normalize_vector().
# https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))
pitch = normalize_vector(-np.load('data/pitch.npy'))
eye_open = normalize_vector(-np.load('data/eye_open.npy'))
gender = normalize_vector(-np.load('data/gender.npy'))
#smile = normalize_vector(-np.load('data/smile.npy'))
age = normalize_vector(-np.load('data/age.npy'))




# Render every frame with render_frame() and mux the result with the full mix.
mp4_filename = 'data/output.mp4'
video_clip = moviepy.editor.VideoClip(render_frame, duration=duration)
#audio_clip_i = moviepy.editor.AudioFileClip('data/input.wav')
audio_clip_v = moviepy.editor.AudioFileClip('data/input.wav')
audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_v])
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(mp4_filename, fps=fps, codec='libx264', audio_codec='aac', bitrate='8M')



!ffmpeg -i /content/stylegan2/data/output.mp4 -b 1000000 -y /content/stylegan2/data/outputPreview.mp4



'''url = upload_public('/content/stylegan2/data/outputPreview.mp4')
# then display it
from IPython.display import HTML
HTML(f"""<video src={url} width=500 controls/>""")
'''

from IPython.display import HTML
from base64 import b64encode
mp4 = open('/content/stylegan2/data/outputPreview.mp4','rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)



In [None]:
#@title #**Clean workspace (data folder)** { display-mode: "form" }
!rm -r /content/stylegan2/data/*.wav
!rm -r /content/stylegan2/data/*.mp3
!rm -r /content/stylegan2/data/*.png
!rm -r /content/stylegan2/data/*.mp4