<a href="https://colab.research.google.com/github/Norod/my-colab-experiments/blob/master/WikiArtCultureShock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
########################################################################
#
#     WikiArtCultureShock - A mashup of existing resources for your
#                             audiovisual pleasure.
#
#     The original culture shock code
#     https://gist.github.com/rolux/48f1da6cf2bc6ca5833dbacbf852b348
#
#     The original WikiArt StyleGAN2 repo
#     https://github.com/pbaylies/stylegan2
#
#     The original Royalty Free Music from Bensound
#     https://www.bensound.com/royalty-free-music/track/pop-dance
#
#     Gathered and assembled into a Colab Notebook by Doron Adler
#     @norod78
#
########################################################################

In [0]:
# Environment setup: select TensorFlow 1.x (Colab magic) before importing it.
%tensorflow_version 1.x
import tensorflow as tf

# Work from the /content directory.
%cd /content/

# Fetch NVIDIA's StyleGAN2 code and make the checkout the working directory
# (presumably so the later `dnnlib` / `pretrained_networks` imports resolve to it).
!git clone https://github.com/NVlabs/stylegan2
%cd stylegan2

# Compile and run the repository's nvcc test program.
!nvcc test_nvcc.cu -o test_nvcc -run

# Report the TensorFlow version and the GPU(s) visible to the system and to TensorFlow.
print('Tensorflow version: {}'.format(tf.__version__) )
!nvidia-smi -L
print('GPU Identified at: {}'.format(tf.test.gpu_device_name()))

In [0]:
# Download the model of choice
import os
import numpy as np
from scipy.interpolate import interp1d
from scipy.io import wavfile
import matplotlib.pyplot as plt
import moviepy.editor
import dnnlib
import dnnlib.tflib as tflib
import pretrained_networks
import argparse
import PIL.Image
import re
import sys
from io import BytesIO
import IPython.display
import numpy as np
from math import ceil
from PIL import Image, ImageDraw
import imageio
import pretrained_networks
#network_pkl = "gdrive:networks/stylegan2-ffhq-config-f.pkl"

# Pickle of the pretrained WikiArt StyleGAN2 network to load.
network_pkl = 'https://archive.org/download/wikiart-stylegan2-conditional-model/network-snapshot-006746.pkl'

print('Loading networks from "%s"...' % network_pkl)

# load_networks returns a triple; the cells visible here only use the third item, Gs.
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)

# Keyword arguments for running the generator: uint8 NHWC output, fixed noise.
Gs_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
)

# Same settings for the synthesis sub-network, plus a minibatch size of 4.
Gs_syn_kwargs = dnnlib.EasyDict(
    output_transform=dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True),
    randomize_noise=False,
    minibatch_size=4,
)

# Variables of the synthesis network whose names start with 'noise'.
noise_vars = [
    variable
    for var_name, variable in Gs.components.synthesis.vars.items()
    if var_name.startswith('noise')
]

# The generator's 'dlatent_avg' variable, used below as the reference latent.
w_avg = Gs.get_var('dlatent_avg')

def get_ws(n, frames, seed):
    """Return a smooth, looping sequence of latent vectors, cached on disk.

    ``n`` random 512-dimensional key vectors are drawn from
    ``np.random.RandomState(seed)`` and joined by a quadratic interpolating
    spline that is sampled at ``frames`` evenly spaced positions. The key
    vectors are repeated three times and only the middle repetition is
    sampled, so the spline always has neighbouring keys on both sides of the
    sampled range.

    The result is cached in ``data/ws_{n}_{frames}_{seed}.npy``; if that file
    already exists it is loaded instead of being recomputed.

    Args:
        n: Number of random key vectors.
        frames: Number of interpolated vectors to produce.
        seed: Seed for the random key vectors (also part of the cache name).

    Returns:
        Array of shape ``(frames, 512)``.
    """
    filename = f'data/ws_{n}_{frames}_{seed}.npy'
    if os.path.exists(filename):
        return np.load(filename)

    src_ws = np.random.RandomState(seed).randn(n, 512)
    # Positions of the tripled key vectors on a timeline of 3 * frames steps.
    x = np.linspace(0, 3 * frames, 3 * len(src_ws), endpoint=False)
    y = np.tile(src_ws, (3, 1))
    # Only the middle repetition is needed: positions frames .. 2*frames - 1.
    x_ = np.linspace(0, 3 * frames, 3 * frames, endpoint=False)[frames:2 * frames]
    # One spline fit over all 512 components at once (interpolating along axis 0).
    ws = interp1d(x, y, kind='quadratic', axis=0, fill_value='extrapolate')(x_)

    # Make sure the cache directory exists before writing.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    np.save(filename, ws)
    return ws

def mix_styles(wa, wb, ivs):
    """Blend selected rows of ``wb`` into a copy of ``wa``.

    Args:
        wa: Base array; it is copied, not modified.
        wb: Array supplying the values to blend in.
        ivs: Iterable of ``(index, weight)`` pairs. For each pair the rows
            selected by ``index`` become ``wa * (1 - weight) + wb * weight``.

    Returns:
        The blended copy of ``wa``.
    """
    blended = np.array(wa, copy=True)
    for idx, weight in ivs:
        from_a = wa[idx] * (1 - weight)
        from_b = wb[idx] * weight
        blended[idx] = from_a + from_b
    return blended

def normalize_vector(v):
    """Rescale ``v`` to the standard deviation of ``w_avg`` and shift it by a mean offset.

    ``v`` is multiplied by ``std(w_avg) / std(v)``, then ``mean(w_avg)`` is
    added and the mean of the *unscaled* ``v`` is subtracted.
    """
    target_std = np.std(w_avg)
    target_mean = np.mean(w_avg)
    rescaled = v * target_std / np.std(v)
    return rescaled + target_mean - np.mean(v)

def render_frame(t):
    """Render the video frame for time ``t`` (in seconds) as an image array.

    Reads the notebook globals ``fps``, ``frames``, ``audio``, ``base_ws``,
    ``base_speed``, ``mix_ws``, ``mouth_open``, ``w_avg``, ``size`` and the
    loaded generator ``Gs``. It also advances the global ``base_index`` on
    every call, so the output depends on the order and number of calls, not
    only on ``t``.

    Args:
        t: Time in seconds; converted to a frame index at ``fps`` and clamped
            to ``[0, frames - 1]``.

    Returns:
        The synthesized image resized to ``size`` x ``size`` (LANCZOS), as a
        NumPy array.
    """
    global base_index
    # Builtin int replaces np.int, a deprecated alias that newer NumPy removed.
    frame = np.clip(int(np.round(t * fps)), 0, frames - 1)

    # Advance through the base latents at a speed driven by the accompaniment level.
    base_index += base_speed * audio['accompaniment'][frame]**2
    base_w = base_ws[int(round(base_index)) % len(base_ws)]
    base_w = np.tile(base_w, (18, 1))  # 18 rows - presumably one per synthesis layer

    # Pull the base latent towards w_avg; the drums level sets the strength.
    psi = 0.5 + audio['drums'][frame] / 4
    base_w = w_avg + (base_w - w_avg) * psi

    # Second latent for this frame, pulled towards w_avg with fixed strength.
    mix_w = np.tile(mix_ws[frame], (18, 1))
    mix_w = w_avg + (mix_w - w_avg) * 0.75

    # Blend mix_w into three row groups, each weighted by the 'fx' track level.
    ranges = [range(0, 4), range(4, 8), range(8, 18)]
    values = [audio[track][frame] for track in ['fx', 'fx', 'fx']]
    w = mix_styles(base_w, mix_w, zip(ranges, values))

    # Offset along the mouth_open direction in proportion to the vocals level.
    w += mouth_open * audio['vocals'][frame] * 1.5

    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
    return np.array(image)

In [0]:
!mkdir data
!wget https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy -O ./data/mouth_ratio.npy

In [0]:
# Download the archive with the separated audio stems of the song.
!wget https://norod78.s3-eu-west-1.amazonaws.com/WikiArtCultureShock/bensound_popdance_audio.zip
# Extract into ./tmp: -n never overwrites existing files, -j drops the archive's directory structure.
!unzip -n -j ./bensound_popdance_audio.zip -d ./tmp

In [0]:
# Move the stems into ./data under the track names the rendering code looks up
# in `audio` ('accompaniment', 'vocals', 'drums', 'fx') plus the full mix ('all').
# Note the mapping: the "other" stem becomes 'accompaniment' and the "bass" stem becomes 'fx'.
!mv ./tmp/bensound-popdance_other.mp3 ./data/accompaniment.mp3
!mv ./tmp/bensound-popdance_vocals.mp3  ./data/vocals.mp3
!mv ./tmp/bensound-popdance_drums.mp3 ./data/drums.mp3
!mv ./tmp/bensound-popdance_bass.mp3  ./data/fx.mp3
!mv ./tmp/bensound-popdance_all.mp3 ./data/all.mp3

In [0]:

# Per-track loudness envelopes: audio[track_name] holds one value per video
# frame, scaled so that the loudest frame of each track is 1.
audio = {}
fps = 60

# Sorted so that the track processed last - the one that defines the `seed`,
# `duration` and `frames` values left behind for later cells - does not
# depend on the directory listing order.
for mp3_filename in sorted(f for f in os.listdir('data') if f.endswith('.mp3')):
    mp3_filename = f'data/{mp3_filename}'
    wav_filename = mp3_filename[:-4] + '.wav'
    # Convert to 16-bit PCM WAV once; an existing WAV file is reused.
    if not os.path.exists(wav_filename):
        audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
        audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
    track_name = os.path.basename(wav_filename)[:-4]
    print("track_name = " + str(track_name))
    rate, signal = wavfile.read(wav_filename)
    if signal.ndim > 1:
        signal = np.mean(signal, axis=1) # to mono
    else:
        # Already mono: only convert to float so the steps below behave the same.
        signal = signal.astype(np.float64)
    signal = np.abs(signal)
    # Seed for the latent sequences: sample count plus a random offset in [0, 10).
    abitofrandomforseed = np.random.randint(10, size=1)
    seed = signal.shape[0] + abitofrandomforseed[0]
    duration = signal.shape[0] / rate
    frames = int(np.ceil(duration * fps))
    samples_per_frame = signal.shape[0] / frames
    audio[track_name] = np.zeros(frames, dtype=signal.dtype)
    # Average the rectified signal over the samples belonging to each video frame.
    for frame in range(frames):
        start = int(round(frame * samples_per_frame))
        stop = int(round((frame + 1) * samples_per_frame))
        audio[track_name][frame] = np.mean(signal[start:stop], axis=0)
    # Normalize to a peak of 1; a completely silent track is left as all zeros.
    peak = np.max(audio[track_name])
    if peak > 0:
        audio[track_name] /= peak

# Plot each envelope and save it next to the audio files.
for track in sorted(audio.keys()):
    plt.figure(figsize=(8, 3))
    plt.title(track)
    plt.plot(audio[track])
    plt.savefig(f'data/{track}.png')

In [0]:
# Debug output: print the per-track envelope arrays computed in the previous cell.
print(audio)

In [0]:
# Side length in pixels of the square frames produced by render_frame.
size = 1080
#duration = 20 #If you prefer to test with a shorter *video* duration than the song's length
# `duration`, `frames` and `seed` are not defined in this cell; they are the
# values left behind by the audio-analysis cell.
seconds = int(np.ceil(duration))
# base_ws holds `resolution` times as many vectors as there are video frames;
# render_frame indexes into it with the running `base_index`.
resolution = 10
base_frames = resolution * frames
# One random key vector per (rounded-up) second of audio.
base_ws = get_ws(seconds, base_frames, seed)
# Scale for the per-frame increment of base_index in render_frame, chosen so the
# increments over all entries of audio['accompaniment'] add up to base_frames.
base_speed = base_frames / sum(audio['accompaniment']**2)
# Running position in base_ws; render_frame advances it on every call.
base_index = 0
# Second latent sequence with one vector per video frame, from a different seed.
mix_ws = get_ws(seconds, frames, seed + 1)
# https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
# Negated and rescaled by normalize_vector; render_frame adds it scaled by the vocals level.
mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))

# Render every frame through render_frame, attach the full mix as the
# soundtrack and encode the result as H.264 video with AAC audio.
mp4_filename = 'data/WikiArtCultureShock-bensound_popdance.mp4'
video_clip = moviepy.editor.VideoClip(render_frame, duration=duration)
audio_clip = moviepy.editor.AudioFileClip('data/all.wav')
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(mp4_filename, fps=fps, codec='libx264', audio_codec='aac', bitrate='8M')


In [0]:
from IPython import display as ipythondisplay
import io
import os
import base64
from IPython.display import HTML

def show_video(vid):
  """Display the video file ``vid`` inline in the notebook output.

  The whole file is read into memory, base64-encoded and embedded as a
  ``data:`` URI inside an HTML ``<video>`` element. The MIME subtype is taken
  from the file extension (e.g. ``mp4`` gives ``video/mp4``).

  Args:
    vid: Path of the video file to embed.
  """
  ext = os.path.splitext(vid)[-1][1:]
  # Read-only binary mode; the context manager closes the file after reading.
  with io.open(vid, 'rb') as video_file:
    video = video_file.read()
  # The data URI must read "data:video/<ext>;base64,<payload>" with no stray quote.
  ipythondisplay.display(HTML(data='''<BR> Royalty Free Music from Bensound <BR><video alt="WikiArtCultureShock" autoplay 
              loop controls style="height: 512px;">
              <source src="data:video/{1};base64,{0}" type="video/{1}" />
              </video>'''.format(base64.b64encode(video).decode('ascii'), ext)))

In [0]:
# Embed the rendered video (path set in the rendering cell) in the notebook output.
show_video(mp4_filename)