<a href="https://colab.research.google.com/github/mosheman5/timbre_painting/blob/master/timbre_painting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Inspired by https://github.com/magenta/ddsp/blob/master/ddsp/colab/demos/timbre_transfer.ipynb

Licensed under the Apache License, Version 2.0 (the "License");





In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Hierarchical Timbre-Painting and Articulation Generation Timbre Transfer 

This notebook is a timbre transfer application using articulation and hierarchical timbre-painting, as detailed in our paper:
* [Paper](https://arxiv.org/abs/2008.13095)
* [Audio Examples](https://mosheman5.github.io/timbre_painting/) 

The notebook extracts loudness and pitch features from a given audio sample, either uploaded or recorded using a microphone.

You can either use one of the pretrained models or upload your own trained model.

### Instructions for running:

* Make sure to use a GPU runtime, click:  __Runtime >> Change Runtime Type >> GPU__
* Press ▶️ on the left of each of the cells






In [None]:
#@title #Initialization

#@markdown Clone the repository and install packages
# Fetch the project sources and make the checkout the working directory, so
# that later cells can import its local packages (utils, models, data_utils,
# timbre_painting) and use repo-relative paths.
!git clone https://github.com/mosheman5/timbre_painting
%cd timbre_painting
# Python dependencies. Only hydra-core is pinned; the other packages resolve
# to whatever version pip selects at install time.
!pip install hydra-core==0.11.3
!pip install torchaudio
!pip install soundfile
# -q: quiet output, -U: upgrade ddsp if a version is already installed.
!pip install -qU ddsp
# Optional (disabled): uncomment the two lines below to silence warnings.
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
#@markdown Handle imports
import torch
from pathlib import Path
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from utils.utils import (create_srs, BaseAudio, load_audio, 
                         PRETRAINED_MODEL_DICT, download_pretrained_model)
from utils.sampling import resample_torch, create_samplers
from models.networks import ParallelWaveGANGenerator
from timbre_painting import (load_norm_dicts, shift_ld, norm_loudness, 
                             f0_transfer, load_trained_pyramid, calc_loudness_list)
import soundfile as sf
import librosa
from data_utils.spectral_feats import calc_loudness
import json
import os
from tqdm import tqdm
from ddsp.colab.colab_utils import (
    auto_tune, detect_notes, fit_quantile_transform, 
    get_tuning_factor, download, play, record, 
    specplot, upload, DEFAULT_SAMPLE_RATE)
import time
from google.colab import files
import os
from pathlib import Path
import matplotlib.pyplot as plt

# Notebook-wide constants.
# Sample rate used when trimming the recorded audio; aliased from the ddsp
# colab utilities imported above.
SAMPLE_RATE = DEFAULT_SAMPLE_RATE  # 16000


print('Done!')

In [None]:
#@title Record or Upload Audio
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) 
#@markdown * Audio should be monophonic (single instrument / voice)

record_or_upload = "Record"  #@param ["Record", "Upload (.mp3 or .wav)"]

record_seconds =     10#@param {type:"number", min:1, max:10, step:1}

if record_or_upload == "Record":
  # Record two extra seconds, then drop the first 2 * SAMPLE_RATE samples so
  # that `audio` keeps only the trailing part of the take.
  # NOTE(review): assumes record() returns a 1-D array sampled at SAMPLE_RATE,
  # which would make the discarded lead-in exactly two seconds -- confirm.
  audio = record(seconds=record_seconds+2)
  audio = audio[SAMPLE_RATE*2:]
else:
  # Load an audio sample here (.mp3 or .wav file).
  # Only the first uploaded file is used; any others are ignored.
  filenames, audios = upload()
  audio = audios[0]
# Prepend an axis of size 1 in front of the sample axis.
audio = audio[np.newaxis, :]
print('\nExtracting audio features...')

# Visualize and play back the input using the ddsp colab helpers.
specplot(audio)
play(audio)



In [None]:
#@title Load a model
#@markdown Load pretrained model/user's model, file processors and initializes weights
model = 'Violin_v2' #@param ['Trumpet', 'Violin', 'Violin_v2', 'Saxophone', 'Cello', 'Upload your own (checkpoint folder as .tar.gz)']
# Upper-case alias of the form selection above.
MODEL = model

def find_model_dir(dir_name):
  """Locate the checkpoint directory inside an extracted archive.

  Walks `dir_name` top-down and returns the first directory that contains a
  non-hidden file whose name ends with "args.pth".

  Args:
    dir_name: Root directory to search.

  Returns:
    Path of the first directory holding a matching file.

  Raises:
    FileNotFoundError: If no matching file exists anywhere under `dir_name`.
  """
  for root, _, filenames in os.walk(dir_name):
    for filename in filenames:
      if filename.endswith("args.pth") and not filename.startswith("."):
        # Return right away: a bare `break` would only leave the inner loop
        # and let a directory visited later overwrite the result.
        return root
  raise FileNotFoundError(
      "No 'args.pth' file found under {!r}; expected an extracted "
      "checkpoint folder.".format(dir_name))


# Resolve `model_path`: either download one of the named pretrained
# checkpoints or extract a user-supplied .tar.gz archive.
if model in ('Trumpet', 'Violin', 'Violin_v2', 'Saxophone', 'Cello',):
  # Pretrained models.
  PRETRAINED_DIR = '/content/pretrained'
  # Remove old checkpoints, recreate the directory and download the model.
  !rm -r $PRETRAINED_DIR &> /dev/null
  !mkdir $PRETRAINED_DIR &> /dev/null
  model_path = download_pretrained_model(model, PRETRAINED_DIR)

else:
  # User's model: start from an empty upload directory on every run.
  UPLOAD_DIR = '/content/uploaded'
  !rm -r $UPLOAD_DIR &> /dev/null
  !mkdir $UPLOAD_DIR
  uploaded_files = files.upload()

  # Extract every uploaded archive into UPLOAD_DIR.
  # NOTE(review): the filename is interpolated into the shell command
  # unquoted, so names containing spaces or shell metacharacters will break
  # (or alter) the tar invocation.
  for fnames in uploaded_files.keys():
    print("Extracting... {}".format(fnames))
    !tar -xzvf $fnames -C $UPLOAD_DIR &> /dev/null
  model_path = find_model_dir(UPLOAD_DIR)

model_path = Path(model_path)

# Training-time arguments saved next to the checkpoint.
# NOTE(review): torch.load unpickles the file; for user-uploaded archives
# this executes whatever the pickle contains -- only load trusted checkpoints.
run_args = torch.load(model_path / 'args.pth')

# Define args from the trained model.
sr = run_args.sr
num_scales = run_args.num_scales
scale_factor = run_args.scale_factor
max_value = run_args.max_val
max_value_f0 = run_args.max_val_f0
cond_freq = run_args.cond_freq

# PyTorch device. Hard-coded to CUDA: the notebook instructions require a
# GPU runtime.
device = torch.device("cuda")

# Load processors.
# crepe_path is relative to the repository root (the working directory set
# by the initialization cell).
crepe_path = 'data_utils/crepe_models/full.pth'
# NOTE(review): the meaning of the third positional argument (True) is not
# visible from this notebook -- check BaseAudio's signature.
base_audio = BaseAudio(crepe_path, device, True)
srs = create_srs(sr, num_scales, scale_factor)
samplers = create_samplers(srs, device=device)
norm_dicts = load_norm_dicts(model_path / 'loudness.json')

# Load the trained generators from the checkpoint directory.
Gs = load_trained_pyramid(model_path, network_params=run_args.generator_params, device=device, srs=srs)

In [None]:
#@title Modify conditioning

#@markdown Please tune the octave shift to be aligned with the target instrument. Trial and error are recommended, to see which key fits best your audio and to have some fun along the way!

# Octave offset; applied further down as the multiplier 2**pitch_shift.
#@markdown Shift the pitch (octaves)
pitch_shift =  0 #@param {type:"slider", min:-3, max:3, step:0.5}

# Constant added to every loudness curve before synthesis.
# NOTE(review): unit taken from the 'Loudness [dB]' axis label of the plot
# below -- confirm against the normalization done in calc_loudness_list.
#@markdown Shift the loudness (dB)
loudness_shift =  0 #@param {type:"slider", min:-30, max:30, step:5}

# Forwarded to base_audio.forward as `unvoiced_flag`.
#@markdown Zero out unvoiced pitch, sometimes removes original signal
zero_unvoiced_pitch = False #@param{type:"boolean"}

def norm_loudness_list(loudness_list, loudness_shift):
  """Offset every entry of `loudness_list` by `loudness_shift`, in place.

  Args:
    loudness_list: Mutable sequence of loudness values; each entry must
      support `+` with `loudness_shift`.
    loudness_shift: Offset added to every entry.

  Returns:
    The same `loudness_list` object, now holding the shifted entries.
  """
  # Slice assignment keeps the caller's list object while swapping in the
  # freshly computed (non-mutated) shifted entries.
  loudness_list[:] = [level + loudness_shift for level in loudness_list]
  return loudness_list

# Drop the singleton axes of the input and trim it to a whole number of
# loudness hops.
# NOTE(review): `audio` is treated as if sampled at the model's `sr`, while
# the recording cell works with SAMPLE_RATE -- confirm these agree for every
# checkpoint.
real_audio = audio.squeeze()
loudness_hop = 8 * sr // cond_freq
real_audio = real_audio[:len(real_audio) // loudness_hop * loudness_hop]

# Loudness curves for the sample rates in `srs`, then the user-selected
# offset applied to each of them.
loudness_list = calc_loudness_list(audio=real_audio, srs=srs, device=device,
                                    sr_in=sr, norm_dicts=norm_dicts)
loudness_list = norm_loudness_list(loudness_list, loudness_shift)

# Pitch-derived conditioning signal plus the pitch track in Hz.
# NOTE(review): the plot below assumes `frequency` already includes the
# octave factor passed here -- confirm in BaseAudio.forward.
real_audio, _, frequency = base_audio.forward(real_audio, sr, max_value_f0,
                              numpy_flag=True, octave=2**pitch_shift,
                              return_raw=True, unvoiced_flag=zero_unvoiced_pitch)

# Add two leading axes and move the signal to the compute device.
real_audio_orig = real_audio[None, None, ...].to(device)
# Resample input to the wanted scale.
real_audio = resample_torch(real_audio_orig, sr, srs[0], max_val=max_value_f0)


# Plot Features.
n_plots = 2
fig, axes = plt.subplots(nrows=n_plots,
                      ncols=1,
                      sharex=False,
                      figsize=(8, 2*n_plots))
ax = axes[0]
ax.plot(librosa.hz_to_midi(frequency / 2**pitch_shift))
ax.plot(librosa.hz_to_midi(frequency))
ax.set_ylabel('f0 [midi]')
_ = ax.legend(['Original','Adjusted'])

ax = axes[1]
# `loudness_list` already contains `loudness_shift` (applied above), so this
# curve is the adjusted one; subtract the shift to recover the original.
loudness_plot = loudness_list[-1].squeeze().cpu().numpy()
ax.plot(loudness_plot - loudness_shift)
ax.plot(loudness_plot)
ax.set_ylabel('Loudness [dB]')
_ = ax.legend(['Original','Adjusted'])

In [None]:
#@title #Resynthesize Audio

# Run the generators on the conditioning signals and time the call.
start_time = time.time()
audio_outputs = f0_transfer(real_audio, loudness_list, Gs, samplers, max_val=max_value, save_all=False)
audio_gen = audio_outputs[0]
audio_gen = audio_gen.squeeze(0).cpu().numpy()

# Peak-normalize to 0.4. Skip the scaling when the output is all zeros:
# numpy floating-point errors are set to 'ignore' in the imports cell, so a
# zero peak would silently turn the whole signal into NaN.
peak = abs(audio_gen).max()
if peak > 0:
  audio_gen *= (0.4 / peak)
print('Generation took %.1f seconds' % (time.time() - start_time))

# Listen to the input and the resynthesis.
print('Original')
play(audio)

print('Resynthesis')
play(audio_gen)

# Compare their spectrograms.
specplot(audio)
plt.title("Original")

specplot(audio_gen)
_ = plt.title("Resynthesis")