In [None]:
! apt-get install -y portaudio19-dev 
! apt-get install python3-all-dev

! pip install librosa tqdm gdown pyaudio wave pydub noisereduce

In [None]:
! gdown 1Qq4WdRhAT2GNCdGcSpGLCpA21Uy-N3hc # my training data
! unzip -qq spmel.zip
! rm spmel.zip

# Preprocess
### Prepare the training data
* Put the wav files in the `./wavs` folder.
* Arrange the wav files of each speaker as follows:
    ```
    wavs
    ├── speaker1
    |   ├── speaker1_001.wav
    |   └── speaker1_002.wav
    ├── speaker2
    |   ├── speaker2_001.wav
    |   └── speaker2_002.wav
    ├── ...
    ```
* It's better to use the same number of utterances per speaker.
* I use a subset of the [VCTK corpus](https://datashare.ed.ac.uk/handle/10283/2950) as the raw training data.

### Convert to mel spectrograms
* Use the command: ```python make_spec.py --rootDir "./wavs" --targetDir "./spmel"```
* My training dataset is [here](https://drive.google.com/file/d/1Qq4WdRhAT2GNCdGcSpGLCpA21Uy-N3hc/view?usp=drive_link).

### Extract the speaker features (d-vectors)
* Use the command: ```! python make_d_vector.py --num_uttrs 400 --rootDir "./spmel" --model "./3000000-BL.ckpt"```

In [None]:
# Optional: uncomment to convert the wav files under ./wavs into mel spectrograms under ./spmel.
# NOTE(review): presumably unnecessary when the spmel folder was already obtained from the download cell — confirm.
# ! python make_spec.py --rootDir "./wavs" --targetDir "./spmel"

In [None]:
# Optional, destructive helper (disabled by default).
# It makes every speaker folder directly under ./spmel keep the same number of entries:
# the target count is the smallest folder size, capped at 400, and every file outside a
# random selection of that size is deleted with os.remove.
# Uncomment to run; deleted files cannot be recovered.
# import os
# from random import shuffle

# num_uttrs = 400
# rootdir = './spmel'

# dirs = []
# dirpath, dirnames, filenames = next(os.walk(rootdir))
# for dir in dirnames:
#     path = os.path.join(dirpath, dir)
#     dirs.append(path)
#     print(path)
#     num_uttrs = min(num_uttrs, len(os.listdir(path)))
# print(f'num_uttrs: {num_uttrs}')

# for dir in dirs:
#     left_files = os.listdir(dir)
#     shuffle(left_files)
#     left_files = left_files[:num_uttrs]
#     for f in os.listdir(dir):
#         if f not in left_files:
#             os.remove(os.path.join(dir,f))

In [None]:
# Extract the speaker features from the mel spectrograms in ./spmel with the 3000000-BL.ckpt model.
# NOTE(review): presumably this is the step that writes spmel/train.pkl, which the inference cells load — confirm in make_d_vector.py.
! python make_d_vector.py --num_uttrs 400 --rootDir "./spmel" --model "./3000000-BL.ckpt"

# Training

In [None]:
import torch
from solver_encoder import Solver
from data_loader import get_loader

class Config:
    """Settings object handed to the data loader, the solver and the generator.

    Every value is a plain instance attribute so that it can be overridden on
    the `config` instance before training or inference starts.
    """

    def __init__(self) -> None:
        # --- consumed by get_loader ---
        self.data_dir: str = './spmel'
        self.batch_size: int = 20
        # Also the chunk length (in frames) used to slice an utterance at inference time.
        self.len_crop: int = 176

        # Not referenced in this notebook; presumably a loss weight read by Solver — TODO confirm.
        self.lambda_cd: int = 1

        # --- constructor arguments of Generator ---
        self.dim_neck: int = 32
        self.dim_emb: int = 256
        self.dim_pre: int = 512
        self.freq: int = 22

        # --- not referenced in this notebook; presumably the training schedule read by Solver — TODO confirm ---
        self.num_iters: int = 1_000_000
        self.log_step: int = 1_000
        self.early_stop: int = 20_000


config = Config()

In [None]:
# Checkpoint handed to Solver (presumably the state training resumes from — confirm in solver_encoder.py).
RESUME_CHECKPOINT = 'autovc_136000.ckpt'  # TODO: load checkpoint
# Output files written once training has finished.
BEST_WEIGHTS_PATH = 'autovc_best.ckpt'
LATEST_WEIGHTS_PATH = 'autovc_latest.ckpt'

vcc_loader = get_loader(config.data_dir, config.batch_size, config.len_crop)
solver = Solver(vcc_loader, config, checkpoint=RESUME_CHECKPOINT)
solver.train()

# Save what the solver recorded as `best_state_dict`, plus the generator weights as they are at the end of training.
torch.save(solver.best_state_dict, BEST_WEIGHTS_PATH)
torch.save(solver.G.state_dict(), LATEST_WEIGHTS_PATH)

# Create your own testing corpus

In [None]:
# Optional (disabled by default): record a mono, 22050 Hz, 16-bit clip with PyAudio from an input
# device chosen at a prompt, scale it so that its peak reaches the int16 maximum, and write it to
# WAVE_OUTPUT_FILENAME. Uncomment to run.
# NOTE(review): the read loop covers RECORD_SECONDS + 1 seconds, and `device_index` is never used
# (the index typed at the prompt is passed to audio.open instead).
# import pyaudio
# import wave
# import numpy as np

# RECORD_SECONDS = 600  # Record time (seconds)
# WAVE_OUTPUT_FILENAME = "test_wavs/t1/recordedFile_001.wav"  # Filename
# FORMAT = pyaudio.paInt16
# CHANNELS = 1
# RATE = 22050
# CHUNK = 512
# device_index = 2
# audio = pyaudio.PyAudio()

# print("----------------------record device list---------------------")
# info = audio.get_host_api_info_by_index(0)
# numdevices = info.get('deviceCount')
# for i in range(0, numdevices):
#     if audio.get_device_info_by_host_api_device_index(0, i).get('maxInputChannels') > 0:
#         print("Input Device id ", i, " - ", audio.get_device_info_by_host_api_device_index(0, i).get('name'))

# print("-------------------------------------------------------------")

# index = int(input("Enter recording device index: "))
# print("Recording via index " + str(index))

# stream = audio.open(format=FORMAT, channels=CHANNELS,
#                     rate=RATE, input=True, input_device_index=index,
#                     frames_per_buffer=CHUNK)
# print("Recording started")
# record_frames = []

# for i in range((RATE * (RECORD_SECONDS + 1)) // CHUNK):
#     data = stream.read(CHUNK)
#     record_frames.append(data)

# print("Recording stopped")

# stream.stop_stream()
# stream.close()
# audio.terminate()

# # Amplify the audio level
# audio_data = np.frombuffer(b''.join(record_frames), dtype=np.int16)
# max_amplitude = np.max(np.abs(audio_data))
# target_peak_level =  np.iinfo(np.int16).max
# gain_factor = target_peak_level / max_amplitude
# amplified_data = audio_data * gain_factor
# clipped_data = np.clip(amplified_data, -32768, 32767)
# amplified_frames = clipped_data.astype(np.int16).tobytes()

# waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
# waveFile.setnchannels(CHANNELS)
# waveFile.setsampwidth(audio.get_sample_size(FORMAT))
# waveFile.setframerate(RATE)
# waveFile.writeframes(amplified_frames)
# waveFile.close()

In [None]:
# Optional (disabled by default): cut the recording named by WAVE_OUTPUT_FILENAME (set in the
# recording cell) into time_step-second pieces, strip silence from each piece, apply 50 ms fades,
# normalize it, export it as its own wav file, and finally delete the original recording.
# NOTE(review): audio after the last time_step boundary produced by np.arange is never exported,
# and sum(audio_chunks) is the integer 0 when split_on_silence returns no chunks, so an
# all-silent piece would fail at fade_in — confirm before enabling.
# from pydub import AudioSegment
# from pydub.silence import split_on_silence
# from pydub.effects import normalize
# import numpy as np
# import os
# from tqdm import tqdm

# time_step = 5
# wav_name = WAVE_OUTPUT_FILENAME
# audio = AudioSegment.from_file(wav_name, "wav")
# audio_time = len(audio)
# cut_parameters = np.arange(time_step, audio_time / 1000, time_step)
# start_time = int(0)

# print(f'num_uttrs: {len(cut_parameters)}')

# for t in tqdm(cut_parameters):
#     stop_time = int(t * 1000)
#     audio_chunk = audio[start_time:stop_time]

#     # Remove empty spaces
#     audio_chunks = split_on_silence(
#         audio_chunk,
#         min_silence_len=500,
#         silence_thresh=-50
#     )

#     # Concatenate the non-silent chunks
#     audio_chunk = sum(audio_chunks)

#     # Remove sonic boom
#     fade_duration = 50  # milliseconds
#     audio_chunk = audio_chunk.fade_in(fade_duration).fade_out(fade_duration)

#     # Enhance SNR and sound amplitude level
#     audio_chunk = normalize(audio_chunk)

#     audio_chunk.export(f"{wav_name}-{t // time_step}.wav", format="wav")
#     start_time = stop_time
#     # print(f'done -- {wav_name}-{t // time_step}')

# os.remove(wav_name)  # Remove original file

In [None]:
# Optional: uncomment to run the same two preprocessing scripts on your own recordings,
# reading wav files from ./test_wavs and writing to ./test_spmel.
# ! python make_spec.py --rootDir "./test_wavs" --targetDir "./test_spmel"
# ! python make_d_vector.py --num_uttrs 120 --rootDir "./test_spmel" --model "./3000000-BL.ckpt"

# Inference

In [None]:
import pickle

train_pkl_path = 'spmel/train.pkl'

# NOTE: unpickling can execute arbitrary code; only load metadata files you created or trust.
with open(train_pkl_path, 'rb') as pickle_file:
    speakers = pickle.load(pickle_file)

# Show each entry's first field next to its position in the metadata.
# The inference cell uses these positions as its `source` / `target` indices.
for idx, entry in enumerate(speakers):
    print(f'{idx} --- {entry[0]}')

In [None]:
import IPython.display as ipd
import pickle
import torch
import numpy as np
from model_vc import Generator
from interface import MelVocoder  # explicit name instead of `from interface import *`
import noisereduce as nr

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device: {device}')

G = Generator(config.dim_neck, config.dim_emb, config.dim_pre, config.freq).eval().to(device)
# map_location makes a checkpoint that was saved on a GPU loadable on a CPU-only machine too.
G.load_state_dict(torch.load('models/autovc_378000_32.ckpt', map_location=device)) # TODO: change to your model

# NOTE: unpickling can execute arbitrary code; only load metadata files you created or trust.
with open('spmel/train.pkl', "rb") as org_file: # TODO: your from-corpus
    org_metadata = pickle.load(org_file)
with open('spmel/train.pkl', "rb") as trg_file: # TODO: your to-corpus
    trg_metadata = pickle.load(trg_file)

source = 0 # TODO: from-speaker
target = 1 # TODO: to-speaker
sr = 22050
n_mels = 80  # second dimension of the spectrogram arrays handled below
uttr = np.load("spmel/p232/p232_003.npy") # TODO: from-utterance
uttr_len = uttr.shape[0]
if uttr_len == 0:
    raise ValueError('The source utterance contains no frames.')

# Zero-pad up to the next multiple of len_crop. Nothing is added when the length already is a
# multiple, which avoids converting a chunk that consists only of padding.
n_pad = (-uttr_len) % config.len_crop
uttr = np.concatenate((uttr, np.zeros((n_pad, n_mels))))
uttr = uttr.reshape([-1, config.len_crop, n_mels]) # slice the audio into chunks

# Entry [1] of a metadata record is passed to the generator as the speaker embedding,
# with a leading batch dimension added.
emb_org = torch.from_numpy(np.expand_dims(org_metadata[source][1], axis=0)).to(device)
emb_trg = torch.from_numpy(np.expand_dims(trg_metadata[target][1], axis=0)).to(device)

# Convert chunk by chunk and join the results once at the end
# (concatenating inside the loop would copy the growing array on every iteration).
converted_chunks = []
with torch.no_grad():
    for chunk in uttr:
        uttr_trg = torch.from_numpy(chunk[np.newaxis]).float().to(device)
        _, x_identic_psnt, _ = G(uttr_trg, emb_org, emb_trg)
        converted_chunks.append(x_identic_psnt[0, 0, :, :].cpu().numpy())

# Drop the frames that only exist because of the zero padding.
uttr_collector = np.concatenate(converted_chunks, axis=0)[:uttr_len]

# To waveform
vocoder = MelVocoder(path=None, github=True)
# The vocoder receives the spectrogram transposed and with a leading batch dimension.
vocoder_input = torch.from_numpy(np.expand_dims(uttr_collector.T, axis=0)).float()
waveform = np.squeeze(vocoder.inverse(vocoder_input).cpu().numpy())

# Enhance audio quality
# mean, std = np.mean(waveform), np.std(waveform)
# waveform = (waveform - mean) / std

# reduced_noise = nr.reduce_noise(y=waveform, sr=sr, hop_length=256, n_fft=1024, n_jobs=-1)

# ipd.Audio turns the samples into WAV bytes (`.data`), which are saved to disk below.
audio = ipd.Audio(waveform, rate = sr)

with open('inference_003.wav', 'wb') as f: # saved file name
    f.write(audio.data)

audio