In [1]:
# !git clone https://github.com/Nik-Kras/voice_ukr_to_eng.git
%pip install pytube pydub tortoise-tts torch torchaudio librosa soundfile -q

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from youtube_utils import get_audio_from_youtube
from audio_processing_utils import read_audio, cut_sound

url = "https://youtu.be/NFTAixnLcI8?si=pCBYro4ZlOwcFToK"

audio_path = get_audio_from_youtube(url)
audio_path = "data/youtube_audio.wav" # Bug
audio, sample_rate = read_audio(audio_path)
voice_only = cut_sound(audio, sample_rate, start_time=30)

In [3]:
# Preview
from IPython.display import Audio, display

display(Audio(data=cut_sound(voice_only, sample_rate, end_time=60), rate=sample_rate))

In [4]:
voice_samples = [
    cut_sound(voice_only, sample_rate, start_time=0,  end_time=10),
    cut_sound(voice_only, sample_rate, start_time=10, end_time=20),
    cut_sound(voice_only, sample_rate, start_time=20, end_time=30),
    cut_sound(voice_only, sample_rate, start_time=30, end_time=40),
    cut_sound(voice_only, sample_rate, start_time=40, end_time=50),
    cut_sound(voice_only, sample_rate, start_time=50, end_time=60)
]

from audio_processing_utils import save_audio, resample

TORTUISE_SAMPLE_RATE = 22_050

for i, voice_sample in enumerate(voice_samples):
  save_audio(
      audio_data=resample(voice_sample, sample_rate, TORTUISE_SAMPLE_RATE),
      sample_rate=TORTUISE_SAMPLE_RATE,
      file_path=f"data/voices/voice1/sample_{i}.wav"
  )

# Sanity Check
audio1, sample_rate = read_audio("data/voices/voice1/sample_1.wav")
audio2, sample_rate = read_audio("data/voices/voice1/sample_5.wav")


display(Audio(data=audio1, rate=sample_rate))
display(Audio(data=audio2, rate=sample_rate))

In [6]:
urk_text = """Маленький острів джень бало-дало, з китайської дорогоцінний, відомий також під російською назвою даманський, в березні тисяча дев'ятсот шісдесят дев'ятого року опинився в епіцентрі міжнародної політики. Саме звідти могла початися велика війна між колишніми союзниками та стратегічними партнерами: СССР і китайською народною республікою. Однопартійними диктаторськими режимами які володіли ядерною зброєю, чому розпочався та як відбувався совєтськокитайський конфлікт, на яких ще ділянках кордону відбувалися зіткнення, а зрештою, хто став переморцем? Про все це й набагато більше дивіться далі у відео. Слава Україні! Мене звати Владлен Мараєв і ви на Історії Без Міфів, де ми розповідаємо про минуле україни та світу без прикрас і фальсифікацій. Підписуйтеся на наш канал і натискайте звіночок аби не пропустити нових випусків хай квітне український ютюб!"""
eng_text = """The small island of Jin balo-dalo, from the Chinese word for precious, also known by the Russian name of Damansky, in March 1969 found itself at the epicenter of international politics. It was from there that a great war could begin between former allies and strategic partners: the USSR and the People's Republic of China. One-party dictatorial regimes that possessed nuclear weapons, why did the Soviet-Chinese conflict begin and how did it take place, on what other areas of the border did clashes take place, and in the end, who became the victor? See all this and much more in the video below. Glory to Ukraine! My name is Vladlen Maraev and you are on History Without Myths, where we talk about the past of Ukraine and the world without embellishments and falsifications. Subscribe to our channel and press the bell so you don't miss new releases, let Ukrainian YouTube flourish!"""
 
text_list = []
current_char_num = 0
text_buffer = ""
MAX_CHAR = 400
for x in eng_text.split("."):
    if current_char_num + len(x) > MAX_CHAR:
        text_list.append(text_buffer)
        text_buffer = x
        current_char_num = len(x)
    else:
        current_char_num += len(x)
        text_buffer += x
text_list.append(text_buffer)

print(text_list)

["The small island of Jin balo-dalo, from the Chinese word for precious, also known by the Russian name of Damansky, in March 1969 found itself at the epicenter of international politics It was from there that a great war could begin between former allies and strategic partners: the USSR and the People's Republic of China", ' One-party dictatorial regimes that possessed nuclear weapons, why did the Soviet-Chinese conflict begin and how did it take place, on what other areas of the border did clashes take place, and in the end, who became the victor? See all this and much more in the video below', " Glory to Ukraine! My name is Vladlen Maraev and you are on History Without Myths, where we talk about the past of Ukraine and the world without embellishments and falsifications Subscribe to our channel and press the bell so you don't miss new releases, let Ukrainian YouTube flourish!"]


In [10]:
from tortoise.utils.text import split_and_recombine_text

tts_text = split_and_recombine_text(eng_text)
tts_text

['The small island of Jin balo-dalo, from the Chinese word for precious, also known by the Russian name of Damansky, in March 1969 found itself at the epicenter of international politics.',
 "It was from there that a great war could begin between former allies and strategic partners: the USSR and the People's Republic of China.",
 'One-party dictatorial regimes that possessed nuclear weapons, why did the Soviet-Chinese conflict begin and how did it take place, on what other areas of the border did clashes take place, and in the end, who became the victor?',
 'See all this and much more in the video below. Glory to Ukraine! My name is Vladlen Maraev and you are on History Without Myths, where we talk about the past of Ukraine and the world without embellishments and falsifications.',
 "Subscribe to our channel and press the bell so you don't miss new releases, let Ukrainian YouTube flourish!"]

In [13]:
from time import time
import os
from tortoise.utils.audio import load_voice
from tortoise.api import TextToSpeech
import torchaudio

# This will download all the models used by Tortoise from the HuggingFace hub.
tts = TextToSpeech()

seed = int(time())
voice_samples, conditioning_latents = load_voice(voice="voice1", extra_voice_dirs=["./data/voices"])
gen = tts.tts_with_preset(tts_text[0], voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                              preset="high_quality", k=1, use_deterministic_seed=seed)
gen = gen.squeeze(0).cpu()
torchaudio.save(os.path.join("data/", f'fast_gen.wav'), gen, 24000)

Some weights of the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.o

Generating autoregressive samples..


100%|██████████| 16/16 [15:18<00:00, 57.41s/it]


Computing best candidates using CLVP


100%|██████████| 16/16 [00:02<00:00,  7.09it/s]


Transforming autoregressive outputs into audio..


  return F.conv1d(input, weight, bias, self.stride,
100%|██████████| 400/400 [00:37<00:00, 10.66it/s]


In [11]:
from tts_utils import tts_model

speech_processor = tts_model()
speech_processor.text_to_speech(tts_text[0], quality="fast", voice_set_name="voice1", file_name=f"fast")

Generating autoregressive samples..


100%|██████████| 6/6 [09:49<00:00, 98.29s/it] 


Computing best candidates using CLVP


100%|██████████| 6/6 [00:00<00:00,  7.39it/s]


Transforming autoregressive outputs into audio..


  return F.conv1d(input, weight, bias, self.stride,
100%|██████████| 80/80 [00:07<00:00, 11.34it/s]


In [14]:
speech_processor = tts_model()
speech_processor.text_to_speech(tts_text[0], quality="standard", voice_set_name="voice1", file_name=f"standard")

Loading models...


Some weights of the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.o

Models are ready!
Generating autoregressive samples..


 56%|█████▋    | 9/16 [08:59<07:14, 62.11s/it]

In [None]:
for ind, text in enumerate(text_list):
  speech_processor.text_to_speech(text, quality="fast", voice_set_name="voice1", file_name=f"high_quality_{ind}")

# Full Audio 

In [None]:
with open("full_text.txt", 'r', encoding='utf-8') as f:
    text = ' '.join([l for l in f.readlines()])
text[:50]

In [None]:
texts = split_and_recombine_text(text)
texts[:3]

In [None]:
tts = TextToSpeech()
voice_samples, conditioning_latents = load_voice(voice="voice2", extra_voice_dirs=["./data/voices"])

In [None]:
import torch
import IPython.display

all_parts = []
for j, text in enumerate(texts):
    gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                              preset="high_quality", k=1, use_deterministic_seed=seed)
    gen = gen.squeeze(0).cpu()
    torchaudio.save(os.path.join("data", f'{j}.wav'), gen, 24000)
    all_parts.append(gen)

full_audio = torch.cat(all_parts, dim=-1)
torchaudio.save(os.path.join("data", 'combined.wav'), full_audio, 24000)
IPython.display.Audio(os.path.join("data", 'combined.wav'))