# <b><ins>Installing dependencies</ins></b>

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Return 'UTF-8' unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"
# Monkey-patch the standard library so that every later call to
# locale.getpreferredencoding() reports UTF-8, whatever the real locale is.
# NOTE(review): presumably a workaround for a hosted-notebook runtime whose
# locale is not UTF-8 - confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install pytube
!pip install git+https://github.com/openai/whisper.git -q
!pip install pydub
!pip install translate
!pip install librosa
!git clone https://github.com/x4nth055/gender-recognition-by-voice.git
!sudo apt-get install portaudio19-dev
!pip install pyaudio
!pip install aksharamukha
!pip install audio_effects
!pip install omegaconf

In [None]:
import os

# The cloned directory name contains hyphens, which are not allowed in a Python
# package name; rename it so the `gender_recognition_by_voice` imports used
# later in the notebook can resolve.
# Skip the rename when the target already exists so this cell can be re-run.
if not os.path.isdir('gender_recognition_by_voice'):
    os.rename('gender-recognition-by-voice', 'gender_recognition_by_voice')

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# <b><ins>Downloading a video from YouTube and separating its audio and video</ins></b>

In [None]:
import pytube
import os
import subprocess
from pydub import AudioSegment
from moviepy.editor import AudioFileClip
import numpy as np

In [None]:
def convert_to_mp3(audio_obj, audio_file_path):
  """Convert a downloaded audio stream to MP3 by invoking ffmpeg.

  The input file is ``audio_obj.default_filename`` inside the directory that
  contains ``audio_file_path``; the output is written next to it with the
  extension replaced by ``.mp3``.

  Args:
    audio_obj: object exposing ``default_filename`` (the downloaded stream).
    audio_file_path: path of a file located in the download directory; only
      its directory part is used.
  """
  # os.path.dirname gives the real directory. The previous
  # `str.strip(filename)` removed a *set of characters* from both ends of the
  # path rather than the filename suffix, so it only worked by accident.
  directory = os.path.dirname(audio_file_path)
  # splitext copes with extensions of any length (e.g. '.webm'), unlike [:-4].
  stem, _ = os.path.splitext(audio_obj.default_filename)
  subprocess.run([
      'ffmpeg',
      '-i', os.path.join(directory, audio_obj.default_filename),
      os.path.join(directory, stem + '.mp3'),
  ])

In [None]:
def cut_audio(audio_file_path, start_time, end_time):
    """Cut ``[start_time, end_time]`` out of an audio file with ffmpeg.

    The clip is written next to the input as
    ``<name>_clip_<start_time>_to_<end_time><ext>`` using stream copy
    (``-acodec copy``). The input file is left in place.

    Args:
        audio_file_path: path of the audio file to cut.
        start_time: clip start; passed to ffmpeg ``-ss``.
        end_time: clip end; ``end_time - start_time`` is passed to ``-t``.

    Returns:
        The path of the clip. ffmpeg's exit status is not checked, so the
        path is returned even if the command failed.
    """
    import shlex  # local import so this cell does not rely on another cell

    # splitext only splits at the final extension, so titles or directories
    # that contain extra dots (or no dot at all) are handled correctly.
    root, ext = os.path.splitext(audio_file_path)
    new_audio_file_path = f'{root}_clip_{start_time}_to_{end_time}{ext}'
    duration = end_time - start_time
    # shlex.quote protects paths containing quotes, '$', backticks, etc.
    ffmpeg_cmd_audio = (
        f'ffmpeg -i {shlex.quote(audio_file_path)} -ss {start_time} -t {duration} '
        f'-acodec copy {shlex.quote(new_audio_file_path)}'
    )
    os.system(ffmpeg_cmd_audio)
    return new_audio_file_path

In [None]:
def cut_video_and_audio(audio_file_path, video_file_path, start_time, end_time):
  """Clip an audio file and a video file to the same time window with ffmpeg.

  Each clip is written next to its input as
  ``<name>_clip_<start_time>_to_<end_time><ext>`` using stream copy. Once a
  clip exists on disk, the corresponding un-clipped input is deleted.

  Args:
    audio_file_path: path of the audio file to clip.
    video_file_path: path of the video file to clip.
    start_time: clip start; passed to ffmpeg ``-ss``.
    end_time: clip end; ``end_time - start_time`` is passed to ``-t``.

  Returns:
    Tuple ``(new_audio_file_path, new_video_file_path)``. ffmpeg's exit status
    is not checked, so the paths are returned even if a command failed.
  """
  import shlex  # local import so this cell does not rely on another cell

  def clip_path(path):
    # Insert '_clip_{st}_to_{et}' before the final extension only, so extra
    # dots in a title or directory name do not corrupt the result.
    root, ext = os.path.splitext(path)
    return f'{root}_clip_{start_time}_to_{end_time}{ext}'

  new_audio_file_path = clip_path(audio_file_path)
  new_video_file_path = clip_path(video_file_path)

  duration = end_time - start_time

  # Build the FFmpeg commands; shlex.quote protects unusual characters in paths.
  ffmpeg_cmd_video = (
      f'ffmpeg -i {shlex.quote(video_file_path)} -ss {start_time} -t {duration} '
      f'-c:v copy -c:a copy {shlex.quote(new_video_file_path)}'
  )
  ffmpeg_cmd_audio = (
      f'ffmpeg -i {shlex.quote(audio_file_path)} -ss {start_time} -t {duration} '
      f'-acodec copy {shlex.quote(new_audio_file_path)}'
  )

  os.system(ffmpeg_cmd_video)
  os.system(ffmpeg_cmd_audio)

  # Remove the older audio and video files, but only when the matching clip is
  # present, so a failed ffmpeg run does not throw away the download.
  for original_path, clipped_path in (
      (audio_file_path, new_audio_file_path),
      (video_file_path, new_video_file_path),
  ):
    if os.path.exists(clipped_path):
      os.remove(original_path)

  return new_audio_file_path, new_video_file_path



In [None]:
def seperate_audio_and_video(video, clip_start_time=0, clip_end_time=60):
  """Download a YouTube video's audio and 1080p video streams and clip both.

  The audio-only stream is saved under ``audio_data`` and the first 1080p
  stream whose first codec is 'vp9' or 'av01.0.08M.08' under ``video_data``.
  Both files are then cut to ``[clip_start_time, clip_end_time]``.

  Args:
    video: URL handed to ``pytube.YouTube``.
    clip_start_time: clip start in seconds.
    clip_end_time: clip end in seconds.

  Returns:
    Tuple ``(audio_file_path, video_file_path)`` pointing at the clipped files.

  Raises:
    ValueError: if the video offers no 1080p stream with a supported codec.
  """
  data = pytube.YouTube(video)
  # Download the audio-only stream and, separately, a matching video stream.
  audio = data.streams.get_audio_only()
  audio_file_path = audio.download("audio_data")
  vids = data.streams
  video_file_path = ""
  for i, stream in enumerate(vids):
    if stream.resolution == '1080p' and stream.codecs[0] in ['vp9', 'av01.0.08M.08']:
      print(i,'-',stream)
      video_file_path = stream.download("video_data")
      break
  if not video_file_path:
    # Fail here with a clear message instead of passing an empty path on to
    # the clipping helper, where it would surface as an unrelated error.
    raise ValueError(
        "No 1080p stream with codec 'vp9' or 'av01.0.08M.08' is available for " + str(video)
    )
  print("a",video_file_path)

  # Creating a short clip to process on
  audio_file_path, video_file_path = cut_video_and_audio(audio_file_path, video_file_path, clip_start_time, clip_end_time)

  # Converting mp4 audio to mp3
  # NOTE(review): cut_video_and_audio removes the un-clipped download, while
  # convert_to_mp3 builds its input path from audio.default_filename, so this
  # conversion may be pointed at a file that no longer exists - confirm.
  # The path returned below is the clipped audio file, not the MP3.
  convert_to_mp3(audio, audio_file_path)

  return audio_file_path, video_file_path

In [None]:
# Paste the URL of the YouTube video to dub below.
# clip_start_time / clip_end_time (in seconds) select the portion to process.
# Caution: the video must offer a 1080p stream whose codec is 'vp9' or 'av01.0.08M.08'.
original_audio_file_path, original_video_file_path = seperate_audio_and_video(
    "https://www.youtube.com/live/n0f9D-vP5Pc?feature=share",
    clip_start_time=1,
    clip_end_time=60,
)

# <b><ins>Speech Recognition model</ins></b>
### We use OpenAI's Whisper model for speech recognition. It converts the Hinglish audio to English text for further processing.

In [None]:
import whisper

In [None]:
model = whisper.load_model('large')

In [None]:
# Extra options forwarded to transcribe() as keyword arguments.
decode_options = dict(language="english")
# The result's 'segments' (and each segment's 'words') are consumed by the
# following cells.
text = model.transcribe(
    original_audio_file_path,
    verbose=True,
    word_timestamps=True,
    initial_prompt="Indian guy/lady teaching stuff",
    **decode_options
)

In [None]:
text['text']

In [None]:
# To generate a word based transcript, just for testing purposes
def generate_transcript(text):
  """Print one line per recognised word: ``start --> end : word  probability``.

  Words are taken from every segment's 'words' list in order; a word is
  printed only when its probability is greater than 0.00001. The probability
  is shown rounded to 5 decimal places. Meant for manual inspection only.
  """
  confident_words = (
      entry
      for segment in text['segments']
      for entry in segment['words']
      if entry['probability'] > 0.00001
  )
  for entry in confident_words:
    begin, finish = entry['start'], entry['end']
    token = entry['word']
    score = round(entry['probability'], 5)
    print(f"{begin} --> {finish} : {token}  {score}")

In [None]:
generate_transcript(text)

# <b><ins>Machine translation</ins></b>
### We are using google translate to convert the English text to Telugu text sentence by sentence, keeping each sentence's timestamps.

In [None]:
from translate import Translator

In [None]:
%%time
# Translate every transcribed segment and keep its original timestamps.
telugu_transcript = []
# The translator does not depend on the segment, so build it once instead of
# once per iteration.
translator = Translator(to_lang="te")
for segment in text['segments']:
    result = translator.translate(segment['text'])
    telugu_transcript.append(
        {
            "word": result,
            "start": segment['start'],
            "end": segment['end'],
        }
    )

In [None]:
telugu_transcript

In [None]:
# Flatten the per-sentence translations into one string; every sentence
# (including the last) is followed by a single space.
telugu_text = "".join(sentence['word'] + " " for sentence in telugu_transcript)
telugu_text

# <b><ins>Gender Recognition </ins></b>
### We now use an artificial neural network with 5 dense layers (from 256 units down to 64 units) to recognize the gender of the speaker's voice.

In [None]:
from gender_recognition_by_voice.utils import load_data, split_data, create_model
from gender_recognition_by_voice import test
import librosa

In [None]:
model = create_model()

In [None]:
clipped_audio_file_path = cut_audio(original_audio_file_path, 10, 100)

In [None]:
model.load_weights("gender_recognition_by_voice/results/model.h5")

In [None]:
def extract_feature(file_name, **kwargs):
    """Build a 1-D feature vector for an audio file.

    Every enabled feature is computed with librosa, reduced with a mean along
    axis 0 of the transposed feature matrix (presumably the time-frame axis -
    confirm), and the results are concatenated in the fixed order
    mfcc, chroma, mel, contrast, tonnetz.

    Args:
        file_name: path of the audio file, loaded with ``librosa.core.load``.
        **kwargs: truthy/falsy flags ``mfcc``, ``chroma``, ``mel``,
            ``contrast`` and ``tonnetz``; a missing flag counts as disabled.

    Returns:
        A 1-D numpy array; empty when no feature is enabled.
    """
    use_mfcc = kwargs.get("mfcc")
    use_chroma = kwargs.get("chroma")
    use_mel = kwargs.get("mel")
    use_contrast = kwargs.get("contrast")
    use_tonnetz = kwargs.get("tonnetz")

    # Decode the audio once; the file was previously loaded twice.
    X, sample_rate = librosa.core.load(file_name)

    # The magnitude spectrogram is only needed by chroma and contrast.
    if use_chroma or use_contrast:
        stft = np.abs(librosa.stft(X))

    result = np.array([])
    if use_mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if use_chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        result = np.hstack((result, chroma))
    if use_mel:
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
        result = np.hstack((result, mel))
    if use_contrast:
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
        result = np.hstack((result, contrast))
    if use_tonnetz:
        tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
        result = np.hstack((result, tonnetz))
    return result

In [None]:
# The current model is trained only on mel-spectrogram features.
features_to_extract = {'mfcc':False, 'chroma':False, 'mel':True, 'contrast':False, 'tonnetz':False}
features = extract_feature(clipped_audio_file_path, **features_to_extract).reshape(1, -1)

In [None]:
male_prob = model.predict(features)[0][0]

In [None]:
male_prob

In [None]:
# Threshold the predicted male probability at 0.5 to pick the voice used for
# speech synthesis later on.
speaker_gender = "male" if male_prob > 0.5 else "female"

# <b><ins>Speech Synthesis model</ins></b>
### Here too, speech synthesis is done sentence by sentence, and finally all the sentences are concatenated to form the output speech.
### We first use 'aksharamukha' for transliteration; this library uses ISO romanization techniques to transliterate Indic text. The transliterated text is then fed to the 'silero-tts' model for speech synthesis. Finally, some audio processing is done to sync the timestamps.

In [None]:
original_audio = AudioSegment.from_file(original_audio_file_path)

In [None]:
import torch
from aksharamukha import transliterate
from scipy.io import wavfile
import torchaudio

In [None]:
%%time
# Synthesise every translated sentence, fit it into the time slot of the
# original sentence, and concatenate everything into one audio track.
akshara_output_audio = AudioSegment.empty()

# Load the TTS model once: it does not depend on the sentence, so reloading it
# for every segment only costs time.
model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                     model='silero_tts',
                                     language='indic',
                                     speaker='v3_indic',
                                     verbose=False)

for i, segment in enumerate(telugu_transcript):
  roman_text = transliterate.process('Telugu', 'ISO', segment['word'])
  audio_path = model.save_wav(text=roman_text, speaker=f'telugu_{speaker_gender}', sample_rate=48000)
  temp_audio_segment = AudioSegment.from_wav(audio_path)

  actual_duration = (segment['end'] - segment['start'])

  # If output sentence audio duration is more than actual sentence audio's duration then speed it up
  # Else keep it as it is and add uniform pauses in the start and end of the sentence.
  # To preserve the audio quality, speed down of audio was not performed.
  # A zero-length source segment would divide by zero; treat it as needing the
  # maximum allowed speed-up instead.
  if actual_duration > 0:
    speed = temp_audio_segment.duration_seconds/actual_duration
  else:
    speed = 1.2
  if speed > 1.01:
    # Never speed up by more than 20% so the speech stays intelligible.
    speed = min(speed, 1.2)
    temp_audio_segment = temp_audio_segment.speedup(playback_speed = speed)
  elif speed < 1:
    sentence_pause_duration = (actual_duration - temp_audio_segment.duration_seconds)*1000
    temp_audio_segment = AudioSegment.silent(duration=sentence_pause_duration/2) + temp_audio_segment + AudioSegment.silent(duration=sentence_pause_duration/2)

  # For pauses between sentences
  if i == 0:
    pause_duration = segment['start'] - 0
  else:
    pause_duration = segment['start'] - telugu_transcript[i-1]['end']

  akshara_output_audio = akshara_output_audio + AudioSegment.silent(duration=pause_duration*1000) + temp_audio_segment
  # The synthesised wav is only a temporary file.
  os.remove(audio_path)

# Match the original audio's format.
akshara_output_audio = akshara_output_audio.set_frame_rate(original_audio.frame_rate)
akshara_output_audio = akshara_output_audio.set_channels(original_audio.channels)
akshara_output_audio = akshara_output_audio.set_sample_width(original_audio.sample_width)

# Final check to bring the output audio's duration close to the input audio's.
# The above speedup of sentences still led to some minor differences between actual sentence duration and output sentence duration
final_speed = akshara_output_audio.duration_seconds/original_audio.duration_seconds
# Only speed up when the output is actually longer: speedup() shortens audio by
# discarding parts of it, so a factor <= 1 is not meaningful. The threshold
# matches the per-sentence one above.
if final_speed > 1.01:
  akshara_output_audio = akshara_output_audio.speedup(playback_speed=final_speed)

akshara_output_audio.export('aksharamukha_output_audio.mp3', format='mp3')


In [None]:
akshara_output_audio

In [None]:
original_audio

# <b><ins>Combining Audio and Video Clips</ins></b>
### We used 'moviepy' for combining the video with the output audio.
### It will output the final video as 'output_video.webm'.

In [None]:
from moviepy.editor import *

In [None]:
clip = VideoFileClip(original_video_file_path)

In [None]:
clip.duration

In [None]:
audioclip = AudioFileClip('aksharamukha_output_audio.mp3')

In [None]:
# Trim whichever track is longer so that audio and video end together.
if audioclip.duration <= clip.duration:
  print(f"Video clip is longer by {clip.duration - audioclip.duration}")
  clip = clip.subclip(0, audioclip.duration)
else:
  print(f"Audio clip is longer by {audioclip.duration - clip.duration}")
  audioclip = audioclip.subclip(0, clip.duration)

In [None]:
audioclip.duration, clip.duration

In [None]:
# adding audio to the video clip
videoclip = clip.set_audio(audioclip)

In [None]:
 # Write the dubbed clip (video plus the new audio track) to 'output_video.webm'
videoclip.write_videofile("output_video.webm")