# Import Libraries

In [None]:
import torch
import torchaudio
import librosa
import soundfile as sf
import random
from pydub import AudioSegment

# Settings Variables

In [None]:
# Pretrained wav2vec 2.0 ASR pipeline bundle. predict_text reads its
# sample rate and label set, and runs the model built from it.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
# Build the model once at module level so every prediction reuses it.
# NOTE(review): get_model() presumably downloads weights on first use — confirm.
model = bundle.get_model()

In [6]:
# Path of the WAV file read by both the augmentation menu and predict_text.
AUDIO_FILE = "./speech.wav"

# Predict Function

In [None]:
def _greedy_ctc_decode(emission, labels, blank=0):
    """Decode one utterance's emission matrix with greedy CTC decoding.

    Picks the highest-scoring label index for every frame, merges runs of
    identical consecutive indices, drops the blank index and joins the
    characters of the remaining indices.

    Args:
        emission: Tensor of per-frame label scores; the argmax is taken over
            its last dimension.
        labels: Sequence mapping a label index to its character.
        blank: Index of the CTC blank token.

    Returns:
        The decoded string made of raw label characters.
    """
    best_path = torch.argmax(emission, dim=-1)
    collapsed = torch.unique_consecutive(best_path, dim=-1)
    # tolist() yields plain ints, so they can index `labels` directly.
    return "".join(labels[i] for i in collapsed.tolist() if i != blank)


def predict_text(audio_path):
    """Transcribe the audio file at *audio_path* with the wav2vec 2.0 model.

    The audio is loaded, downmixed to a single channel, resampled to the
    sample rate of the module-level ``bundle`` when it differs, run through
    the module-level ``model`` and decoded greedily.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript, with every ``"|"`` label replaced by a space.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # The model treats dim 0 as the batch dimension. Without downmixing, a
    # multi-channel file is processed as several utterances and only the
    # first channel (emission[0]) would ever be transcribed.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, bundle.sample_rate
        )

    # Inference only: no autograd bookkeeping is needed.
    with torch.inference_mode():
        emission, _ = model(waveform)

    transcript = _greedy_ctc_decode(emission[0], bundle.get_labels())
    return transcript.replace("|", " ")

# Sub Menu Functions

In [8]:
def validate_filename(filename):
    """Check that *filename* ends with ``.wav``.

    Prints a message when the check fails.

    Args:
        filename: Candidate file name.

    Returns:
        True when the name ends with ``.wav``, otherwise False.
    """
    has_wav_suffix = filename.endswith(".wav")
    if not has_wav_suffix:
        print("Filename must ends with .wav")
    return has_wav_suffix
    
def adjust_volume(audio, volume):
    """Return *audio* with *volume* added to it.

    Args:
        audio: Object supporting ``+`` with *volume* (menu_1 passes a pydub
            ``AudioSegment``).
        volume: Amount to add; menu_1 passes the integer entered at its
            "Input db" prompt.

    Returns:
        The result of ``audio + volume``.
    """
    adjusted = audio + volume
    return adjusted

def adjust_pitch(audio_path, pitch_rate):
    """Return the audio at *audio_path* pitch-shifted by *pitch_rate* steps.

    The file is loaded with librosa at its native sample rate, shifted, and
    round-tripped through a temporary WAV file to obtain an ``AudioSegment``.

    Args:
        audio_path: Path of the source audio file.
        pitch_rate: Number of steps to shift, forwarded as ``n_steps`` to
            ``librosa.effects.pitch_shift``.

    Returns:
        An ``AudioSegment`` holding the shifted audio.
    """
    import os
    import tempfile

    data, sample_rate = librosa.load(audio_path, sr=None)
    shifted_audio = librosa.effects.pitch_shift(
        data, sr=sample_rate, n_steps=pitch_rate
    )

    # Use a private scratch directory instead of a fixed "shifted.wav" in the
    # working directory. The fixed name could overwrite a user file and was
    # never cleaned up.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_file_name = os.path.join(scratch_dir, "shifted.wav")
        sf.write(temp_file_name, shifted_audio, sample_rate)
        # Built before the directory is removed on leaving the with-block.
        new_audio = AudioSegment.from_wav(temp_file_name)
    return new_audio

def apply_fade_in(audio, duration):
    """Apply a fade-in to *audio*.

    Args:
        audio: Object exposing ``fade_in`` (menu_1 passes an ``AudioSegment``).
        duration: Fade length, forwarded unchanged to ``audio.fade_in``.

    Returns:
        Whatever ``audio.fade_in(duration)`` returns.
    """
    faded = audio.fade_in(duration)
    return faded

def apply_fade_out(audio, duration):
    """Apply a fade-out to *audio*.

    Args:
        audio: Object exposing ``fade_out`` (menu_1 passes an ``AudioSegment``).
        duration: Fade length, forwarded unchanged to ``audio.fade_out``.

    Returns:
        Whatever ``audio.fade_out(duration)`` returns.
    """
    faded = audio.fade_out(duration)
    return faded

def export_file(audio, filename):
    """Write *audio* to *filename* in WAV format and report success.

    Args:
        audio: Object exposing ``export(filename, format=...)``.
        filename: Destination file name.
    """
    audio.export(filename, format="wav")
    confirmation = f"Audio successfully exported as {filename}"
    print(confirmation)

def export_function(audio):
    """Ask for a ``.wav`` file name and export *audio* under that name.

    The prompt is repeated until the entered name ends with ``.wav``.

    Args:
        audio: Audio object handed on to ``export_file``.
    """
    filename = input("Please input the filename")
    while not filename.endswith(".wav"):
        print("Please end the filename with .wav")
        filename = input("Please input the filename")
    export_file(audio, filename)

# Menu Functions

In [None]:
def _read_int_in_range(prompt, low, high, range_message):
    """Prompt until the user enters an integer strictly between low and high.

    Args:
        prompt: Text shown by ``input``.
        low: Exclusive lower bound.
        high: Exclusive upper bound.
        range_message: Message printed when the number is out of range.

    Returns:
        The accepted integer.
    """
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            # Non-numeric input used to crash the whole menu; ask again.
            print("Input must be a whole number")
            continue
        if low < value < high:
            return value
        print(range_message)


def menu_1():
    """Run the interactive audio-augmentation sub-menu.

    Loads ``AUDIO_FILE`` once, then repeatedly offers volume, pitch, fade-in
    and fade-out edits. Each edit is exported under a user-chosen file name.
    The loop ends when the user picks option 5.
    """
    audio = AudioSegment.from_wav(AUDIO_FILE)

    while True:
        print("1. Increase or Decrease Volume")
        print("2. Increase or Decrease Pitch")
        print("3. Insert Fade In Effect")
        print("4. Insert Fade Out Effect")
        print("5. Back to the main menu")

        choice = input("Please input chose your menu")

        if choice == '1':
            db_input = _read_int_in_range(
                "Input db: ", -10000, 100,
                "db input must be between -10000 and 100",
            )
            audio = adjust_volume(audio, db_input)
            export_function(audio)

        elif choice == '2':
            pitch = _read_int_in_range(
                "Input pitch", -10, 10,
                "Pitch input must be between -10 and 10",
            )
            # NOTE: the pitch shift re-reads AUDIO_FILE, so edits applied to
            # `audio` earlier in this session are not carried over.
            audio = adjust_pitch(AUDIO_FILE, pitch)
            export_function(audio)

        elif choice == '3':
            # Random fade length: 1000 to 5000 in steps of 1000.
            duration = random.randint(1, 5) * 1000
            audio = apply_fade_in(audio, duration)
            export_function(audio)

        elif choice == '4':
            duration = random.randint(1, 5) * 1000
            audio = apply_fade_out(audio, duration)
            export_function(audio)

        elif choice == '5':
            break

        else:
            print("Input menu must be between 1-5")

# Main Menu

In [10]:
def main_menu():
    """Run the top-level interactive menu until the user chooses Exit.

    Option 1 opens the augmentation sub-menu, option 2 prints the transcript
    of ``AUDIO_FILE`` and option 3 leaves the loop. Any other input prints a
    hint and shows the menu again.
    """
    menu_lines = (
        "1. Audio Augmentation",
        "2. Predict Text",
        "3. Exit",
    )

    while True:
        for line in menu_lines:
            print(line)

        choice = input("Input your menu")

        if choice == "3":
            return
        if choice == "1":
            menu_1()
            continue
        if choice == "2":
            transcript = predict_text(AUDIO_FILE)
            print(f"Text Prediction: {transcript}")
            continue
        print("Input must be between 1-3")

In [11]:
# Start the interactive main menu; it returns once the user picks "3".
main_menu()

1. Audio Augmentation
2. Predict Text
3. Exit
1. Increase or Decrease Volume
2. Increase or Decrease Pitch
3. Insert Fade In Effect
4. Insert Fade Out Effect
5. Back to the main menu
Please end the filename with .wav
Audio successfully exported as 1.wav
1. Increase or Decrease Volume
2. Increase or Decrease Pitch
3. Insert Fade In Effect
4. Insert Fade Out Effect
5. Back to the main menu
Pitch input must be between -10 and 10
Audio successfully exported as 2.wav
1. Increase or Decrease Volume
2. Increase or Decrease Pitch
3. Insert Fade In Effect
4. Insert Fade Out Effect
5. Back to the main menu
Audio successfully exported as 3.wav
1. Increase or Decrease Volume
2. Increase or Decrease Pitch
3. Insert Fade In Effect
4. Insert Fade Out Effect
5. Back to the main menu
Audio successfully exported as 4.wav
1. Increase or Decrease Volume
2. Increase or Decrease Pitch
3. Insert Fade In Effect
4. Insert Fade Out Effect
5. Back to the main menu
1. Audio Augmentation
2. Predict Text
3. Exit
Te