# Import Libraries

In [23]:
import torch
import torchaudio
import librosa
import soundfile as sf
from pydub import AudioSegment
import random

# Model and Audio File Settings

In [24]:
# Pretrained wav2vec 2.0 ASR pipeline bundle; it supplies the model, the
# expected sample rate and the output labels used by predict_text.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
# Instantiate the acoustic model once at module level so every prediction
# reuses it. NOTE(review): per torchaudio docs this may download the
# pretrained weights on first use - confirm network access where this runs.
model = bundle.get_model()

In [25]:
# Path of the WAV file that both the augmentation menu and the transcription
# option operate on.
AUDIO_FILE = "./speech.wav"

# Predict Function

In [None]:
def _greedy_ctc_decode(emission, labels, blank=0):
    """Greedy CTC decoding: best label per frame, merge repeats, drop blanks.

    Args:
        emission: Tensor whose last dimension indexes the labels
            (one row of scores per frame).
        labels: Sequence mapping a label index to its character.
        blank: Index of the CTC blank label.

    Returns:
        The decoded string built from the surviving labels.
    """
    best_per_frame = torch.argmax(emission, dim=-1)
    collapsed = torch.unique_consecutive(best_per_frame, dim=-1)
    # Convert to plain ints once so the comparison and the lookup do not
    # operate on 0-dim tensors element by element.
    return "".join(labels[i] for i in collapsed.tolist() if i != blank)


def predict_text(audio_path):
    """Transcribe the speech in an audio file with the module-level model.

    Args:
        audio_path: Path of the audio file to load with torchaudio.

    Returns:
        The transcript, with the "|" label replaced by spaces.
    """
    # Load
    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio.load returns one row per channel. The model would treat
    # those rows as a batch and only the first would be decoded, so mix
    # multi-channel audio down to a single channel first.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The model expects the sample rate advertised by the bundle.
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

    with torch.inference_mode():
        emission, _ = model(waveform)

    # emission[0] is the single item in the batch.
    transcript = _greedy_ctc_decode(emission[0], bundle.get_labels())
    return transcript.replace("|", " ")

# Sub Menu 1 Functions

In [None]:
def validate_filename(filename):
    """Report whether *filename* ends with ".wav".

    Prints a hint for the user when it does not.

    Returns:
        True if the name ends with ".wav", otherwise False.
    """
    has_wav_suffix = filename.endswith(".wav")
    if not has_wav_suffix:
        print("File name must end with .wav")
    return has_wav_suffix

def adjust_volume(audio, db):
    """Return *audio* with *db* added to it.

    The menu code calls this with a pydub AudioSegment and an integer;
    per pydub's documentation, adding a number to a segment applies that
    many decibels of gain (negative values make it quieter).
    """
    adjusted = audio + db
    return adjusted

def adjust_pitch(audio_path, pitch_rate):
    """Pitch-shift an audio file and return the result as an AudioSegment.

    Args:
        audio_path: Path of the audio file to load with librosa.
        pitch_rate: Shift passed to librosa as ``n_steps``.

    Returns:
        A pydub AudioSegment holding the shifted audio.
    """
    # Local imports keep this function self-contained.
    import os
    import tempfile

    # sr=None keeps the file's native sample rate instead of resampling.
    data, sample_rate = librosa.load(audio_path, sr=None)
    shifted_audio = librosa.effects.pitch_shift(data, sr=sample_rate, n_steps=pitch_rate)

    # Round-trip through a WAV file to hand the samples to pydub. A private
    # temporary directory is used so that no "shifted.wav" is left behind in
    # (or overwritten in) the working directory.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file = os.path.join(temp_dir, "shifted.wav")
        sf.write(temp_file, shifted_audio, sample_rate)
        # The segment is built inside the block, before the directory is
        # removed on exit.
        new_audio = AudioSegment.from_wav(temp_file)
    return new_audio

def apply_fade_in(audio, duration):
    """Return the result of ``audio.fade_in(duration)``.

    The menu code passes a duration in milliseconds (whole seconds * 1000).
    """
    faded = audio.fade_in(duration)
    return faded

def apply_fade_out(audio, duration):
    """Return the result of ``audio.fade_out(duration)``.

    The menu code passes a duration in milliseconds (whole seconds * 1000).
    """
    faded = audio.fade_out(duration)
    return faded

def export_file(audio, filename):
    """Export *audio* to *filename* in WAV format and confirm on stdout."""
    audio.export(filename, format="wav")
    confirmation = f"Audio successfully exported as {filename}"
    print(confirmation)

def export_function(audio):
    """Ask for an output name ending in ".wav", then export *audio* to it.

    Keeps prompting until the entered name has the ".wav" suffix.
    """
    filename = input('file name: ')
    while not filename.endswith('.wav'):
        print('filename must end with .wav')
        filename = input('file name: ')
    export_file(audio, filename)

# Menu 1

In [28]:
def _prompt_int(prompt, low, high, error_message):
    """Prompt until the user enters an integer strictly between low and high.

    Non-numeric input is treated like an out-of-range value: the error
    message is shown and the prompt is repeated instead of raising.
    """
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print(error_message)
            continue
        # Both bounds are exclusive.
        if low < value < high:
            return value
        print(error_message)


def menu1():
    """Run the audio augmentation sub-menu until the user picks "back".

    Loads AUDIO_FILE once, then repeatedly applies the chosen effect and
    asks for a file name to export the result to. Volume and fade effects
    accumulate on the loaded segment; the pitch option re-reads AUDIO_FILE
    and therefore replaces the working segment.
    """
    audio = AudioSegment.from_wav(AUDIO_FILE)
    while True:
        print('1. increase or decrease volume')
        print('2. increase or decrease pitch')
        print('3. insert fade in effect')
        print('4. insert fade out effect')
        print('5. back to main menu')

        # Non-numeric input used to crash the program with ValueError;
        # treat it like any other invalid choice.
        try:
            choice = int(input('>> '))
        except ValueError:
            print('must be 1-5')
            continue

        if choice == 1:
            db_input = _prompt_int(
                'input db: ', -10000, 100, 'must be between -10000 and 100'
            )
            audio = adjust_volume(audio, db_input)
            export_function(audio)
        elif choice == 2:
            pitch = _prompt_int('input pitch: ', -10, 10, 'must be between -10 and 10')
            audio = adjust_pitch(AUDIO_FILE, pitch)
            export_function(audio)
        elif choice == 3:
            # Random fade length of 1-5 seconds, expressed in milliseconds.
            duration = random.randint(1, 5) * 1000
            audio = apply_fade_in(audio, duration)
            export_function(audio)
        elif choice == 4:
            # Random fade length of 1-5 seconds, expressed in milliseconds.
            duration = random.randint(1, 5) * 1000
            audio = apply_fade_out(audio, duration)
            export_function(audio)
        elif choice == 5:
            break
        else:
            print('must be 1-5')

# Main Menu

In [29]:
def main_menu():
    """Run the top-level menu until the user chooses to exit.

    Option 1 opens the augmentation sub-menu, option 2 prints the
    transcript of AUDIO_FILE, option 3 leaves the loop.
    """
    while True:
        print('1. audio augmentation')
        print('2. predict text')
        print('3. exit')

        # Non-numeric input used to crash the program with ValueError;
        # report it and show the menu again instead.
        try:
            choice = int(input('>> '))
        except ValueError:
            print('must be 1-3')
            continue

        if choice == 1:
            menu1()
        elif choice == 2:
            transcript = predict_text(AUDIO_FILE)
            print(transcript)
        elif choice == 3:
            break
        else:
            # Out-of-range numbers were silently ignored before.
            print('must be 1-3')
    print('thank you for using the program')

In [30]:
# Start the interactive program; returns once the user selects "exit".
main_menu()

1. audio augmentation
2. predict text
3. exit
1. increase or decrease volume
2. increase or decrease pitch
3. insert fade in effect
4. insert fade out effect
5. back to main menu
filename must end with .wav
Audio successfully exported as hii.wav
1. increase or decrease volume
2. increase or decrease pitch
3. insert fade in effect
4. insert fade out effect
5. back to main menu
1. audio augmentation
2. predict text
3. exit
thank you for using the program
