# Import Libraries

In [3]:
import torch
import torchaudio
import librosa
import soundfile as sf
import random
from pydub import AudioSegment

# Settings Variables

In [4]:
# Pretrained torchaudio ASR pipeline bundle; predict_text reads its
# sample_rate and label set from it.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
# Build the model once at module level so every predict_text call reuses it.
model = bundle.get_model()

In [5]:
# Input WAV file shared by the editing submenu (menu_1) and by the
# transcription option in main_menu.
AUDIO_PATH = "./speech.wav"

# Predict Function

In [6]:
def _greedy_ctc_decode(emission, labels, blank=0):
    """Turn a per-frame emission matrix into text with greedy CTC decoding.

    Args:
        emission: Tensor of per-frame label scores; the last dimension
            indexes into ``labels``.
        labels: Sequence of label strings, indexed by class id.
        blank: Class id of the CTC blank token.

    Returns:
        The decoded string with consecutive repeats merged and blanks removed.
    """
    best_path = torch.argmax(emission, dim=-1)
    # Merge consecutive repeats first, then drop blanks; the order matters so
    # that "a, blank, a" still decodes to two separate characters.
    collapsed = torch.unique_consecutive(best_path, dim=-1)
    return "".join(labels[i] for i in collapsed.tolist() if i != blank)


def predict_text(audio_path):
    """Transcribe the audio file at ``audio_path`` with the module-level model.

    The audio is loaded, mixed down to a single channel, resampled to the
    sample rate expected by ``bundle`` when necessary, run through ``model``
    and greedily CTC-decoded.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript, with the ``|`` word-boundary label replaced by spaces.
    """
    # Load
    waveform, sample_rate = torchaudio.load(audio_path)

    # Mix down to mono. The model treats the first dimension as the batch, so
    # a stereo file would otherwise be processed as two separate utterances
    # of which only the first is decoded below.
    # (assumes torchaudio.load returns a channels-first tensor, its default)
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

    with torch.inference_mode():
        emission, _ = model(waveform)

    transcript = _greedy_ctc_decode(emission[0], bundle.get_labels())
    return transcript.replace("|", " ")

# Sub Menu Functions

In [None]:
def validate_filename(filename):
    """Report whether ``filename`` ends with the ``.wav`` extension.

    Prints a hint for the user when the extension is missing.

    Args:
        filename: Candidate output file name.

    Returns:
        True if ``filename`` ends with ``.wav``, otherwise False.
    """
    has_wav_suffix = filename.endswith(".wav")
    if not has_wav_suffix:
        print("Filename must ends with .wav")
    return has_wav_suffix
    
def adjust_volume(audio, db):
    """Return ``audio`` with ``db`` added to it.

    Args:
        audio: The audio object to adjust; menu_1 passes a pydub AudioSegment,
            for which adding a number is expected to apply a gain change.
        db: Amount to add (negative values lower the volume).

    Returns:
        The result of ``audio + db``.
    """
    adjusted = audio + db
    return adjusted

def adjust_pitch(audio_path, pitch_rate):
    """Pitch-shift the file at ``audio_path`` and return it as an AudioSegment.

    The shifted samples are round-tripped through a WAV file so pydub can load
    them. That file lives in a private temporary directory which is removed
    before returning, so nothing is left behind in (or overwritten in) the
    current working directory.

    Args:
        audio_path: Path of the audio file to read.
        pitch_rate: Shift amount, passed to librosa as ``n_steps``.

    Returns:
        A pydub ``AudioSegment`` holding the pitch-shifted audio.
    """
    import os
    import tempfile

    # sr=None keeps the file's native sample rate instead of resampling.
    data, sample_rate = librosa.load(audio_path, sr=None)
    shifted_data = librosa.effects.pitch_shift(data, sr=sample_rate, n_steps=pitch_rate)

    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_file = os.path.join(scratch_dir, "shifted.wav")
        sf.write(temp_file, shifted_data, sample_rate)
        # The segment is built while the directory still exists; the scratch
        # file is deleted when the with-block exits.
        new_audio = AudioSegment.from_wav(temp_file)

    return new_audio

def apply_fade_in(audio, duration):
    """Return ``audio`` with a fade-in applied.

    Args:
        audio: Object exposing a ``fade_in`` method (menu_1 passes a pydub
            AudioSegment).
        duration: Fade length forwarded to ``audio.fade_in``; menu_1 passes
            a value in whole thousands (1000-5000).

    Returns:
        Whatever ``audio.fade_in(duration)`` returns.
    """
    faded = audio.fade_in(duration)
    return faded

def apply_fade_out(audio, duration):
    """Return ``audio`` with a fade-out applied.

    Args:
        audio: Object exposing a ``fade_out`` method (menu_1 passes a pydub
            AudioSegment).
        duration: Fade length forwarded to ``audio.fade_out``; menu_1 passes
            a value in whole thousands (1000-5000).

    Returns:
        Whatever ``audio.fade_out(duration)`` returns.
    """
    faded = audio.fade_out(duration)
    return faded

def export_file(audio, filename):
    """Write ``audio`` to ``filename`` in WAV format and confirm on stdout.

    Args:
        audio: Object exposing ``export(filename, format=...)`` (a pydub
            AudioSegment in this notebook).
        filename: Destination path for the exported audio.
    """
    audio.export(filename, format="wav")
    # Only reached when the export call did not raise.
    confirmation = f"Audio successfully exported as {filename}"
    print(confirmation)

def export_functionn(audio):
    """Ask the user for a ``.wav`` file name and export ``audio`` to it.

    The prompt repeats until ``validate_filename`` accepts the name, which
    keeps the extension rule and its error message in one place.

    Args:
        audio: The audio object handed on to ``export_file``.
    """
    while True:
        filename = input("Please input the filename")
        # validate_filename prints the hint itself when the name is rejected.
        if validate_filename(filename):
            break

    export_file(audio, filename)

# Menu Functions

In [None]:
def _prompt_int_between(prompt, low, high, error_message):
    """Ask until the user enters an integer strictly between ``low`` and ``high``.

    Non-numeric input is reported with ``error_message`` and the prompt is
    repeated instead of raising ``ValueError``.
    """
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print(error_message)
            continue
        if low < value < high:
            return value
        print(error_message)


def menu_1():
    """Run the interactive audio-editing submenu.

    Loads ``AUDIO_PATH`` once, then repeatedly lets the user change volume or
    pitch or add a fade-in/fade-out, exporting the result after each edit,
    until option 5 is chosen.
    """
    audio = AudioSegment.from_wav(AUDIO_PATH)

    while True:

        print('1. increase or decrease volume')
        print('2. increase or decrease pitch')
        print('3. insert fade in effect')
        print('4. insert fade out effect')
        print('5. back to main menu')

        choice = input(">> ")

        if choice == '1':
            db = _prompt_int_between(
                "Input the volume between -10000 and 100",
                -10000,
                100,
                "db must be between -10000 and 100",
            )
            audio = adjust_volume(audio, db)
            export_functionn(audio)

        elif choice == '2':
            pitch = _prompt_int_between(
                "Input the pitch between -10 and 10",
                -10,
                10,
                "pitch must be between -10 and 10",
            )
            # NOTE(review): pitch shifting re-reads AUDIO_PATH, so volume/fade
            # edits made earlier in this session are not carried into the
            # result - confirm whether that is intended.
            audio = adjust_pitch(AUDIO_PATH, pitch)
            export_functionn(audio)

        elif choice == '3':
            # Random fade length of 1000-5000 in steps of 1000.
            duration = random.randint(1, 5) * 1000
            audio = apply_fade_in(audio, duration)
            export_functionn(audio)

        elif choice == '4':
            duration = random.randint(1, 5) * 1000
            audio = apply_fade_out(audio, duration)
            export_functionn(audio)

        elif choice == '5':
            break

        else:
            print("Submenu input must be between 1-5")

# Main Menu

In [9]:
def main_menu():
    """Run the top-level interactive menu until the user chooses to exit.

    Option 1 opens the audio-editing submenu, option 2 prints the transcript
    of ``AUDIO_PATH``, option 3 leaves the loop. Any other input is reported
    and the menu is shown again.
    """
    while True:

        print("1. Audio Segmentation")
        print("2. Predict Text")
        print("3. Exit")

        choice = input("Input your menu choice")

        if choice == '1':
            menu_1()
        elif choice == '2':
            transcript = predict_text(AUDIO_PATH)
            print(transcript)
        elif choice == '3':
            print("Thank youu :)")
            break
        else:
            # Mirror the submenu: tell the user why the menu is shown again.
            print("Menu input must be between 1-3")

In [10]:
# Start the interactive program; the call returns once the user picks "3. Exit".
main_menu()

1. Audio Segmentation
2. Predict Text
3. Exit
1. increase or decrease volume
2. increase or decrease pitch
3. insert fade in effect
4. insert fade out effect
5. back to main menu


TypeError: '>' not supported between instances of 'str' and 'int'