In [None]:
!pip install webrtcvad -q
!apt-get install ffmpeg -q

In [None]:
import numpy as np
import pandas as pd
import os
import time
import shutil
import librosa
from transformers import pipeline
from tqdm import tqdm
from glob import glob
from pydub import AudioSegment
from pydub.silence import split_on_silence
from IPython.display import FileLink, FileLinks 
import warnings
import subprocess

In [None]:
# Silence every warning for the rest of the session (library deprecation noise included).
warnings.simplefilter("ignore")

# Hugging Face checkpoint used for speech recognition.
ASR_MODEL_ID = "bengaliAI/tugstugi_bengaliai-regional-asr_whisper-medium"

# Shared speech-recognition pipeline; `infer_tugstugi` reads this module-level object.
pipe = pipeline(task="automatic-speech-recognition", model=ASR_MODEL_ID)

In [None]:
def split_audio_by_voice(input_file, output_folder, min_silence_len=1000, silence_thresh=-40, keep_silence=500):
    """Split an audio file on silent stretches and save every piece as a WAV file.

    The split is a plain loudness threshold (pydub's ``split_on_silence``),
    not a model-based voice-activity detector.

    Args:
        input_file: Path of the audio file to load with ``AudioSegment.from_file``.
        output_folder: Directory that receives the chunks; created when missing.
        min_silence_len: Minimum length of a silent stretch, in milliseconds,
            for it to count as a split point.
        silence_thresh: Level below which audio is treated as silence
            (pydub expresses it in dBFS).
        keep_silence: Milliseconds of silence kept at the edges of each chunk.

    Returns:
        None. Chunks are written as ``chunk_0001.wav``, ``chunk_0002.wav``, ...
    """
    os.makedirs(output_folder, exist_ok=True)  # no check-then-create race

    audio = AudioSegment.from_file(input_file)

    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    # Four digits as before; widened only when there are more than 9999 chunks so
    # that a lexicographic sort of the file names still matches the playback order.
    width = max(4, len(str(len(chunks))))

    for index, chunk in enumerate(chunks, start=1):
        output_file = os.path.join(output_folder, f"chunk_{index:0{width}d}.wav")
        # export() hands back the open output file object; close it right away
        # instead of leaving the handle to the garbage collector.
        chunk.export(output_file, format="wav").close()


def hms_format(seconds:float, explicit_format=False) -> str:
    """Format a duration given in seconds as hours, minutes and seconds.

    The duration is rounded to the nearest whole second *before* it is split
    into fields, so a value such as 59.6 carries over into the minutes
    ("0:01:00") instead of producing an invalid "0:00:60".

    Args:
        seconds: Duration in seconds (int or float).
        explicit_format: When True return "H hours MM minutes SS seconds",
            otherwise the compact "H:MM:SS".

    Returns:
        The formatted duration string.
    """
    total_seconds = int(round(seconds))  # int() also normalises numpy scalars
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if explicit_format:
        return "{} hours {:02} minutes {:02} seconds".format(hours, minutes, secs)
    return "{}:{:02}:{:02}".format(hours, minutes, secs)


def infer_tugstugi(aud_path):
    """Transcribe one audio file with the module-level `pipe` object.

    Args:
        aud_path: Path of the audio file handed to the pipeline.

    Returns:
        The 'text' entry of the pipeline result, converted to `str`.
    """
    result = pipe(aud_path)
    return str(result['text'])

In [None]:
# Locations used by the notebook.
input_dir = "/kaggle/input/interview-audios"          # recordings to transcribe
output_dir = "/kaggle/working/chunks"                  # silence-split chunks of the current recording
output_conv_dir = "/kaggle/working/converted_audio"    # WAV copies of non-WAV recordings

# Both scratch folders must exist before the processing loop starts.
for scratch_dir in (output_dir, output_conv_dir):
    os.makedirs(scratch_dir, exist_ok=True)

# One row per processed recording.
df = pd.DataFrame(
    columns=[
        "file_name",
        "file_format",
        "transcriptions",
        "original_audio_length",
        "trimmed_audio_length",
        "total_transcription_time",
    ]
)

# Running totals in seconds, accumulated over all recordings.
orig_audio_length = trim_audio_length = trans_audio_length = 0

In [None]:
# Batch transcription: for every recording in `input_dir`
#   1. convert it to WAV with ffmpeg when it is not already a .wav file,
#   2. split it on silence into chunks,
#   3. transcribe the chunks in order and join the texts,
#   4. append one row to `df` and checkpoint the table to Excel.

MAX_RETRIES = 20  # extra attempts for a chunk whose first transcription call raises


def _reset_dir(path):
    """Delete `path` together with its contents and recreate it empty."""
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)


for file_name in sorted(os.listdir(input_dir)):  # sorted -> reproducible row order
    input_audio = os.path.join(input_dir, file_name)
    if not os.path.isfile(input_audio):
        continue  # sub-directories cannot be transcribed

    file_ext = os.path.splitext(file_name)[1]

    # Start from empty scratch folders so that chunks left behind by an
    # interrupted or failed run are never transcribed as part of this file.
    _reset_dir(output_dir)
    _reset_dir(output_conv_dir)

    # Convert to WAV if the file is not WAV
    if not file_name.endswith(".wav"):
        # splitext keeps dots inside the name ("a.b.mp3" -> "a.b.wav").
        converted_audio = os.path.join(output_conv_dir, os.path.splitext(file_name)[0] + ".wav")
        conversion = subprocess.run(
            # -y: overwrite without asking; stdin is detached so ffmpeg can never
            # sit waiting for keyboard input inside the notebook.
            ["ffmpeg", "-y", "-i", input_audio, converted_audio],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if conversion.returncode != 0:
            error_tail = conversion.stderr.decode(errors="replace")[-500:]
            print(f"Error processing {file_name}: ffmpeg exited with code {conversion.returncode}\n{error_tail}")
            continue
        input_audio = converted_audio  # Use the converted file

    # NOTE(review): newer librosa releases rename `filename=` to `path=` — confirm
    # against the installed version.
    try:
        original_seconds = librosa.get_duration(filename=input_audio)
    except Exception as e:
        # One unreadable recording should not abort the whole batch: report and move on.
        print(f"Error processing {file_name}: {e}")
        continue

    dur = hms_format(original_seconds, explicit_format=True)
    print(f"Input Audio Duration ({file_name}): {dur} \n")
    orig_audio_length += original_seconds

    split_audio_by_voice(input_audio, output_dir)  # writes chunk_XXXX.wav files into output_dir

    trimmed_seconds = 0
    transcriptions = []
    start_time = time.time()

    chunk_names = sorted(name for name in os.listdir(output_dir) if name.endswith(".wav"))
    for chunk_name in tqdm(chunk_names):
        chunk_path = os.path.join(output_dir, chunk_name)
        trimmed_seconds += librosa.get_duration(filename=chunk_path)

        transcription = "<UNK>"  # placeholder kept when every attempt fails
        last_error = None
        for _attempt in range(1 + MAX_RETRIES):
            try:
                transcription = infer_tugstugi(chunk_path)
                break  # success: stop retrying
            except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell
                last_error = err
        else:
            print(f"Giving up on {chunk_path} after {1 + MAX_RETRIES} attempts: {last_error}")

        transcriptions.append(transcription)

    trim_audio_length += trimmed_seconds
    trimmed_dur = hms_format(trimmed_seconds, explicit_format=True)
    print(f"Input Audio Duration with trimmed silence: {trimmed_dur} \n")
    concatenated_text = "\n".join(transcriptions)

    transcription_time = time.time() - start_time
    trans_audio_length += transcription_time
    hms = hms_format(transcription_time, explicit_format=False)
    print(f"Automated Transcription Duration ({file_name}): {hms} \n")

    df = pd.concat([df, pd.DataFrame([{"file_name": file_name,
                                       "file_format": file_ext,
                                       "transcriptions": concatenated_text,
                                       "original_audio_length": dur,
                                       "trimmed_audio_length": trimmed_dur,
                                       "total_transcription_time": hms}])], ignore_index=True)

    # Checkpoint after every recording so that a crash does not lose finished work.
    df.to_excel("audio_transcripts.xlsx", index=False)

    print("======================================================================\n")

# Leave both scratch folders empty once the batch is done.
_reset_dir(output_dir)
_reset_dir(output_conv_dir)

df.to_excel("audio_transcripts(final).xlsx", index = False)

In [None]:
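# Manual cleanup, intentionally left commented out. The first command is destructive:
# it wipes everything under /kaggle/working, including the exported transcript workbooks.
# !rm -rf /kaggle/working/*
# !rm -rf /kaggle/working/converted_audio/*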

In [None]:
# Report the totals accumulated over all processed recordings.
for label, total_seconds in (
    ("Total Original Audio Duration", orig_audio_length),
    ("Total Silence Trimmed Audio Duration", trim_audio_length),
    ("Total Transcription Duration", trans_audio_length),
):
    print(f"{label}: {hms_format(total_seconds, explicit_format=True)} \n")

In [None]:
# Reload the workbook written once after the processing loop and preview it.
final_report_path = "/kaggle/working/audio_transcripts(final).xlsx"
trans = pd.read_excel(final_report_path)
trans

In [None]:
# Reload the checkpoint workbook (rewritten after each processed recording) and preview it.
checkpoint_report_path = "/kaggle/working/audio_transcripts.xlsx"
trans = pd.read_excel(checkpoint_report_path)
trans