In [1]:
!pip install webrtcvad -q
!apt-get install ffmpeg -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
Reading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 129 not upgraded.


In [2]:
import numpy as np
import pandas as pd
import os
import time
import shutil
import librosa
from transformers import pipeline
from tqdm import tqdm
from glob import glob
from pydub import AudioSegment
from pydub.silence import split_on_silence
from IPython.display import FileLink, FileLinks 
import warnings
import subprocess

In [3]:
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Module-level speech-recognition pipeline; `infer_tugstugi` calls it for each audio chunk.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="bengaliAI/tugstugi_bengaliai-regional-asr_whisper-medium",
)

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/223 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/978k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Device set to use cuda:0


In [4]:
def split_audio_by_voice(input_file, output_folder, min_silence_len=1000, silence_thresh=-40,
                         keep_silence=500):
    """Split an audio file into chunks at silent stretches and save each chunk as WAV.

    Chunk boundaries come from pydub's ``split_on_silence`` (a loudness threshold),
    not from a voice-activity-detection model.

    Args:
        input_file: Path of the audio file to load with ``AudioSegment.from_file``.
        output_folder: Directory that receives ``chunk_0001.wav``, ``chunk_0002.wav``, ...
            It is created if it does not exist. Existing files in it are not removed.
        min_silence_len: Minimum length of a silent stretch, in milliseconds, for a split.
        silence_thresh: Loudness threshold passed to ``split_on_silence``
            (pydub documents this value as dBFS).
        keep_silence: Milliseconds of silence kept at the edges of each chunk.

    Returns:
        list[str]: Paths of the chunk files written, in chunk order (empty when
        ``split_on_silence`` yields no chunks).
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_folder, exist_ok=True)

    audio = AudioSegment.from_file(input_file)

    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    written_files = []
    for index, chunk in enumerate(chunks, start=1):
        output_file = os.path.join(output_folder, f"chunk_{index:04d}.wav")
        # export() hands back the open output file object; close it explicitly
        # instead of relying on garbage collection when writing many chunks.
        chunk.export(output_file, format="wav").close()
        written_files.append(output_file)

    return written_files


def hms_format(seconds:float, explicit_format=False) -> str:
    """Format a duration given in seconds as hours, minutes and seconds.

    Args:
        seconds: Duration in seconds; fractional values are rounded to the nearest
            whole second.
        explicit_format: When True return ``"H hours MM minutes SS seconds"``,
            otherwise return ``"H:MM:SS"``.

    Returns:
        The formatted duration string.
    """
    # Round the total once, before splitting. Rounding only the seconds remainder
    # could yield 60 (e.g. 59.6 -> "0:00:60") without carrying into the minutes.
    total_seconds = int(round(seconds))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if explicit_format:
        return "{} hours {:02} minutes {:02} seconds".format(hours, minutes, secs)
    return "{}:{:02}:{:02}".format(hours, minutes, secs)


def infer_tugstugi(aud_path): # use the audio file as the audio source
    """Transcribe one audio file with the module-level `pipe` and return the text as a str."""
    result = pipe(aud_path)
    return str(result['text'])

In [5]:
# Locations used by the processing loop.
input_dir = "/kaggle/input/interview-audios"          # input path
output_dir = "/kaggle/working/chunks"                 # output directory path (chunk WAVs)
output_conv_dir = "/kaggle/working/converted_audio"   # WAV copies of non-WAV inputs

# Both working directories must exist before anything is written into them.
for scratch_dir in (output_dir, output_conv_dir):
    os.makedirs(scratch_dir, exist_ok=True)

# Empty results table; the processing loop appends one row per input file.
result_columns = [
    "file_name",
    "file_format",
    "transcriptions",
    "original_audio_length",
    #"trimmed_audio_length",
    "total_transcription_time",
]
df = pd.DataFrame(columns=result_columns)

# Running totals in seconds, accumulated by the processing loop:
# original audio, summed chunk audio, and wall-clock transcription time.
orig_audio_length = trim_audio_length = trans_audio_length = 0

In [6]:
UNKNOWN_TOKEN = "<UNK>"  # stored in place of a chunk's text when every attempt fails
MAX_RETRIES = 20         # additional attempts after the first failed transcription


def _clear_dir(path):
    """Remove everything inside `path`, leaving the directory itself in place."""
    if not os.path.isdir(path):
        return
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path, ignore_errors=True)
        else:
            try:
                os.remove(entry_path)
            except OSError:
                # Best effort, like the `rm -rf` shell call this helper replaces.
                pass


def _transcribe_with_retry(wav_path, max_retries=MAX_RETRIES):
    """Transcribe one chunk, retrying on failure; return UNKNOWN_TOKEN if all attempts fail."""
    last_error = None
    for _attempt in range(1 + max_retries):
        try:
            # Returning here ends the retries as soon as one attempt succeeds.
            return infer_tugstugi(wav_path)
        except Exception as e:  # not a bare except: Ctrl-C must still interrupt the run
            last_error = e
    tqdm.write(f"Could not transcribe {wav_path} after {1 + max_retries} attempts: {last_error}")
    return UNKNOWN_TOKEN


# Process every file in input_dir: convert to WAV if needed, split into chunks,
# transcribe each chunk, and append one row per input file to `df`.
for file_name in os.listdir(input_dir):

    file_ext = os.path.splitext(file_name)[1]
    input_audio = os.path.join(input_dir, file_name)  # input data path

    if not os.path.isfile(input_audio):
        continue  # sub-directories cannot be transcribed

    # Convert to WAV if the file is not WAV
    if not file_name.endswith(".wav"):
        # splitext keeps the full stem, so "a.b.mp3" becomes "a.b.wav" (not "a.wav").
        converted_audio = os.path.join(output_conv_dir, os.path.splitext(file_name)[0] + ".wav")

        # -y overwrites a leftover output instead of prompting; -nostdin keeps ffmpeg
        # from waiting on interactive input that a notebook cannot provide.
        conversion = subprocess.run(
            ["ffmpeg", "-nostdin", "-y", "-i", input_audio, converted_audio],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        if conversion.returncode != 0:
            stderr_lines = conversion.stderr.decode("utf-8", errors="replace").strip().splitlines()
            reason = stderr_lines[-1] if stderr_lines else "no error output"
            print(f"Error processing {file_name}: ffmpeg exited with code {conversion.returncode} ({reason})")
            _clear_dir(output_conv_dir)
            continue

        input_audio = converted_audio  # Use the converted file

    # Now process with librosa
    try:
        d = librosa.get_duration(filename=input_audio)
    except Exception as e:
        # Skip only this file; the remaining inputs are still processed.
        print(f"Error processing {file_name}: {e}")
        _clear_dir(output_conv_dir)
        continue

    dur = hms_format(d, explicit_format=True)
    print(f"Input Audio Duration ({file_name}): {dur} \n")
    orig_audio_length += d

    # Start from an empty chunk directory so chunks left over from an earlier
    # (possibly interrupted) file cannot end up in this file's transcript.
    _clear_dir(output_dir)
    split_audio_by_voice(input_audio, output_dir)  # making chunks at silent stretches

    t = 0  # summed duration of this file's chunks, in seconds
    transcriptions = []
    start_time = time.time()

    # Zero-padded chunk names sort into chunk order.
    chunk_names = sorted(name for name in os.listdir(output_dir) if name.endswith(".wav"))
    for chunk_name in tqdm(chunk_names):
        wav = os.path.join(output_dir, chunk_name)
        t += librosa.get_duration(filename=wav)
        transcriptions.append(_transcribe_with_retry(wav))

    trim_audio_length += t
    concatenated_text = "\n".join(transcriptions)
    transcription_time = time.time() - start_time
    trans_audio_length += transcription_time
    hms = hms_format(transcription_time, explicit_format=False)
    print(f"Automated Transcription Duration ({file_name}): {hms} \n")

    df = pd.concat([df, pd.DataFrame([{"file_name": file_name,
                                       "file_format": file_ext,
                                       "transcriptions": concatenated_text,
                                       "original_audio_length": dur,
                                       "total_transcription_time": hms}])], ignore_index=True)

    # Checkpoint after every file so finished work survives a later failure.
    df.to_excel("audio_transcripts.xlsx", index=False)

    print("======================================================================\n")
    # Clean the working directories through the configured variables rather than
    # hard-coded shell paths, so changing output_dir/output_conv_dir stays safe.
    _clear_dir(output_dir)
    _clear_dir(output_conv_dir)

df.to_excel("audio_transcripts(final).xlsx", index=False)

Input Audio Duration (recording_07.aac): 0 hours 45 minutes 24 seconds 



  5%|▌         | 5/91 [00:15<04:30,  3.14s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 91/91 [06:09<00:00,  4.06s/it]


Automated Transcription Duration (recording_07.aac): 0:06:09 


Input Audio Duration (recording_13.m4a): 0 hours 47 minutes 01 seconds 



100%|██████████| 249/249 [08:52<00:00,  2.14s/it]

Automated Transcription Duration (recording_13.m4a): 0:08:53 







Input Audio Duration (recording_0.mkv): 0 hours 37 minutes 58 seconds 



100%|██████████| 73/73 [03:56<00:00,  3.23s/it]

Automated Transcription Duration (recording_0.mkv): 0:03:56 







Input Audio Duration (recording_09.aac): 0 hours 00 minutes 35 seconds 



100%|██████████| 2/2 [00:05<00:00,  2.97s/it]

Automated Transcription Duration (recording_09.aac): 0:00:06 







Input Audio Duration (recording_01.aiff): 0 hours 48 minutes 33 seconds 



100%|██████████| 126/126 [07:13<00:00,  3.44s/it]

Automated Transcription Duration (recording_01.aiff): 0:07:13 







Input Audio Duration (recording_06.wma): 0 hours 25 minutes 27 seconds 



100%|██████████| 118/118 [04:03<00:00,  2.07s/it]

Automated Transcription Duration (recording_06.wma): 0:04:04 







Input Audio Duration (recording_03.mp3): 0 hours 46 minutes 06 seconds 



100%|██████████| 148/148 [06:57<00:00,  2.82s/it]

Automated Transcription Duration (recording_03.mp3): 0:06:57 







Input Audio Duration (recording_04.aac): 0 hours 44 minutes 43 seconds 



100%|██████████| 273/273 [08:51<00:00,  1.95s/it]

Automated Transcription Duration (recording_04.aac): 0:08:52 







Input Audio Duration (recording_11.m4a): 0 hours 46 minutes 49 seconds 



100%|██████████| 69/69 [05:02<00:00,  4.39s/it]

Automated Transcription Duration (recording_11.m4a): 0:05:03 







Input Audio Duration (recording_02.flac): 0 hours 24 minutes 06 seconds 



100%|██████████| 185/185 [04:22<00:00,  1.42s/it]

Automated Transcription Duration (recording_02.flac): 0:04:22 







Input Audio Duration (recording_10.m4a): 0 hours 29 minutes 50 seconds 



100%|██████████| 125/125 [04:48<00:00,  2.31s/it]

Automated Transcription Duration (recording_10.m4a): 0:04:48 







Input Audio Duration (recording_14.m4a): 0 hours 28 minutes 01 seconds 



100%|██████████| 229/229 [04:54<00:00,  1.28s/it]

Automated Transcription Duration (recording_14.m4a): 0:04:54 







Input Audio Duration (recording_08.m4a): 0 hours 56 minutes 55 seconds 



100%|██████████| 254/254 [10:06<00:00,  2.39s/it]


Automated Transcription Duration (recording_08.m4a): 0:10:06 


Input Audio Duration (recording_05.wav): 0 hours 09 minutes 55 seconds 



100%|██████████| 44/44 [01:30<00:00,  2.06s/it]

Automated Transcription Duration (recording_05.wav): 0:01:31 







Input Audio Duration (recording_16.m4a): 0 hours 24 minutes 24 seconds 



100%|██████████| 174/174 [03:58<00:00,  1.37s/it]


Automated Transcription Duration (recording_16.m4a): 0:03:59 


Input Audio Duration (recording_12.m4a): 0 hours 31 minutes 01 seconds 



100%|██████████| 161/161 [05:04<00:00,  1.89s/it]


Automated Transcription Duration (recording_12.m4a): 0:05:04 


Input Audio Duration (recording_15.m4a): 0 hours 05 minutes 42 seconds 



100%|██████████| 40/40 [00:54<00:00,  1.37s/it]

Automated Transcription Duration (recording_15.m4a): 0:00:55 







In [7]:
# !rm -rf /kaggle/working/*
# !rm -rf /kaggle/working/converted_audio/*

In [8]:
# Totals accumulated by the processing loop, formatted for display.
total_original = hms_format(orig_audio_length, explicit_format=True)
total_transcription = hms_format(trans_audio_length, explicit_format=True)

print(f"Total Original Audio Duration: {total_original} \n")
#print(f"Total Silence Trimmed Audio Duration: {hms_format(trim_audio_length, explicit_format=True)} \n")
print(f"Total Transcription Duration: {total_transcription} \n")

Total Original Audio Duration: 9 hours 12 minutes 30 seconds 

Total Transcription Duration: 1 hours 26 minutes 53 seconds 



In [9]:
# Load the final transcript workbook back from disk and display it.
trans = pd.read_excel(io="/kaggle/working/audio_transcripts(final).xlsx")
trans

Unnamed: 0,file_name,file_format,transcriptions,original_audio_length,total_transcription_time
0,recording_07.aac,.aac,"আচ্ছা, ভাইয়া, আমরা হচ্ছে ইন্টারভিউটা শুরু করতে...",0 hours 45 minutes 24 seconds,0:06:09
1,recording_13.m4a,.m4a,ওকে আপু ঠিক আছে। তাহলে আমরা শুরু করছি আপু। আমি...,0 hours 47 minutes 01 seconds,0:08:53
2,recording_0.mkv,.mkv,<>\nআচ্ছা। মোমো তোমার কথাগুলা আমরা হচ্ছে রেকর্...,0 hours 37 minutes 58 seconds,0:03:56
3,recording_09.aac,.aac,জ্বি আচ্ছা আপু আপনার নামটা এবং আপনার পরিচয়টা য...,0 hours 00 minutes 35 seconds,0:00:06
4,recording_01.aiff,.aiff,আচ্ছা আমরা তাহলে এখন ইন্টারভিউটা শুরু করি। আমি...,0 hours 48 minutes 33 seconds,0:07:13
5,recording_06.wma,.wma,সো আমাদের হচ্ছে রিসার্চের জন্য কিছু ডেমোগ্রাফি...,0 hours 25 minutes 27 seconds,0:04:04
6,recording_03.mp3,.mp3,<UNK>\nআচ্ছা ঠিক আছে আপু। আচ্ছা আপু আফনার ডেইল...,0 hours 46 minutes 06 seconds,0:06:57
7,recording_04.aac,.aac,আচ্ছা আপু আমরা ডকুমেন্টেশনের জন্য একটু এই আমাদ...,0 hours 44 minutes 43 seconds,0:08:52
8,recording_11.m4a,.m4a,হ্যাঁ ঠিক আছে আপা বলে। ওকে। আপু আমরা হচ্ছে একট...,0 hours 46 minutes 49 seconds,0:05:03
9,recording_02.flac,.flac,রেকর্ডিং স্টার্ট হয়েছে।\nআচ্ছা। আফরা তুমি কি আ...,0 hours 24 minutes 06 seconds,0:04:22


In [10]:
# Load the audio_transcripts.xlsx workbook and display it.
# NOTE(review): presumably the per-file checkpoint written by the processing loop,
# which assumes the notebook's working directory is /kaggle/working — confirm.
trans = pd.read_excel(io="/kaggle/working/audio_transcripts.xlsx")
trans

Unnamed: 0,file_name,file_format,transcriptions,original_audio_length,total_transcription_time
0,recording_07.aac,.aac,"আচ্ছা, ভাইয়া, আমরা হচ্ছে ইন্টারভিউটা শুরু করতে...",0 hours 45 minutes 24 seconds,0:06:09
1,recording_13.m4a,.m4a,ওকে আপু ঠিক আছে। তাহলে আমরা শুরু করছি আপু। আমি...,0 hours 47 minutes 01 seconds,0:08:53
2,recording_0.mkv,.mkv,<>\nআচ্ছা। মোমো তোমার কথাগুলা আমরা হচ্ছে রেকর্...,0 hours 37 minutes 58 seconds,0:03:56
3,recording_09.aac,.aac,জ্বি আচ্ছা আপু আপনার নামটা এবং আপনার পরিচয়টা য...,0 hours 00 minutes 35 seconds,0:00:06
4,recording_01.aiff,.aiff,আচ্ছা আমরা তাহলে এখন ইন্টারভিউটা শুরু করি। আমি...,0 hours 48 minutes 33 seconds,0:07:13
5,recording_06.wma,.wma,সো আমাদের হচ্ছে রিসার্চের জন্য কিছু ডেমোগ্রাফি...,0 hours 25 minutes 27 seconds,0:04:04
6,recording_03.mp3,.mp3,<UNK>\nআচ্ছা ঠিক আছে আপু। আচ্ছা আপু আফনার ডেইল...,0 hours 46 minutes 06 seconds,0:06:57
7,recording_04.aac,.aac,আচ্ছা আপু আমরা ডকুমেন্টেশনের জন্য একটু এই আমাদ...,0 hours 44 minutes 43 seconds,0:08:52
8,recording_11.m4a,.m4a,হ্যাঁ ঠিক আছে আপা বলে। ওকে। আপু আমরা হচ্ছে একট...,0 hours 46 minutes 49 seconds,0:05:03
9,recording_02.flac,.flac,রেকর্ডিং স্টার্ট হয়েছে।\nআচ্ছা। আফরা তুমি কি আ...,0 hours 24 minutes 06 seconds,0:04:22
