In [1]:
!pip install webrtcvad -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import pandas as pd
import os
import time
import shutil
import librosa
from transformers import pipeline
from tqdm import tqdm
from glob import glob
from pydub import AudioSegment
from pydub.silence import split_on_silence
from IPython.display import FileLink, FileLinks 
import warnings

In [3]:
warnings.filterwarnings("ignore")

pipe = pipeline("automatic-speech-recognition", model="bengaliAI/tugstugi_bengaliai-regional-asr_whisper-medium")

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/223 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/978k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Device set to use cuda:0


In [4]:
def split_audio_by_voice(input_file, output_folder, min_silence_len=1000, silence_thresh=-40):
    
    if not os.path.exists(output_folder):  # creating output directory
        os.makedirs(output_folder)
    
    audio = AudioSegment.from_file(input_file) # loading the audio clip
    
    chunks = split_on_silence(             # making chunks based on VAD
        audio,
        min_silence_len=min_silence_len,  # minimum silence length in ms
        silence_thresh=silence_thresh,     # silence threshold in dB
        keep_silence=500                   # keeping 500ms of silence at edges
    )
    
    for i, chunk in enumerate(chunks, start=1): # saving chunks
        output_file = os.path.join(output_folder, f"chunk_{i:04d}.wav")
        chunk.export(output_file, format="wav")


def hms_format(seconds:float, explicit_format=False) -> str:

    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)

    if explicit_format:
        return "{} hours {:02} minutes {:02} seconds".format(int(hours), int(minutes), round(seconds))
    else:
        return "{}:{:02}:{:02}".format(int(hours), int(minutes), round(seconds))


def infer_tugstugi(aud_path): # use the audio file as the audio source
    transcription = pipe(aud_path)['text']
    text = str(transcription)
    return text

In [5]:
input_dir = "/kaggle/input/hci-interview-audios/audio_clips"
output_dir = "/kaggle/working/chunks"                 # output directory path

df = pd.DataFrame(columns=["file_name", "transcriptions", "original_audio_length", "trimmed_audio_length", "total_transcription_time"])

orig_audio_length = 0
trim_audio_length = 0
trans_audio_length = 0

In [6]:
for i in os.listdir(input_dir):

    #df_test = pd.DataFrame(columns=["chunk", "transcriptions"])

    input_audio = f"{input_dir}/{i}"  # input data path 
    d = librosa.get_duration(filename=input_audio)
    orig_audio_length += d
    dur = hms_format(d, explicit_format=True)
    print(f"Input Audio Duration ({i}): {dur} \n")
    split_audio_by_voice(input_audio, output_dir) # making chunks based on vocal activity

    t = 0
    transcriptions = []
    start_time = time.time()

    aud_list = os.listdir(output_dir)
    aud_list.sort()
    for wav in tqdm(aud_list):
        if wav[-4:] == ".wav":
            wav = f"{output_dir}/{wav}"
            #print(wav)
            d = librosa.get_duration(filename=wav)
            t+=d

            try:
                transcription=infer_tugstugi(wav)
                #print(transcription)
                #transcriptions.append(transcription)
            except Exception as e:

                q = 0

                while q < 20:
                    try:
                        transcription=infer_tugstugi(wav)
                    except:
                        q += 1
                        transcription = "<UNK>"
            
            transcriptions.append(transcription)
                
            #df_test = pd.concat([df_test, pd.DataFrame([{"chunk": wav, "transcriptions":transcription}])], ignore_index=True)
    
    #df_test.to_excel(f"{i}_chunk_transcripts.xlsx", index=False)

    trim_audio_length += t
    trimmed_dur = hms_format(t, explicit_format=True)
    print(f"Input Audio Duration with trimmed silence: {trimmed_dur} \n")
    concatenated_text = "\n".join(transcriptions)
    #print(concatenated_text)
    end_time = time.time()
    transcription_time = end_time - start_time
    trans_audio_length += transcription_time
    hms = hms_format(transcription_time, explicit_format=False)
    print(f"Automated Transcription Duration ({i}): {hms} \n")


    df = pd.concat([df, pd.DataFrame([{"file_name": i, 
                                       "transcriptions": concatenated_text,
                                       "original_audio_length": dur,
                                       "trimmed_audio_length": trimmed_dur,
                                       "total_transcription_time": hms}])], ignore_index=True)
    
    df.to_excel("audio_transcripts.xlsx", index=False)

    print("======================================================================\n")
    !rm -rf /kaggle/working/chunks/*

df.to_excel("audio_transcripts(final).xlsx", index = False)

Input Audio Duration (KII 07_Audio.m4a): 1 hours 12 minutes 60 seconds 



  1%|          | 2/174 [00:05<08:08,  2.84s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 174/174 [09:10<00:00,  3.17s/it]


Input Audio Duration with trimmed silence: 1 hours 11 minutes 16 seconds 

Automated Transcription Duration (KII 07_Audio.m4a): 0:09:11 


Input Audio Duration (Nadia interview deepfake.m4a): 0 hours 56 minutes 55 seconds 



100%|██████████| 254/254 [10:09<00:00,  2.40s/it]

Input Audio Duration with trimmed silence: 0 hours 53 minutes 31 seconds 

Automated Transcription Duration (Nadia interview deepfake.m4a): 0:10:10 







Input Audio Duration (Swarna interview deepfake.m4a): 0 hours 47 minutes 01 seconds 



100%|██████████| 249/249 [08:50<00:00,  2.13s/it]

Input Audio Duration with trimmed silence: 0 hours 43 minutes 52 seconds 

Automated Transcription Duration (Swarna interview deepfake.m4a): 0:08:51 







Input Audio Duration (Bristi dey deepfake interview.m4a): 0 hours 46 minutes 06 seconds 



100%|██████████| 148/148 [06:57<00:00,  2.82s/it]

Input Audio Duration with trimmed silence: 0 hours 44 minutes 09 seconds 

Automated Transcription Duration (Bristi dey deepfake interview.m4a): 0:06:57 







Input Audio Duration (Afra.m4a): 0 hours 24 minutes 06 seconds 



100%|██████████| 185/185 [04:20<00:00,  1.41s/it]

Input Audio Duration with trimmed silence: 0 hours 19 minutes 59 seconds 

Automated Transcription Duration (Afra.m4a): 0:04:21 







Input Audio Duration (Titly.m4a): 0 hours 28 minutes 01 seconds 



100%|██████████| 229/229 [04:51<00:00,  1.27s/it]

Input Audio Duration with trimmed silence: 0 hours 22 minutes 20 seconds 

Automated Transcription Duration (Titly.m4a): 0:04:51 







Input Audio Duration (Sushmita.m4a): 0 hours 31 minutes 01 seconds 



100%|██████████| 161/161 [05:03<00:00,  1.88s/it]

Input Audio Duration with trimmed silence: 0 hours 28 minutes 18 seconds 

Automated Transcription Duration (Sushmita.m4a): 0:05:03 







Input Audio Duration (Zerin.m4a): 0 hours 24 minutes 24 seconds 



100%|██████████| 174/174 [03:57<00:00,  1.36s/it]

Input Audio Duration with trimmed silence: 0 hours 20 minutes 05 seconds 

Automated Transcription Duration (Zerin.m4a): 0:03:57 







Input Audio Duration (Naznin sultana 01.aac): 0 hours 00 minutes 34 seconds 



100%|██████████| 2/2 [00:05<00:00,  2.92s/it]

Input Audio Duration with trimmed silence: 0 hours 00 minutes 34 seconds 

Automated Transcription Duration (Naznin sultana 01.aac): 0:00:06 







Input Audio Duration (Interviewer 01_2nd part.m4a): 0 hours 09 minutes 55 seconds 



100%|██████████| 44/44 [01:30<00:00,  2.05s/it]

Input Audio Duration with trimmed silence: 0 hours 09 minutes 16 seconds 

Automated Transcription Duration (Interviewer 01_2nd part.m4a): 0:01:30 







Input Audio Duration (Zerin part 2.m4a): 0 hours 05 minutes 42 seconds 



100%|██████████| 40/40 [00:54<00:00,  1.37s/it]

Input Audio Duration with trimmed silence: 0 hours 04 minutes 43 seconds 

Automated Transcription Duration (Zerin part 2.m4a): 0:00:55 







Input Audio Duration (Ishrat.m4a): 0 hours 25 minutes 27 seconds 



100%|██████████| 117/117 [04:01<00:00,  2.07s/it]

Input Audio Duration with trimmed silence: 0 hours 23 minutes 45 seconds 

Automated Transcription Duration (Ishrat.m4a): 0:04:02 







Input Audio Duration (sameha_deepfake.m4a): 0 hours 29 minutes 50 seconds 



100%|██████████| 125/125 [04:46<00:00,  2.29s/it]

Input Audio Duration with trimmed silence: 0 hours 28 minutes 20 seconds 

Automated Transcription Duration (sameha_deepfake.m4a): 0:04:47 







Input Audio Duration (Sheba palma deep fake.m4a): 0 hours 46 minutes 49 seconds 



100%|██████████| 69/69 [04:59<00:00,  4.34s/it]

Input Audio Duration with trimmed silence: 0 hours 46 minutes 00 seconds 

Automated Transcription Duration (Sheba palma deep fake.m4a): 0:04:60 







Input Audio Duration (Faria interview deepfake.aac): 0 hours 44 minutes 43 seconds 



100%|██████████| 273/273 [08:47<00:00,  1.93s/it]


Input Audio Duration with trimmed silence: 0 hours 41 minutes 56 seconds 

Automated Transcription Duration (Faria interview deepfake.aac): 0:08:47 




In [7]:
print(f"Total Original Audio Duration: {hms_format(orig_audio_length, explicit_format=True)} \n")
print(f"Total Silence Trimmed Audio Duration: {hms_format(trim_audio_length, explicit_format=True)} \n")
print(f"Total Transcription Duration: {hms_format(trans_audio_length, explicit_format=True)} \n")

Total Original Audio Duration: 8 hours 13 minutes 33 seconds 

Total Silence Trimmed Audio Duration: 7 hours 38 minutes 04 seconds 

Total Transcription Duration: 1 hours 18 minutes 28 seconds 



In [8]:
trans = pd.read_excel("/kaggle/working/audio_transcripts(final).xlsx")
trans

Unnamed: 0,file_name,transcriptions,original_audio_length,trimmed_audio_length,total_transcription_time
0,KII 07_Audio.m4a,ভাইয়া আমার উইন্ডোটা কি দেখা যাচ্ছে?\nহ্যাঁ দেখ...,1 hours 12 minutes 60 seconds,1 hours 11 minutes 16 seconds,0:09:11
1,Nadia interview deepfake.m4a,"জি আপু, আমাদের ইন্টারনাল কিছু ডকুমেন্টেশনের জন...",0 hours 56 minutes 55 seconds,0 hours 53 minutes 31 seconds,0:10:10
2,Swarna interview deepfake.m4a,ওকে আপু ঠিক আছে। তাহলে আমরা শুরু করছি আপু। আমি...,0 hours 47 minutes 01 seconds,0 hours 43 minutes 52 seconds,0:08:51
3,Bristi dey deepfake interview.m4a,<UNK>\nআচ্ছা ঠিক আছে আপু। আচ্ছা আপু আফনার ডেইল...,0 hours 46 minutes 06 seconds,0 hours 44 minutes 09 seconds,0:06:57
4,Afra.m4a,রেকর্ডিং স্টার্ট হয়েছে।\nআচ্ছা। আফরা তুমি কি আ...,0 hours 24 minutes 06 seconds,0 hours 19 minutes 59 seconds,0:04:21
5,Titly.m4a,ওকেই সো আমরা শুরু কইতেছি এই ইন্টারভিওটা।\nআমাদ...,0 hours 28 minutes 01 seconds,0 hours 22 minutes 20 seconds,0:04:51
6,Sushmita.m4a,"ওকে, ফার্স্ট অফ অন, আমার কিছু ডেমোগ্রাফিক ইনফর...",0 hours 31 minutes 01 seconds,0 hours 28 minutes 18 seconds,0:05:03
7,Zerin.m4a,সো হ্যালো আপু আমাদের রিসার্চ পারপাসে কিছু ডেমো...,0 hours 24 minutes 24 seconds,0 hours 20 minutes 05 seconds,0:03:57
8,Naznin sultana 01.aac,জ্বি আচ্ছা আপু আপনার নামটা এবং আপনার পরিচয়টা য...,0 hours 00 minutes 34 seconds,0 hours 00 minutes 34 seconds,0:00:06
9,Interviewer 01_2nd part.m4a,"আচ্ছা, হ্যাঁ, মোহাম্মদ যেটা বলতেছিলাম যে\nআ\n<...",0 hours 09 minutes 55 seconds,0 hours 09 minutes 16 seconds,0:01:30


In [9]:
trans = pd.read_excel("/kaggle/working/audio_transcripts.xlsx")
trans

Unnamed: 0,file_name,transcriptions,original_audio_length,trimmed_audio_length,total_transcription_time
0,KII 07_Audio.m4a,ভাইয়া আমার উইন্ডোটা কি দেখা যাচ্ছে?\nহ্যাঁ দেখ...,1 hours 12 minutes 60 seconds,1 hours 11 minutes 16 seconds,0:09:11
1,Nadia interview deepfake.m4a,"জি আপু, আমাদের ইন্টারনাল কিছু ডকুমেন্টেশনের জন...",0 hours 56 minutes 55 seconds,0 hours 53 minutes 31 seconds,0:10:10
2,Swarna interview deepfake.m4a,ওকে আপু ঠিক আছে। তাহলে আমরা শুরু করছি আপু। আমি...,0 hours 47 minutes 01 seconds,0 hours 43 minutes 52 seconds,0:08:51
3,Bristi dey deepfake interview.m4a,<UNK>\nআচ্ছা ঠিক আছে আপু। আচ্ছা আপু আফনার ডেইল...,0 hours 46 minutes 06 seconds,0 hours 44 minutes 09 seconds,0:06:57
4,Afra.m4a,রেকর্ডিং স্টার্ট হয়েছে।\nআচ্ছা। আফরা তুমি কি আ...,0 hours 24 minutes 06 seconds,0 hours 19 minutes 59 seconds,0:04:21
5,Titly.m4a,ওকেই সো আমরা শুরু কইতেছি এই ইন্টারভিওটা।\nআমাদ...,0 hours 28 minutes 01 seconds,0 hours 22 minutes 20 seconds,0:04:51
6,Sushmita.m4a,"ওকে, ফার্স্ট অফ অন, আমার কিছু ডেমোগ্রাফিক ইনফর...",0 hours 31 minutes 01 seconds,0 hours 28 minutes 18 seconds,0:05:03
7,Zerin.m4a,সো হ্যালো আপু আমাদের রিসার্চ পারপাসে কিছু ডেমো...,0 hours 24 minutes 24 seconds,0 hours 20 minutes 05 seconds,0:03:57
8,Naznin sultana 01.aac,জ্বি আচ্ছা আপু আপনার নামটা এবং আপনার পরিচয়টা য...,0 hours 00 minutes 34 seconds,0 hours 00 minutes 34 seconds,0:00:06
9,Interviewer 01_2nd part.m4a,"আচ্ছা, হ্যাঁ, মোহাম্মদ যেটা বলতেছিলাম যে\nআ\n<...",0 hours 09 minutes 55 seconds,0 hours 09 minutes 16 seconds,0:01:30
