In [2]:
import whisper
from pydub import AudioSegment
import json
import datetime
import numpy as np 
from paths import *
import time 
import psutil
import os
import soundfile  as sf 
from langdetect import detect, detect_langs

In [3]:

def transcribe_audio_segment(segment_path):
    model = whisper.load_model("base")
    audio = whisper.load_audio(segment_path)
    audio = whisper.pad_or_trim(audio)
    result = model.transcribe(audio)
    return result["text"]


In [4]:
def format_time(ms):
    return str(datetime.timedelta(milliseconds=ms))

In [5]:
# Function to transcribe a single audio segment
def transcribe_segment(file_path, start_time, end_time, tokenizer, model):
    audio = AudioSegment.from_wav(file_path)
    segment = audio[start_time:end_time]
    segment.export("segment.wav", format="wav")
    speech, sample_rate = sf.read("segment.wav")
    
    # Transcribe the audio segment
    inputs = tokenizer(speech, return_tensors="pt", padding="longest")
    logits = model(**inputs).logits
    predicted_ids = logits.argmax(axis=-1)
    transcription = tokenizer.decode(predicted_ids[0])
    
    return transcription

In [6]:
def segment_and_transcribe(file_path, interval_ms, output_json_path):
    audio = AudioSegment.from_file(file_path)
    duration_ms = len(audio)
    num_segments = int(np.ceil(duration_ms / interval_ms))
    
    transcript_data = []
    
    for i in range(num_segments):
        start_time_ms = i * interval_ms
        end_time_ms = min((i + 1) * interval_ms, duration_ms)
        
        segment = audio[start_time_ms:end_time_ms]
        segment_file_path = f"segment_{i + 1}.wav"
        segment.export(segment_file_path, format="wav")
        
        transcript = transcribe_audio_segment(segment_file_path)
        
        transcript_data.append({
            "offset": f'{format_time(start_time_ms)}, {format_time(end_time_ms)}',
            "text": transcript,
            'lang':detect(transcript)
        })
        os.remove(segment_file_path)
    
    
    if output_json_path :
        with open(output_json_path, 'a+') as f:
            f.seek(0)
            try:
                existing_data = json.load(f)
            except json.JSONDecodeError:
                existing_data = []
            existing_data.append({
                       "type": "audio",
                        "ref": file_path,
                        'met_data': transcript_data,
                        })
            f.seek(0)
            f.truncate()

            json.dump(existing_data, f,indent=4)

            print(f'Data successfully written to {file_path}')    
    else:
        return json.dumps(transcript_data, indent=4)


In [7]:
def monitor_resources(file_path, time_interval, out_put_json):
    # Record the start time and resource usage
    start_time = time.time()
    start_cpu_percent = psutil.cpu_percent(interval=None)
    start_memory_info = psutil.virtual_memory().used / (1024 * 1024)  # Convert to MB


    print(segment_and_transcribe(file_path, time_interval,out_put_json))

    # Record the end time and resource usage
    end_time = time.time()
    end_cpu_percent = psutil.cpu_percent(interval=None)
    end_memory_info = psutil.virtual_memory().used / (1024 * 1024)  # Convert to MB

    # Calculate elapsed time
    elapsed_time = end_time - start_time

    # Print the results
    print(f"Elapsed time: {elapsed_time:.2f} seconds")
    print(f"CPU usage at end: {end_cpu_percent}%")
    print(f"Memory used at start: {start_memory_info:.2f} MB")
    print(f"Memory used at end: {end_memory_info:.2f} MB")

In [10]:
time_interval_ms = 80000 # Segment length in milliseconds (e.g., 10000 ms = 10 seconds)
output_json = "transcriptions.json"  # Output JSON file path



print('-------------------------------Transformer-------------------------------------------')
print()
print('--------------------------audio1_converted.wav---------------------------------------')
print ( monitor_resources(ENGPATH1, time_interval_ms,output_json) )
print('--------------------------audio2_converted.wav---------------------------------------')
print( monitor_resources(ENGPATH2, time_interval_ms,output_json) )
print('--------------------------audio3_converted.wav---------------------------------------')
print( monitor_resources(ENGPATH3, time_interval_ms,output_json) )
print('--------------------------img-processing.mp3-----------------------------------------')
print(monitor_resources(ENGPATH4, time_interval_ms,output_json) )

-------------------------------Transformer-------------------------------------------

--------------------------audio1_converted.wav---------------------------------------




Data successfully written to /home/ameer/Kaleidoo/Audio_Data/English/A1.wav
None
Elapsed time: 4.01 seconds
CPU usage at end: 43.3%
Memory used at start: 4645.68 MB
Memory used at end: 4680.36 MB
None
--------------------------audio2_converted.wav---------------------------------------




Data successfully written to /home/ameer/Kaleidoo/Audio_Data/English/A2.wav
None
Elapsed time: 6.27 seconds
CPU usage at end: 45.7%
Memory used at start: 4680.36 MB
Memory used at end: 4511.80 MB
None
--------------------------audio3_converted.wav---------------------------------------




Data successfully written to /home/ameer/Kaleidoo/Audio_Data/English/A3.wav
None
Elapsed time: 2.32 seconds
CPU usage at end: 41.5%
Memory used at start: 4511.80 MB
Memory used at end: 4539.42 MB
None
--------------------------img-processing.mp3-----------------------------------------




Data successfully written to /home/ameer/Kaleidoo/Audio_Data/English/img-processing.mp3
None
Elapsed time: 24.96 seconds
CPU usage at end: 44.5%
Memory used at start: 4539.42 MB
Memory used at end: 4506.57 MB
None
