In [1]:
import whisper
from pydub import AudioSegment
import json
import datetime
import numpy as np 
from paths import *
import soundfile  as sf 
from langdetect import detect, detect_langs
import time 
import psutil
import os 

In [2]:

# Loaded Whisper models keyed by model name, so that transcribing many
# segments does not reload the weights from disk for every segment.
# (Re-running this cell resets the cache.)
_WHISPER_MODEL_CACHE = {}


def transcribe_audio_segment(segment_path, model_name="base"):
    """Transcribe one audio file with Whisper and return the recognised text.

    Args:
        segment_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"base"``, which is what this function always used before.

    Returns:
        The ``"text"`` entry of the result returned by ``model.transcribe``.
    """
    model = _WHISPER_MODEL_CACHE.get(model_name)
    if model is None:
        model = whisper.load_model(model_name)
        _WHISPER_MODEL_CACHE[model_name] = model

    audio = whisper.load_audio(segment_path)
    # Deliberately no whisper.pad_or_trim() here: it cuts the audio to a single
    # 30-second window, which silently dropped everything after the first 30 s
    # of longer segments. model.transcribe() handles audio of any length itself.
    result = model.transcribe(audio)
    return result["text"]


In [3]:
def format_time(ms):
    """Render a duration given in milliseconds as a ``timedelta`` string.

    For example ``1500`` becomes ``'0:00:01.500000'`` and ``80000`` becomes
    ``'0:01:20'``.
    """
    elapsed = datetime.timedelta(milliseconds=ms)
    return str(elapsed)

In [4]:
def transcribe_segment(file_path, start_time, end_time, tokenizer, model):
    """Transcribe the slice ``[start_time:end_time]`` of a WAV file.

    The slice is exported to a temporary WAV file, read back with
    ``soundfile`` and passed through ``tokenizer`` and ``model``.

    Args:
        file_path: Path of the source WAV file.
        start_time: Slice start, used to index the pydub ``AudioSegment``
            (pydub slices are expressed in milliseconds).
        end_time: Slice end, same unit as ``start_time``.
        tokenizer: Callable that turns the samples into model inputs and
            offers ``decode`` -- presumably a Hugging Face speech processor;
            TODO confirm against the caller.
        model: Callable whose output exposes ``.logits``.

    Returns:
        The decoded transcription of the first item in the batch.
    """
    # Local import: tempfile is only needed here and is not imported at the
    # top of the notebook.
    import tempfile

    audio = AudioSegment.from_wav(file_path)
    segment = audio[start_time:end_time]

    # Use a private scratch directory instead of a fixed "segment.wav" in the
    # working directory: the old file was never deleted and would be clobbered
    # by any other call running from the same directory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        segment_path = os.path.join(scratch_dir, "segment.wav")
        # export() hands back the file object it wrote to; close it so the
        # file can be reopened and later deleted on every platform.
        segment.export(segment_path, format="wav").close()
        speech, _ = sf.read(segment_path)

    # Transcribe the audio segment
    inputs = tokenizer(speech, return_tensors="pt", padding="longest")
    logits = model(**inputs).logits
    predicted_ids = logits.argmax(axis=-1)
    return tokenizer.decode(predicted_ids[0])

In [5]:
def segment_and_transcribe(file_path, interval_ms, output_json_path):
    """Split an audio file into fixed-length segments and transcribe each one.

    Every segment is exported to a temporary WAV file under ``GARBAGE``,
    transcribed with ``transcribe_audio_segment`` and tagged with the
    language reported by ``langdetect``.

    Args:
        file_path: Path of the audio file to process.
        interval_ms: Segment length in milliseconds; must be positive.
        output_json_path: JSON file to append the result to. When falsy,
            nothing is written and the result is returned instead.

    Returns:
        ``None`` when ``output_json_path`` is given (an entry of type
        ``"audio"`` is appended to the JSON list stored in that file);
        otherwise a JSON string holding the list of segment records.

    Raises:
        ValueError: If ``interval_ms`` is not positive.
    """
    # Local import: the notebook only imports detect/detect_langs at the top.
    from langdetect.lang_detect_exception import LangDetectException

    if interval_ms <= 0:
        raise ValueError(f"interval_ms must be positive, got {interval_ms!r}")

    audio = AudioSegment.from_file(file_path)
    duration_ms = len(audio)
    num_segments = int(np.ceil(duration_ms / interval_ms))

    transcript_data = []

    for i in range(num_segments):
        start_time_ms = i * interval_ms
        end_time_ms = min((i + 1) * interval_ms, duration_ms)

        segment_file_path = f"{GARBAGE}segment_{i + 1}.wav"
        try:
            # export() hands back the file object it wrote to; close it so
            # the file can be removed afterwards on every platform.
            audio[start_time_ms:end_time_ms].export(segment_file_path, format="wav").close()
            transcript = transcribe_audio_segment(segment_file_path)
        finally:
            # Remove the scratch file even when export/transcription fails.
            if os.path.exists(segment_file_path):
                os.remove(segment_file_path)

        # detect() raises on empty or featureless text (e.g. a silent
        # segment); record such segments as "unknown" instead of aborting.
        try:
            lang = detect(transcript)
        except LangDetectException:
            lang = "unknown"

        transcript_data.append({
            "offset": f'{format_time(start_time_ms)}, {format_time(end_time_ms)}',
            "text": transcript,
            'lang': lang
        })

    if not output_json_path:
        return json.dumps(transcript_data, indent=4)

    # Load what is already stored; a missing, empty or corrupt file starts a
    # fresh list.
    existing_data = []
    if os.path.exists(output_json_path):
        with open(output_json_path, 'r') as f:
            try:
                existing_data = json.load(f)
            except json.JSONDecodeError:
                existing_data = []

    existing_data.append({
        "type": "audio",
        "ref": file_path,
        'met_data': transcript_data,
    })

    with open(output_json_path, 'w') as f:
        json.dump(existing_data, f, indent=4)

    # Report the file that was written (the old message named the input file).
    print(f'Data successfully written to {output_json_path}')


In [6]:
def monitor_resources(file_path, time_interval, out_put_json):
    """Run ``segment_and_transcribe`` and print timing, CPU and memory figures.

    Args:
        file_path: Audio file forwarded to ``segment_and_transcribe``.
        time_interval: Segment length forwarded to ``segment_and_transcribe``.
        out_put_json: Output JSON path forwarded to ``segment_and_transcribe``.

    The transcript is printed only when ``segment_and_transcribe`` returns one
    (it returns ``None`` after writing to a file, which used to be printed as
    a stray ``None`` line).
    """
    start_time = time.time()
    # With interval=None psutil reports CPU usage since the previous call, so
    # this call only sets the baseline for the measurement taken at the end.
    psutil.cpu_percent(interval=None)
    start_memory_info = psutil.virtual_memory().used / (1024 * 1024)  # Convert to MB

    result = segment_and_transcribe(file_path, time_interval, out_put_json)
    if result is not None:
        print(result)

    end_time = time.time()
    # System-wide CPU utilisation since the baseline call above.
    end_cpu_percent = psutil.cpu_percent(interval=None)
    end_memory_info = psutil.virtual_memory().used / (1024 * 1024)  # Convert to MB

    elapsed_time = end_time - start_time

    print(f"Elapsed time: {elapsed_time:.2f} seconds")
    print(f"CPU usage at end: {end_cpu_percent}%")
    print(f"Memory used at start: {start_memory_info:.2f} MB")
    print(f"Memory used at end: {end_memory_info:.2f} MB")

In [7]:
%pip install moviepy
%pip install SpeechRecognition

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [8]:
import moviepy.editor as mp
import speech_recognition as sr

In [9]:
def extract_audio_from_video(video_path, audio_path):
    """Write the audio track of a video file to ``audio_path``.

    Failures are reported with a printed message rather than raised, so the
    caller should check that ``audio_path`` exists before using it.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the audio file to create.
    """
    video = None
    try:
        # Load the video file
        video = mp.VideoFileClip(video_path)

        # A clip without an audio track has audio set to None; say so
        # explicitly instead of failing with an opaque attribute error.
        if video.audio is None:
            print(f"Error extracting audio: {video_path} has no audio track")
            return

        # Extract the audio and save it as an audio file
        video.audio.write_audiofile(audio_path)
        print(f"Audio extracted and saved to {audio_path}")
    except Exception as e:
        print(f"Error extracting audio: {e}")
    finally:
        # Release the reader resources held by the clip.
        if video is not None:
            video.close()

def transcribe_audio_segment(segment_path):
    """Transcribe an audio file via ``Recognizer.recognize_google``.

    Args:
        segment_path: Path of the audio file to transcribe.

    Returns:
        The recognised text. On failure a fixed message is returned in place
        of the transcript: one for unintelligible speech
        (``sr.UnknownValueError``) and one for a failed service request
        (``sr.RequestError``).
    """
    engine = sr.Recognizer()

    # Read the whole file into memory as recognizer audio data.
    with sr.AudioFile(segment_path) as wav_source:
        captured = engine.record(wav_source)

    try:
        return engine.recognize_google(captured)
    except sr.UnknownValueError:
        return "Speech was not understood"
    except sr.RequestError:
        return "Could not request results from the speech recognition service"

def format_time(ms):
    """Return ``ms`` milliseconds formatted the way ``timedelta`` prints itself.

    Example: ``format_time(80000)`` gives ``'0:01:20'``.
    """
    return f"{datetime.timedelta(milliseconds=ms)!s}"

def segment_and_transcribe(audio_path, interval_ms, output_json_path):
    """Split an audio file into fixed-length segments and transcribe each one.

    Every segment is exported to a temporary WAV file under ``GARBAGE``,
    transcribed with ``transcribe_audio_segment`` and tagged with the
    language reported by ``langdetect``.

    Args:
        audio_path: Path of the audio file to process.
        interval_ms: Segment length in milliseconds; must be positive.
        output_json_path: JSON file to append the result to. When falsy,
            nothing is written and the result is returned instead.

    Returns:
        ``None`` when ``output_json_path`` is given (an entry of type
        ``"video"`` is appended to the JSON list stored in that file);
        otherwise a JSON string holding the list of segment records.

    Raises:
        ValueError: If ``interval_ms`` is not positive.
    """
    # Local import: the notebook only imports detect/detect_langs at the top.
    from langdetect.lang_detect_exception import LangDetectException

    if interval_ms <= 0:
        raise ValueError(f"interval_ms must be positive, got {interval_ms!r}")

    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    num_segments = int(np.ceil(duration_ms / interval_ms))

    transcript_data = []

    for i in range(num_segments):
        start_time_ms = i * interval_ms
        end_time_ms = min((i + 1) * interval_ms, duration_ms)

        segment_file_path = f"{GARBAGE}segment_{i + 1}.wav"
        try:
            # export() hands back the file object it wrote to; close it so
            # the file can be removed afterwards on every platform.
            audio[start_time_ms:end_time_ms].export(segment_file_path, format="wav").close()
            transcript = transcribe_audio_segment(segment_file_path)
        finally:
            # Remove the scratch file even when export/transcription fails.
            if os.path.exists(segment_file_path):
                os.remove(segment_file_path)

        # detect() raises on empty or featureless text; record such segments
        # as "unknown" instead of aborting the whole run.
        try:
            lang = detect(transcript)
        except LangDetectException:
            lang = "unknown"

        transcript_data.append({
            "offset": f'{format_time(start_time_ms)}, {format_time(end_time_ms)}',
            "text": transcript,
            'lang': lang
        })

    if not output_json_path:
        return json.dumps(transcript_data, indent=4)

    # Load what is already stored; a missing, empty or corrupt file starts a
    # fresh list.
    existing_data = []
    if os.path.exists(output_json_path):
        with open(output_json_path, 'r') as f:
            try:
                existing_data = json.load(f)
            except json.JSONDecodeError:
                existing_data = []

    existing_data.append({
        "type": "video",
        "ref": audio_path,
        'met_data': transcript_data,
    })

    with open(output_json_path, 'w') as f:
        json.dump(existing_data, f, indent=4)

    print(f'Data successfully written to {output_json_path}')

def monitor_resources(audio_path, time_interval, output_json):
    """Run ``segment_and_transcribe`` and print timing, CPU and memory figures.

    Args:
        audio_path: Audio file forwarded to ``segment_and_transcribe``.
        time_interval: Segment length forwarded to ``segment_and_transcribe``.
        output_json: Output JSON path forwarded to ``segment_and_transcribe``.

    The transcript is printed only when ``segment_and_transcribe`` returns one
    (it returns ``None`` after writing to a file, which used to be printed as
    a stray ``None`` line).
    """
    start_time = time.time()
    # With interval=None psutil reports CPU usage since the previous call, so
    # this call only sets the baseline for the measurement taken at the end.
    psutil.cpu_percent(interval=None)
    start_memory_info = psutil.virtual_memory().used / (1024 * 1024)  # Convert to MB

    result = segment_and_transcribe(audio_path, time_interval, output_json)
    if result is not None:
        print(result)

    end_time = time.time()
    # System-wide CPU utilisation since the baseline call above.
    end_cpu_percent = psutil.cpu_percent(interval=None)
    end_memory_info = psutil.virtual_memory().used / (1024 * 1024)  # Convert to MB

    elapsed_time = end_time - start_time

    print(f"Elapsed time: {elapsed_time:.2f} seconds")
    print(f"CPU usage at end: {end_cpu_percent}%")
    print(f"Memory used at start: {start_memory_info:.2f} MB")
    print(f"Memory used at end: {end_memory_info:.2f} MB")


In [10]:
# Example usage
output_json = f'{DUMPVIDEO}transcriptions2.json'
time_interval_ms = 80000  # Segment length in milliseconds (80000 ms = 80 seconds)

try:
    # Step 1: Extract audio from video
    extract_audio_from_video(TONNY, AUDIO_TONNY)

    # Step 2: Segment and transcribe the extracted audio.
    # extract_audio_from_video only prints on failure, so check that the file
    # was actually produced before trying to read it.
    if os.path.exists(AUDIO_TONNY):
        monitor_resources(AUDIO_TONNY, time_interval_ms, output_json)
    else:
        print(f"Skipping transcription: {AUDIO_TONNY} was not created")
finally:
    # Clean up, even when transcription fails part-way through
    if os.path.exists(AUDIO_TONNY):
        os.remove(AUDIO_TONNY)
        print(f"Temporary audio file {AUDIO_TONNY} removed")

MoviePy - Writing audio in /home/ameer/Kaleidoo/Data/Audio_Data/English/audio.wav


                                                                      

MoviePy - Done.
Audio extracted and saved to /home/ameer/Kaleidoo/Data/Audio_Data/English/audio.wav
Data successfully written to /home/ameer/Kaleidoo/Data/Data-Dumper/Video/transcriptions2.json
None
Elapsed time: 69.52 seconds
CPU usage at end: 0.7%
Memory used at start: 3017.07 MB
Memory used at end: 2720.25 MB
Temporary audio file /home/ameer/Kaleidoo/Data/Audio_Data/English/audio.wav removed
