In [None]:
!pip install google-auth google-auth-httplib2 google-api-python-client google-auth-oauthlib transformers

In [None]:
!pip install psycopg2-binary
!pip install python-dotenv
!pip install tqdm 
!pip install pyannote.audio

In [None]:
import sys
sys.path.append('../util')

from util import read_spreadsheet
from util import collect_segments
from util import split_audio_files

In [None]:
from pydub import AudioSegment
import pandas as pd

# Function to get audio duration from an audio file
def get_audio_duration(audio_filename):
    """
    Returns the playback length of an audio file as an H:M:S string.

    Args:
    - audio_filename (str): Path of the audio file to measure.

    Returns:
    - str or None: Zero-padded "HH:MM:SS" duration (fractional seconds are
      truncated), or None if the file could not be loaded or measured.
    """
    try:
        audio = AudioSegment.from_file(audio_filename)
        # len() of the loaded segment is its length in milliseconds.
        total_seconds = len(audio) // 1000
        # Format arithmetically instead of via a datetime: '%H' is an
        # hour-of-day and would wrap around for audio of 24 hours or more.
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    except Exception as e:
        # Best-effort: report the problem and let the caller record a gap.
        print(f"Error fetching duration for {audio_filename}: {e}")
        return None

In [None]:
import subprocess

def download_s3_file(s3_path, local_filename):
    """
    Downloads a file from S3 using AWS CLI.

    Args:
    - s3_path (str): S3 path (object key) of the file to download.
    - local_filename (str): Local filename to save the downloaded file.

    Returns:
    - bool: True if download successful, False otherwise (the AWS CLI exited
      with a non-zero status or could not be launched at all).
    """
    s3_bucket = "monlam.ai.stt"  # Replace with your S3 bucket name

    # Pass the command as an argument list and do not go through a shell:
    # keys or filenames containing spaces or shell metacharacters are then
    # handed to the CLI verbatim instead of being re-parsed by the shell.
    download_command = [
        "aws", "s3", "cp",
        f"s3://{s3_bucket}/{s3_path}",
        local_filename,
    ]
    try:
        subprocess.run(download_command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/unlaunchable `aws` executable, which the
        # shell-based invocation used to surface as a non-zero exit status.
        print(f"Error downloading {s3_path}: {e}")
        return False
    print(f"Downloaded {s3_path} to {local_filename}")
    return True

# Download every recording whose serial number lies in [from_id, to_id]
# (inclusive) and record its duration.
from_id = 305
to_id = 319

durations = []

df = read_spreadsheet(sheet_id="1aShlQ9I2FS_PX_8Ukvc1nr8qOiMePdAzO-hs6-vWKko")

for index, row in df.iterrows():
    # Stop at the first row where either field is not a string.
    if not isinstance(row['File Name'], str) or not isinstance(row['STT_NS_P0000'], str):
        break
    # Named audio_id rather than `id` so the builtin id() is not shadowed
    # for the rest of the kernel session.
    audio_id = row['STT_NS_P0000']
    s3_path = row['File Name']
    sr_no = row.iloc[0]  # first column is used as the serial number
    if from_id <= sr_no <= to_id:
        local_filename = f"audio_from_s3/{audio_id}.mp3"
        if download_s3_file(s3_path, local_filename):
            # Measure the file that was just downloaded.
            duration = get_audio_duration(local_filename)
        else:
            # Download failed: keep the row, but with no duration.
            duration = None
        print(f"Audio duration for {local_filename}: {duration}")
        durations.append((audio_id, duration))



In [None]:
# Convert list to DataFrame and save to CSV
output_csv = 'nw_gh_audio_durations.csv'
df_durations = pd.DataFrame(durations, columns=['File Name', 'Duration'])
df_durations.to_csv(output_csv, index=False)

# Report the path that was actually written.
print(f"Durations have been saved to {output_csv}")