In [None]:
! pip install python-dotenv
! pip install pydub
! pip install pandas
! pip install psycopg2
! pip install google-auth google-auth-httplib2 google-api-python-client google-auth-oauthlib
! pip install tqdm

In [None]:
import pandas as pd
from pydub import AudioSegment

In [None]:
import sys

sys.path.append('../util')

from util import read_spreadsheet

In [None]:
# Load the sheet with this ID into `df` through the project's `read_spreadsheet` helper.
# NOTE(review): `df` is reassigned from a different sheet ID in a later cell before
# any visible use of this value — confirm whether this load is still needed.
df = read_spreadsheet(sheet_id="1f5KlXPou3UtmRCCEVgwgoJms5WkRiAqb")

In [None]:
import requests
import os
import subprocess

def download_gh_file(url, local_filename, timeout=60):
    """Stream the resource at ``url`` to ``local_filename`` on disk.

    The body is first written to ``<local_filename>.part`` and only renamed to
    the final name once the transfer has completed, so an interrupted download
    never leaves a truncated file under the final name (callers that check
    ``os.path.exists`` would otherwise mistake it for a finished download).

    Args:
        url: Address to fetch with an HTTP GET.
        local_filename: Destination path; surrounding whitespace is stripped.
        timeout: Seconds to wait for the server to connect / send data before
            giving up. Passed straight to ``requests.get``.

    Returns:
        None. Success or failure is reported on stdout.

    Raises:
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    target_path = local_filename.strip()
    partial_path = f"{target_path}.part"

    # The context manager releases the connection even when we bail out early.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {target_path} from {url}. Status code: {response.status_code}")
            return

        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            # Also covers KeyboardInterrupt (common in notebooks): drop the
            # incomplete file, then let the original error propagate.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Atomic rename: the final name only ever refers to a complete file.
    os.replace(partial_path, target_path)
    print(f"Downloaded {target_path}")


In [None]:
def extract_audio(video_file, audio_file):
    """Extract the audio stream(s) of ``video_file`` into ``audio_file`` using ffmpeg.

    Runs ``ffmpeg -nostdin -i <video_file> -q:a 0 -map a <audio_file>``.
    Success or failure is reported on stdout; a non-zero ffmpeg exit status is
    not re-raised.

    If ffmpeg fails and ``audio_file`` did not exist before the call, any
    partial output it left behind is removed so that a later existence check
    does not mistake it for a finished extraction. A file that already existed
    before the call is never deleted.

    Args:
        video_file: Path of the input media file.
        audio_file: Path of the audio file to create.

    Returns:
        None.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    command = [
        "ffmpeg",
        "-nostdin",  # never read from the notebook kernel's stdin
        "-i", video_file,
        "-q:a", "0",
        "-map", "a",
        audio_file,
    ]
    # Remember whether the output pre-dates this call so cleanup below only
    # touches files this call may have created.
    existed_before = os.path.exists(audio_file)
    try:
        subprocess.run(command, check=True)
        print(f"Extracted audio to {audio_file}")
    except subprocess.CalledProcessError as e:
        print(f"Failed to extract audio from {video_file}. Error: {e}")
        if not existed_before and os.path.exists(audio_file):
            os.remove(audio_file)


In [None]:
from pydub import AudioSegment
import pandas as pd

# Function to get audio duration from an audio file
def get_audio_duration(audio_filename):
    """Return the length of an audio file as an ``HH:MM:SS`` string.

    The file is loaded with ``AudioSegment.from_file``; its length in
    milliseconds is truncated to whole seconds. Hours are not wrapped at 24,
    so a 25-hour recording is reported as ``25:00:00``.

    Args:
        audio_filename: Path of the audio file to inspect.

    Returns:
        The duration formatted as ``HH:MM:SS``, or ``None`` if the file could
        not be loaded (the error is printed rather than raised).
    """
    try:
        audio = AudioSegment.from_file(audio_filename)
        total_seconds = len(audio) // 1000  # len() is in milliseconds
        # Plain arithmetic instead of a datetime round-trip: formatting a
        # timestamp with %H silently wraps durations of 24 hours or more.
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    except Exception as e:
        # Best-effort: callers record None for files that cannot be read.
        print(f"Error fetching duration for {audio_filename}: {e}")
        return None


In [None]:
# Ensure directories exist
os.makedirs('video_from_gh', exist_ok=True)
os.makedirs('audio_from_gh', exist_ok=True)

# Inclusive range of values in the sheet's first column to process.
from_id = 214
to_id = 253

df = read_spreadsheet(sheet_id="1g6RVrHXygLzg4d2IduliUpg8fzvJzqYcbddS02nvyv8")

# (file name, duration) pairs for every row handled in this run.
durations = []

for index, row in df.iterrows():
    # Stop at the first row lacking a textual file name or URL.
    # NOTE(review): presumably this marks the end of the populated rows;
    # confirm a blank row in the middle of the sheet should halt processing.
    if not isinstance(row['File Name'], str) or not isinstance(row['asset_url'], str):
        break

    sr_no = row.iloc[0]
    if not (from_id <= sr_no <= to_id):
        continue

    # Strip the name itself: stripping the assembled path is a no-op because
    # the path starts with the directory and ends with the extension.
    file_id = row['File Name'].strip()
    url_path = row['asset_url']
    video_filename = f"video_from_gh/{file_id}.MP4"
    audio_filename = f"audio_from_gh/{file_id}.wav"

    if os.path.exists(audio_filename):
        print(f"Audio file {audio_filename} already exists. Skipping extraction.")
    else:
        if not os.path.exists(video_filename):
            download_gh_file(url_path, video_filename)
        extract_audio(video_filename, audio_filename)

    # Record the duration even when extraction was skipped, so re-running the
    # notebook still produces a complete list for the selected range.
    duration = get_audio_duration(audio_filename)
    print(f"Audio duration for {audio_filename}: {duration}")
    durations.append((file_id, duration))


In [None]:
# Convert the collected (file name, duration) pairs to a DataFrame and save to CSV.
# The path lives in one variable so the confirmation message cannot drift
# from the file that is actually written.
durations_csv = 'nw_gh_audio_durations.csv'
df_durations = pd.DataFrame(durations, columns=['File Name', 'Duration'])
df_durations.to_csv(durations_csv, index=False)

print(f"Durations have been saved to {durations_csv}")