In [7]:
import yt_dlp
import pandas as pd
import os
from pathlib import Path


# Define the CSV path and output folder
csv_path = "/Users/libiv/code/VERA/data/raw/speechesURLS - Sheet1.csv"
output_folder = "/Users/libiv/code/VERA/data/raw/extracted-videos"


def parse_time_to_seconds(time_str):
    """
    Convert MM:SS format to total seconds.

    Args:
        time_str: Time string in MM:SS format

    Returns:
        Total seconds as integer
    """
    try:
        parts = time_str.strip().split(':')
        if len(parts) == 2:
            minutes, seconds = int(parts[0]), int(parts[1])
            return minutes * 60 + seconds
        else:
            raise ValueError(f"Invalid time format: {time_str}")
    except Exception as e:
        raise ValueError(f"Failed to parse time '{time_str}': {e}")


def download_trimmed_video(url, start_time, end_time, video_name, output_folder):
    """
    Download a trimmed segment of a video using yt-dlp with ffmpeg.

    Args:
        url: Video URL (YouTube or other supported site)
        start_time: Start time in MM:SS format
        end_time: End time in MM:SS format
        video_name: Output filename (without extension)
        output_folder: Folder to save the video
    """
    try:
        # Parse times to seconds
        start_seconds = parse_time_to_seconds(start_time)
        end_seconds = parse_time_to_seconds(end_time)

        # Convert back to HH:MM:SS format for ffmpeg
        start_hms = f"{start_seconds // 3600:02d}:{(start_seconds % 3600) // 60:02d}:{start_seconds % 60:02d}"
        end_hms = f"{end_seconds // 3600:02d}:{(end_seconds % 3600) // 60:02d}:{end_seconds % 60:02d}"

        # Ensure video_name has .mp4 extension
        if not video_name.endswith('.mp4'):
            video_name = f"{video_name}.mp4"

        # Full output path
        output_path = os.path.join(output_folder, video_name)

        # Configure yt_dlp options with ffmpeg trimming
        ydl_opts = {
            'format': 'best[ext=mp4]+bestaudio[ext=m4a]/best',
            'outtmpl': output_path,
            'merge_output_format': 'mp4',
            'external_downloader': 'ffmpeg',
            'external_downloader_args': [
                '-ss', start_hms,
                '-to', end_hms
            ],
        }

        # Download the trimmed video
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        print(f"✓ Successfully downloaded: {video_name} (from {start_time} to {end_time})")

    except ValueError as ve:
        print(f"✗ Time parsing error for {video_name}: {ve}")
    except yt_dlp.utils.DownloadError as de:
        print(f"✗ Download failed for {video_name}: {de}")
    except Exception as e:
        print(f"✗ Unexpected error for {video_name}: {e}")


def main():
    """
    Main function to read CSV and process each row.
    """
    # Create output folder if it doesn't exist
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    print(f"Output folder: {output_folder}")

    try:
        # Read the CSV file
        df = pd.read_csv(csv_path)

        # Validate required columns
        required_columns = ['Link', 'Start-time', 'End-time', 'Video-name']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Error: Missing required columns: {missing_columns}")
            return

        print(f"Found {len(df)} videos to process\n")

        # Iterate over each row and download trimmed video
        for index, row in df.iterrows():
            print(f"[{index + 1}/{len(df)}] Processing: {row['Video-name']}")
            download_trimmed_video(
                url=row['Link'],
                start_time=row['Start-time'],
                end_time=row['End-time'],
                video_name=row['Video-name'],
                output_folder=output_folder
            )

        print(f"\n✓ Processing complete! All videos saved to '{output_folder}' folder.")

    except FileNotFoundError:
        print(f"Error: CSV file not found at '{csv_path}'")
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file is empty at '{csv_path}'")
    except Exception as e:
        print(f"Error reading CSV: {e}")


if __name__ == "__main__":
    main()


Output folder: /Users/libiv/code/VERA/data/raw/extracted-videos
Found 24 videos to process

[1/24] Processing: sp_1_1
[youtube] Extracting URL: https://www.youtube.com/watch?v=mTozO0GPcCI
[youtube] mTozO0GPcCI: Downloading webpage




[youtube] mTozO0GPcCI: Downloading android sdkless player API JSON
[youtube] mTozO0GPcCI: Downloading web safari player API JSON




[youtube] mTozO0GPcCI: Downloading m3u8 information




[info] mTozO0GPcCI: Downloading 1 format(s): 96
[download] /Users/libiv/code/VERA/data/raw/extracted-videos/sp_1_1.mp4 has already been downloaded
[download] 100% of    3.71MiB
✓ Successfully downloaded: sp_1_1.mp4 (from 0:28 to 0:51)
[2/24] Processing: sp_1_2
[youtube] Extracting URL: https://www.youtube.com/watch?v=mTozO0GPcCI
[youtube] mTozO0GPcCI: Downloading webpage




[youtube] mTozO0GPcCI: Downloading android sdkless player API JSON
[youtube] mTozO0GPcCI: Downloading web safari player API JSON




[youtube] mTozO0GPcCI: Downloading m3u8 information




[info] mTozO0GPcCI: Downloading 1 format(s): 96
[download] /Users/libiv/code/VERA/data/raw/extracted-videos/sp_1_2.mp4 has already been downloaded
[download] 100% of    2.38MiB
✓ Successfully downloaded: sp_1_2.mp4 (from 1:11 to 1:24)
[3/24] Processing: sp_1_3
[youtube] Extracting URL: https://www.youtube.com/watch?v=mTozO0GPcCI
[youtube] mTozO0GPcCI: Downloading webpage




[youtube] mTozO0GPcCI: Downloading android sdkless player API JSON
[youtube] mTozO0GPcCI: Downloading web safari player API JSON




[youtube] mTozO0GPcCI: Downloading m3u8 information




[info] mTozO0GPcCI: Downloading 1 format(s): 96
[download] /Users/libiv/code/VERA/data/raw/extracted-videos/sp_1_3.mp4 has already been downloaded
[download] 100% of    2.70MiB
✓ Successfully downloaded: sp_1_3.mp4 (from 1:39 to 1:57)
[4/24] Processing: sp_1_4
[youtube] Extracting URL: https://www.youtube.com/watch?v=mTozO0GPcCI
[youtube] mTozO0GPcCI: Downloading webpage




[youtube] mTozO0GPcCI: Downloading android sdkless player API JSON
[youtube] mTozO0GPcCI: Downloading web safari player API JSON




[youtube] mTozO0GPcCI: Downloading m3u8 information




[info] mTozO0GPcCI: Downloading 1 format(s): 96
[download] /Users/libiv/code/VERA/data/raw/extracted-videos/sp_1_4.mp4 has already been downloaded
[download] 100% of    2.78MiB
✓ Successfully downloaded: sp_1_4.mp4 (from 2:20 to 2:37)
[5/24] Processing: sp_1_5
[youtube] Extracting URL: https://www.youtube.com/watch?v=mTozO0GPcCI
[youtube] mTozO0GPcCI: Downloading webpage




[youtube] mTozO0GPcCI: Downloading android sdkless player API JSON
[youtube] mTozO0GPcCI: Downloading web safari player API JSON




[youtube] mTozO0GPcCI: Downloading m3u8 information




[info] mTozO0GPcCI: Downloading 1 format(s): 96
[download] /Users/libiv/code/VERA/data/raw/extracted-videos/sp_1_5.mp4 has already been downloaded
[download] 100% of    1.62MiB
✓ Successfully downloaded: sp_1_5.mp4 (from 2:50 to 3:03)
[6/24] Processing: nan
✗ Time parsing error for nan: Failed to parse time 'nan': 'float' object has no attribute 'strip'
[7/24] Processing: nan
✗ Time parsing error for nan: Failed to parse time 'nan': 'float' object has no attribute 'strip'
[8/24] Processing: nan
✗ Time parsing error for nan: Failed to parse time 'nan': 'float' object has no attribute 'strip'
[9/24] Processing: nan
✗ Time parsing error for nan: Failed to parse time 'nan': 'float' object has no attribute 'strip'
[10/24] Processing: nan
✗ Time parsing error for nan: Failed to parse time 'nan': 'float' object has no attribute 'strip'
[11/24] Processing: nan
✗ Time parsing error for nan: Failed to parse time 'nan': 'float' object has no attribute 'strip'
[12/24] Processing: nan
✗ Time parsin