In [10]:
import yt_dlp
import pandas as pd
import os
from pathlib import Path


# Hard-coded absolute locations used by main():
#   csv_path      - CSV file listing the clips to extract; main() requires the
#                   columns 'Link', 'Start-time', 'End-time' and 'Video-name'.
#   output_folder - directory that receives the downloaded .mp4 files; main()
#                   creates it (including parents) if it does not exist.
csv_path = "/Users/louiscoussement/code/VERA/data/raw/speechesURLS - Sheet1.csv"
output_folder = "/Users/louiscoussement/code/VERA/data/raw/extracted-videos"


def parse_time_to_seconds(time_str):
    """
    Convert a timestamp to a total number of seconds.

    Accepts ``MM:SS``, ``HH:MM:SS`` and a bare number of seconds. Input that
    is not a string (for example an integer cell read from a CSV) is
    converted with ``str()`` before parsing.

    Args:
        time_str: Time value such as "1:25", "01:02:03" or "90"

    Returns:
        Total seconds as integer

    Raises:
        ValueError: If the value has more than three ':'-separated fields,
            or any field is not a non-negative whole number.
    """
    fields = str(time_str).strip().split(':')
    if len(fields) > 3:
        raise ValueError(
            f"Failed to parse time '{time_str}': Invalid time format: {time_str}"
        )

    try:
        values = [int(field) for field in fields]
    except ValueError as exc:
        # Chain the cause so the offending field stays visible in tracebacks.
        raise ValueError(f"Failed to parse time '{time_str}': {exc}") from exc

    if any(value < 0 for value in values):
        raise ValueError(
            f"Failed to parse time '{time_str}': negative values are not allowed"
        )

    # Fold most-significant field first: ((hours * 60) + minutes) * 60 + seconds.
    total_seconds = 0
    for value in values:
        total_seconds = total_seconds * 60 + value
    return total_seconds


def _seconds_to_hms(total_seconds):
    """Format a non-negative whole number of seconds as HH:MM:SS."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def download_trimmed_video(url, start_time, end_time, video_name, output_folder):
    """
    Download a trimmed segment of a video using yt-dlp with ffmpeg as the
    external downloader.

    Problems are reported with a printed message instead of being raised, so
    a batch caller can continue with the next video.

    Args:
        url: Video URL (YouTube or other supported site)
        start_time: Start time, in a format accepted by parse_time_to_seconds
        end_time: End time, in a format accepted by parse_time_to_seconds
        video_name: Output filename (".mp4" is appended when missing)
        output_folder: Folder to save the video

    Returns:
        True when the download finished without error, False otherwise.
    """
    # Parse the times on their own so that only genuine parsing failures are
    # reported as such (a ValueError raised later would otherwise be mislabelled).
    try:
        start_seconds = parse_time_to_seconds(start_time)
        end_seconds = parse_time_to_seconds(end_time)
    except ValueError as ve:
        print(f"✗ Time parsing error for {video_name}: {ve}")
        return False

    # Reject ranges that cannot describe a clip before invoking the downloader.
    if start_seconds < 0 or end_seconds <= start_seconds:
        print(
            f"✗ Invalid time range for {video_name}: "
            f"end time ({end_time}) must be after start time ({start_time})"
        )
        return False

    try:
        # Convert back to HH:MM:SS format for ffmpeg
        start_hms = _seconds_to_hms(start_seconds)
        end_hms = _seconds_to_hms(end_seconds)

        # Ensure video_name has .mp4 extension
        video_name = str(video_name)
        if not video_name.endswith('.mp4'):
            video_name = f"{video_name}.mp4"

        # Full output path
        output_path = os.path.join(output_folder, video_name)

        # Configure yt_dlp options with ffmpeg trimming
        ydl_opts = {
            'format': 'best[ext=mp4]+bestaudio[ext=m4a]/best',
            # 'outtmpl' is an output template in which '%' introduces a field;
            # double it so a literal '%' in the path is kept as-is.
            'outtmpl': output_path.replace('%', '%%'),
            'merge_output_format': 'mp4',
            'external_downloader': 'ffmpeg',
            'external_downloader_args': [
                '-ss', start_hms,
                '-to', end_hms
            ],
        }

        # Download the trimmed video
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

    except yt_dlp.utils.DownloadError as de:
        print(f"✗ Download failed for {video_name}: {de}")
        return False
    except Exception as e:
        # Best-effort batch step: report anything unexpected and carry on.
        print(f"✗ Unexpected error for {video_name}: {e}")
        return False

    print(f"✓ Successfully downloaded: {video_name} (from {start_time} to {end_time})")
    return True



def main():
    """
    Read the CSV at csv_path and download one trimmed clip per row.

    The CSV must provide the columns 'Link', 'Start-time', 'End-time' and
    'Video-name'. Clips are written to output_folder, which is created when
    missing. Problems with the CSV are printed and end the run; each row is
    handed to download_trimmed_video, and rows for which it returns False
    are listed in the final summary.
    """
    # Create output folder if it doesn't exist
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    print(f"Output folder: {output_folder}")

    # Only the read itself is guarded, so that errors raised later are not
    # misreported as CSV read errors.
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: CSV file not found at '{csv_path}'")
        return
    except pd.errors.EmptyDataError:
        print(f"Error: CSV file is empty at '{csv_path}'")
        return
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return

    # Validate required columns
    required_columns = ['Link', 'Start-time', 'End-time', 'Video-name']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing required columns: {missing_columns}")
        return

    total = len(df)
    print(f"Found {total} videos to process\n")

    # Count by position rather than by index label so the progress display
    # stays correct whatever index the DataFrame carries.
    failed_names = []
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"[{position}/{total}] Processing: {row['Video-name']}")
        result = download_trimmed_video(
            url=row['Link'],
            start_time=row['Start-time'],
            end_time=row['End-time'],
            video_name=row['Video-name'],
            output_folder=output_folder
        )
        # Only an explicit False counts as a failure; any other return value
        # (including None) is treated as "no failure reported".
        if result is False:
            failed_names.append(str(row['Video-name']))

    if failed_names:
        print(
            f"\n✗ Processing complete with {len(failed_names)} failed video(s): "
            f"{', '.join(failed_names)}"
        )
    else:
        print(f"\n✓ Processing complete! All videos saved to '{output_folder}' folder.")

# Start the batch download only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Output folder: /Users/louiscoussement/code/VERA/data/raw/extracted-videos
Found 57 videos to process

[1/57] Processing: 23
[youtube] Extracting URL: https://www.youtube.com/watch?v=PczCM3GwB4Q
[youtube] PczCM3GwB4Q: Downloading webpage




[youtube] PczCM3GwB4Q: Downloading android sdkless player API JSON
[youtube] PczCM3GwB4Q: Downloading web safari player API JSON




[youtube] PczCM3GwB4Q: Downloading m3u8 information




[info] PczCM3GwB4Q: Downloading 1 format(s): 301
[download] Sleeping 6.00 seconds as required by the site...
[download] Destination: /Users/louiscoussement/code/VERA/data/raw/extracted-videos/23.mp4


[hls @ 0x15d022c90] Opening 'https://rr1---sn-ovn-v2vr.googlevideo.com/videoplayback/id/3dccc23371b00784/itag/301/source/youtube/expire/1764978726/ei/xhszacvSA_eZv_IPvca_2Q4/ip/195.23.151.229/requiressl/yes/ratebypass/yes/pfa/1/sgoap/clen%3D2936140%3Bdur%3D181.371%3Bgir%3Dyes%3Bitag%3D140%3Blmt%3D1746272343518041/sgovp/clen%3D62511487%3Bdur%3D181.314%3Bgir%3Dyes%3Bitag%3D299%3Blmt%3D1746272346831159/rqh/1/hls_chunk_host/rr1---sn-ovn-v2vr.googlevideo.com/xpc/EgVo2aDSNQ%3D%3D/cps/0/met/1764957126,/mh/NN/mm/31,29/mn/sn-ovn-v2vr,sn-ovn-apn6/ms/au,rdu/mv/m/mvi/1/pl/24/rms/au,au/initcwndbps/2078750/bui/AYUSA3A4dHmI_Q9Di-xGI8nccWk3JcicabCOQns2x0JvIv7W-mvaAVc1mLWD5TeALjJTCeyEwm8MCTRP/spc/wH4Qq67_BVwD93zWEdlrBa_K7p4nr-mHTcV0Z3PQEjOoPnzulh6etZ6LAIz0D8ftgL-KquekJ28/vprv/1/playlist_type/CLEAN/txp/5309224/mt/1764956715/fvip/2/keepalive/yes/fexp/51355912,51552689,51565116,51565682,51580968/sparams/expire,ei,ip,id,itag,source,requiressl,ratebypass,pfa,sgoap,sgovp,rqh,xpc,bui,spc,vprv,playlist_type/si

[download] 100% of   24.06MiB in 00:00:02 at 8.21MiB/s
✓ Successfully downloaded: 23.mp4 (from 1:25 to 2:30)
[2/57] Processing: 6
[youtube] Extracting URL: https://www.youtube.com/watch?v=5BKZ3Syet2I
[youtube] 5BKZ3Syet2I: Downloading webpage


[https @ 0x15c857600] Opening 'https://rr1---sn-ovn-v2vr.googlevideo.com/videoplayback/id/3dccc23371b00784/itag/301/source/youtube/expire/1764978726/ei/xhszacvSA_eZv_IPvca_2Q4/ip/195.23.151.229/requiressl/yes/ratebypass/yes/pfa/1/sgoap/clen%3D2936140%3Bdur%3D181.371%3Bgir%3Dyes%3Bitag%3D140%3Blmt%3D1746272343518041/sgovp/clen%3D62511487%3Bdur%3D181.314%3Bgir%3Dyes%3Bitag%3D299%3Blmt%3D1746272346831159/rqh/1/hls_chunk_host/rr1---sn-ovn-v2vr.googlevideo.com/xpc/EgVo2aDSNQ%3D%3D/cps/0/met/1764957126,/mh/NN/mm/31,29/mn/sn-ovn-v2vr,sn-ovn-apn6/ms/au,rdu/mv/m/mvi/1/pl/24/rms/au,au/initcwndbps/2078750/bui/AYUSA3A4dHmI_Q9Di-xGI8nccWk3JcicabCOQns2x0JvIv7W-mvaAVc1mLWD5TeALjJTCeyEwm8MCTRP/spc/wH4Qq67_BVwD93zWEdlrBa_K7p4nr-mHTcV0Z3PQEjOoPnzulh6etZ6LAIz0D8ftgL-KquekJ28/vprv/1/playlist_type/CLEAN/txp/5309224/mt/1764956715/fvip/2/keepalive/yes/fexp/51355912,51552689,51565116,51565682,51580968/sparams/expire,ei,ip,id,itag,source,requiressl,ratebypass,pfa,sgoap,sgovp,rqh,xpc,bui,spc,vprv,playlist_type/

[youtube] 5BKZ3Syet2I: Downloading android sdkless player API JSON
[youtube] 5BKZ3Syet2I: Downloading web safari player API JSON




[youtube] 5BKZ3Syet2I: Downloading m3u8 information




[info] 5BKZ3Syet2I: Downloading 1 format(s): 96
[download] Sleeping 5.00 seconds as required by the site...
[download] Destination: /Users/louiscoussement/code/VERA/data/raw/extracted-videos/6.mp4


[hls @ 0x125722d60] Opening 'https://rr5---sn-ovn-v2ve.googlevideo.com/videoplayback/id/e41299dd2c9eb762/itag/96/source/youtube/expire/1764978736/ei/0BszaZSRJ6eC6dsP4JCMsQg/ip/195.23.151.229/requiressl/yes/ratebypass/yes/pfa/1/sgoap/clen%3D3188514%3Bdur%3D196.974%3Bgir%3Dyes%3Bitag%3D140%3Blmt%3D1672133986562294/sgovp/clen%3D38141815%3Bdur%3D196.920%3Bgir%3Dyes%3Bitag%3D137%3Blmt%3D1672134002999882/rqh/1/hls_chunk_host/rr5---sn-ovn-v2ve.googlevideo.com/xpc/EgVo2aDSNQ%3D%3D/cps/0/met/1764957136,/mh/kQ/mm/31,29/mn/sn-ovn-v2ve,sn-ovn-apne/ms/au,rdu/mv/m/mvi/5/pcm2cms/yes/pl/24/rms/au,au/initcwndbps/1426250/bui/AYUSA3CUTF9E5lZGf2qjxB_9ikJxmnPFAOm-Sj0vu1YsvNvfQvlmGQF6XBwfIk0Hz7t_tYJIka3f5l1f/spc/wH4QqxceAmjuO4Hwa_pNpf1329tT7pyjJ-ThXtgQQhXyo6reS9o9PjY8UdSSj8xqstzO1g/vprv/1/playlist_type/CLEAN/txp/2216224/mt/1764956715/fvip/8/keepalive/yes/fexp/51355912,51552689,51565115,51565681,51580968/sparams/expire,ei,ip,id,itag,source,requiressl,ratebypass,pfa,sgoap,sgovp,rqh,xpc,bui,spc,vprv,playlist_t

[download] 100% of   11.47MiB in 00:00:02 at 4.67MiB/s


[https @ 0x12587b400] Opening 'https://rr5---sn-ovn-v2ve.googlevideo.com/videoplayback/id/e41299dd2c9eb762/itag/96/source/youtube/expire/1764978736/ei/0BszaZSRJ6eC6dsP4JCMsQg/ip/195.23.151.229/requiressl/yes/ratebypass/yes/pfa/1/sgoap/clen%3D3188514%3Bdur%3D196.974%3Bgir%3Dyes%3Bitag%3D140%3Blmt%3D1672133986562294/sgovp/clen%3D38141815%3Bdur%3D196.920%3Bgir%3Dyes%3Bitag%3D137%3Blmt%3D1672134002999882/rqh/1/hls_chunk_host/rr5---sn-ovn-v2ve.googlevideo.com/xpc/EgVo2aDSNQ%3D%3D/cps/0/met/1764957136,/mh/kQ/mm/31,29/mn/sn-ovn-v2ve,sn-ovn-apne/ms/au,rdu/mv/m/mvi/5/pcm2cms/yes/pl/24/rms/au,au/initcwndbps/1426250/bui/AYUSA3CUTF9E5lZGf2qjxB_9ikJxmnPFAOm-Sj0vu1YsvNvfQvlmGQF6XBwfIk0Hz7t_tYJIka3f5l1f/spc/wH4QqxceAmjuO4Hwa_pNpf1329tT7pyjJ-ThXtgQQhXyo6reS9o9PjY8UdSSj8xqstzO1g/vprv/1/playlist_type/CLEAN/txp/2216224/mt/1764956715/fvip/8/keepalive/yes/fexp/51355912,51552689,51565115,51565681,51580968/sparams/expire,ei,ip,id,itag,source,requiressl,ratebypass,pfa,sgoap,sgovp,rqh,xpc,bui,spc,vprv,playlist

✓ Successfully downloaded: 6.mp4 (from 0:27 to 1:28)
[3/57] Processing: 24
[youtube] Extracting URL: https://www.youtube.com/watch?v=y8hXl-RZe_s
[youtube] y8hXl-RZe_s: Downloading webpage




[youtube] y8hXl-RZe_s: Downloading android sdkless player API JSON
[youtube] y8hXl-RZe_s: Downloading web safari player API JSON




[youtube] y8hXl-RZe_s: Downloading m3u8 information




[info] y8hXl-RZe_s: Downloading 1 format(s): 96
[download] Sleeping 5.00 seconds as required by the site...
[download] Destination: /Users/louiscoussement/code/VERA/data/raw/extracted-videos/24.mp4


[hls @ 0x15a722cc0] Opening 'https://rr1---sn-ovn-v2ve.googlevideo.com/videoplayback/id/cbc85797e4597bfb/itag/96/source/youtube/expire/1764978745/ei/2Rszaa6XM8mC6dsP0Ke10AI/ip/195.23.151.229/requiressl/yes/ratebypass/yes/pfa/1/sgoap/clen%3D3338177%3Bdur%3D206.216%3Bgir%3Dyes%3Bitag%3D140%3Blmt%3D1758074269758527/sgovp/clen%3D30790802%3Bdur%3D206.160%3Bgir%3Dyes%3Bitag%3D137%3Blmt%3D1758074279722718/rqh/1/hls_chunk_host/rr1---sn-ovn-v2ve.googlevideo.com/xpc/EgVo2aDSNQ%3D%3D/cps/0/met/1764957145,/mh/7a/mm/31,29/mn/sn-ovn-v2ve,sn-ovn-apnl/ms/au,rdu/mv/m/mvi/1/pl/24/rms/au,au/initcwndbps/1426250/bui/AYUSA3Dd6oHylIVYE1stJLdKF7sSaWL7ngXt01uWv_97QSYA3aKI5MRpdS9YeGepxJN-WV-Admmbm1J5/spc/wH4Qq-J_tvFOPu4vyjHvTP_BLpikLCulF3VS14GFMOOoZehkiDJ5WSKyGBmoaGpIzFjak0BjYS0/vprv/1/playlist_type/CLEAN/txp/5309224/mt/1764956715/fvip/8/keepalive/yes/fexp/51355912,51552689,51565116,51565681,51580968/sparams/expire,ei,ip,id,itag,source,requiressl,ratebypass,pfa,sgoap,sgovp,rqh,xpc,bui,spc,vprv,playlist_type/sig

[download] 100% of   10.34MiB in 00:00:05 at 2.01MiB/s
✓ Successfully downloaded: 24.mp4 (from 2:00 to 3:00)
[4/57] Processing: 21
[youtube] Extracting URL: https://www.youtube.com/watch?v=eHbpbGi7keg
[youtube] eHbpbGi7keg: Downloading webpage




[youtube] eHbpbGi7keg: Downloading android sdkless player API JSON
[youtube] eHbpbGi7keg: Downloading web safari player API JSON




[youtube] eHbpbGi7keg: Downloading m3u8 information




[info] eHbpbGi7keg: Downloading 1 format(s): 95
[download] Sleeping 6.00 seconds as required by the site...
[download] Destination: /Users/louiscoussement/code/VERA/data/raw/extracted-videos/21.mp4


[hls @ 0x141f05020] Opening 'https://rr2---sn-ovn-v2vr.googlevideo.com/videoplayback/id/7876e96c68bb91e8/itag/95/source/youtube/expire/1764978757/ei/5RszadWUIfKfp-oP8YGd4Qk/ip/195.23.151.229/requiressl/yes/ratebypass/yes/pfa/1/sgoap/clen%3D4647790%3Bdur%3D287.137%3Bgir%3Dyes%3Bitag%3D140%3Blmt%3D1727098921476752/sgovp/clen%3D24955026%3Bdur%3D287.086%3Bgir%3Dyes%3Bitag%3D136%3Blmt%3D1727098897079224/rqh/1/hls_chunk_host/rr2---sn-ovn-v2vr.googlevideo.com/xpc/EgVo2aDSNQ%3D%3D/cps/0/met/1764957157,/mh/qk/mm/31,29/mn/sn-ovn-v2vr,sn-1vo-apn6/ms/au,rdu/mv/m/mvi/2/pcm2cms/yes/pl/24/rms/au,au/initcwndbps/2078750/bui/AdEuB5ToNUsr2ONDwfw_M4YAir0QHEre5Xcw5OKIo3aQpHfbrz9OOSpAhTrnFycCKIPXC-YyAKhXQ-Fb/spc/6b0G_MnnMw3RENhA4j52Mv5ws4IN3s6SJH-hJuZs5JIA9VbTbBIFyGOKl31JwId38Yh0Ag/vprv/1/playlist_type/CLEAN/txp/5309224/mt/1764956715/fvip/1/keepalive/yes/fexp/51355912,51552689,51565115,51565681,51580968/sparams/expire,ei,ip,id,itag,source,requiressl,ratebypass,pfa,sgoap,sgovp,rqh,xpc,bui,spc,vprv,playlist_t

[download] 100% of    3.86MiB in 00:00:03 at 1.03MiB/s


[out#0/mp4 @ 0x141e42f00] video:2965KiB audio:955KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.955614%
frame= 1624 fps=551 q=-1.0 Lsize=    3958KiB time=00:04:00.03 bitrate= 135.1kbits/s speed=81.4x elapsed=0:00:02.94    


✓ Successfully downloaded: 21.mp4 (from 3:00 to 4:00)
[5/57] Processing: 22


KeyboardInterrupt: 