In [None]:
!pip install youtube_transcript_api

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-1.0.2-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.2-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.9 MB[0m [31m21.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m34.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-1.0.2


In [29]:
import re
import random
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

def extract_video_id(youtube_url):
    """
    Extracts the 11-character Video ID from a given YouTube URL.

    Recognised forms are ``watch?v=ID`` (``v`` anywhere in the query string),
    ``youtu.be/ID``, ``/embed/ID``, ``/shorts/ID``, ``/live/ID`` and ``/v/ID``.

    Returns the ID as a string, or None when `youtube_url` is not a string
    or contains no recognisable ID.
    """
    if not isinstance(youtube_url, str):
        return None

    # Anchor the ID to a known URL marker rather than to "any slash", so that
    # ordinary 11-character path segments (e.g. the start of a channel ID) are
    # not mistaken for a video ID. The negative lookahead rejects runs longer
    # than 11 ID characters instead of silently truncating them.
    pattern = (
        r"(?:\bv=|youtu\.be/|/(?:embed|shorts|live|v)/)"
        r"([0-9A-Za-z_-]{11})"
        r"(?![0-9A-Za-z_-])"
    )
    match = re.search(pattern, youtube_url)
    return match.group(1) if match else None

def get_youtube_transcript(video_id):
    """
    Fetches the transcript of a YouTube video along with timestamps.

    Returns a list of dictionaries (text, start time, duration) on success.
    On any failure the error message is returned as a string instead, so
    callers distinguish the two cases with ``isinstance(result, list)``.
    """
    try:
        # Prefer the instance-based API when the installed library offers it;
        # the static get_transcript() is the legacy entry point and is kept
        # only as a fallback for older releases.
        if hasattr(YouTubeTranscriptApi, "fetch"):
            return YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Deliberate best-effort: the caller prints this message.
        return str(e)

def pair_timestamps(df, window=30):
    """
    Builds (start, end) pairs from the "start" column of `df`.

    Each pair runs from an anchor timestamp to the first later timestamp that
    lies at least `window` seconds after it. The search then resumes with the
    entry after that end timestamp, so everything inside a chosen window is
    skipped. Anchors with no such later timestamp produce no pair.
    """
    starts = df["start"].tolist()
    total = len(starts)
    windows = []
    pos = 0

    while pos < total - 1:
        anchor = starts[pos]

        # Index of the first later timestamp that closes the window, if any
        closing = next(
            (k for k in range(pos + 1, total) if starts[k] - anchor >= window),
            None,
        )
        if closing is not None:
            windows.append((anchor, starts[closing]))
            pos = closing  # jump past everything inside this window
        pos += 1

    return windows

def generate_segments(pairs, num_segments=4):
    """
    Picks `num_segments` random pairs and returns them as HH:MM:SS strings.

    When fewer than `num_segments` pairs exist, a notice is printed and every
    pair is used, in its original order. Each result item is a
    (start_str, end_str) tuple.
    """
    def _clock(seconds):
        # Seconds offset -> "HH:MM:SS"
        return pd.to_datetime(seconds, unit="s").strftime("%H:%M:%S")

    if len(pairs) >= num_segments:
        chosen = random.sample(pairs, num_segments)
    else:
        print("Not enough timestamp pairs available, selecting all available pairs.")
        chosen = pairs

    return [(_clock(begin), _clock(finish)) for begin, finish in chosen]

# Ask for the video URL and derive its ID
youtube_url = input("Enter YouTube URL: ")
video_id = extract_video_id(youtube_url)

if not video_id:
    print("Invalid YouTube URL")
else:
    transcript_data = get_youtube_transcript(video_id)

    # A non-list result is the error message returned by get_youtube_transcript
    if not isinstance(transcript_data, list):
        print("Error:", transcript_data)
    else:
        # Load the transcript entries into a Pandas DataFrame
        df = pd.DataFrame(transcript_data)

        # Add a human-readable HH:MM:SS column next to the raw start offsets
        start_as_datetime = pd.to_datetime(df["start"], unit="s")
        df["timestamp"] = start_as_datetime.dt.strftime("%H:%M:%S")

        # Build the candidate windows, then draw the random selection
        pairs = pair_timestamps(df)
        selected_segments = generate_segments(pairs)

        # Report the chosen windows
        print("\nSelected Video Segments:")
        for start, end in selected_segments:
            print(f"Segment: {start} - {end}")


Enter YouTube URL: https://www.youtube.com/watch?v=clKiW01SVgQ

Selected Video Segments:
Segment: 00:21:34 - 00:22:05
Segment: 00:18:17 - 00:18:49
Segment: 00:19:57 - 00:20:28
Segment: 00:13:52 - 00:14:23


In [33]:
import re
import random
import pandas as pd
import os
import subprocess
from youtube_transcript_api import YouTubeTranscriptApi
import yt_dlp

def extract_video_id(youtube_url):
    """Extracts the 11-character Video ID from a given YouTube URL.

    Handles ``watch?v=ID``, ``youtu.be/ID``, ``/embed/ID``, ``/shorts/ID``,
    ``/live/ID`` and ``/v/ID``. Returns None for non-string input or when no
    ID is present.
    """
    if not isinstance(youtube_url, str):
        return None

    # The ID must directly follow a known marker and must not be followed by
    # another ID character; a bare "/" is not enough, otherwise any
    # 11-character path segment would be returned as a video ID.
    marker = r"(?:\bv=|youtu\.be/|/(?:embed|shorts|live|v)/)"
    video_id = r"([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])"
    match = re.search(marker + video_id, youtube_url)
    return match.group(1) if match else None

def get_youtube_transcript(video_id):
    """Fetches the transcript of a YouTube video along with timestamps.

    Returns the transcript as a list on success, or the error message as a
    string on failure (callers check ``isinstance(result, list)``).
    """
    try:
        # Use the instance-based fetch() API when available; fall back to the
        # legacy static get_transcript() on older library releases.
        if hasattr(YouTubeTranscriptApi, "fetch"):
            fetched = YouTubeTranscriptApi().fetch(video_id)
            return fetched.to_raw_data()
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Best-effort by design: the message is shown to the user by the caller.
        return str(e)

def pair_timestamps(df, window=30):
    """Pairs each anchor in df["start"] with the first timestamp >= `window` seconds later.

    After a pair is found, scanning resumes with the entry following the
    pair's end timestamp. Anchors without a far-enough successor are dropped.
    """
    times = df["start"].tolist()
    last = len(times) - 1
    result = []
    head = 0

    while head < last:
        tail = head + 1
        # Walk forward until the gap reaches the window or the list runs out
        while tail <= last and not (times[tail] - times[head] >= window):
            tail += 1

        if tail <= last:
            result.append((times[head], times[tail]))
            head = tail
        head += 1

    return result

def generate_segments(pairs, num_segments=4):
    """Randomly picks `num_segments` pairs (or all, if fewer exist) and formats them.

    Each result item is (start, end, start_str, end_str): the raw values from
    the pair followed by their HH:MM:SS renderings.
    """
    clock_format = "%H:%M:%S"
    pool = pairs if len(pairs) < num_segments else random.sample(pairs, num_segments)

    return [
        (
            t0,
            t1,
            pd.to_datetime(t0, unit="s").strftime(clock_format),
            pd.to_datetime(t1, unit="s").strftime(clock_format),
        )
        for t0, t1 in pool
    ]

def download_clip(youtube_url, start, end, output_name):
    """Downloads a video from YouTube using yt-dlp and trims one clip out of it using ffmpeg.

    Args:
        youtube_url: URL handed to yt-dlp.
        start: Clip start, passed to ffmpeg's ``-ss`` after ``str()``.
        end: Clip end, passed to ffmpeg's ``-to`` after ``str()``.
        output_name: Base name of the clip; it is written to
            ``clips/<output_name>.mp4``.

    Returns:
        The clip path on success, or None when the download fails, the
        downloaded file is missing, ffmpeg is not installed, or ffmpeg exits
        with a non-zero status.

    NOTE(review): every call downloads the complete video again and deletes it
    afterwards, so cutting several clips from one video repeats the download.
    """
    temp_filename = "temp_video.mp4"
    output_path = f"clips/{output_name}.mp4"

    # Ensure we get MP4 format
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
        'outtmpl': temp_filename,
        'merge_output_format': 'mp4'
    }

    try:
        print(f"Downloading full video from {youtube_url}...")
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])
        except yt_dlp.utils.DownloadError as err:
            # Report and signal failure instead of aborting the caller's loop.
            print(f"Error: download failed: {err}")
            return None

        # Ensure the file exists before trimming
        if not os.path.exists(temp_filename):
            print(f"Error: {temp_filename} not found. Check the download format.")
            return None

        os.makedirs("clips", exist_ok=True)

        # Trim the segment using FFmpeg (stream copy, no re-encode)
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-i", temp_filename,
            "-ss", str(start),
            "-to", str(end),
            "-c", "copy",
            output_path
        ]

        print(f"Trimming segment {output_name}: {start} - {end}")
        try:
            result = subprocess.run(ffmpeg_cmd)
        except FileNotFoundError:
            print("Error: ffmpeg executable not found on PATH.")
            return None

        # A non-zero exit status means the clip was not (fully) written, so do
        # not report it as saved.
        if result.returncode != 0:
            print(f"Error: ffmpeg exited with status {result.returncode}.")
            return None

        return output_path
    finally:
        # Remove the full video on every exit path, including failures, so a
        # multi-gigabyte temp file is never left behind.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

# Ask for the video URL and derive its ID
youtube_url = input("Enter YouTube URL: ")
video_id = extract_video_id(youtube_url)

if not video_id:
    print("Invalid YouTube URL")
else:
    transcript_data = get_youtube_transcript(video_id)

    # Anything other than a list is the error text from get_youtube_transcript
    if not isinstance(transcript_data, list):
        print("Error:", transcript_data)
    else:
        df = pd.DataFrame(transcript_data)
        pairs = pair_timestamps(df)
        selected_segments = generate_segments(pairs)

        # Cut one clip per selected window; clips are named segment_1, segment_2, ...
        for idx, (start, end, start_str, end_str) in enumerate(selected_segments):
            print(f"\nDownloading Segment {idx+1}: {start_str} - {end_str}")
            clip_path = download_clip(youtube_url, start, end, f"segment_{idx+1}")
            if clip_path:
                print(f"Saved: {clip_path}")
            else:
                print("Download failed.")


Enter YouTube URL: https://www.youtube.com/watch?v=clKiW01SVgQhttps://www.youtube.com/watch?v=clKiW01SVgQ

Downloading Segment 1: 00:24:18 - 00:24:49
Downloading full video from https://www.youtube.com/watch?v=clKiW01SVgQhttps://www.youtube.com/watch?v=clKiW01SVgQ...
[youtube] Extracting URL: https://www.youtube.com/watch?v=clKiW01SVgQhttps://www.youtube.com/watch?v=clKiW01SVgQ
[youtube] clKiW01SVgQ: Downloading webpage
[youtube] clKiW01SVgQ: Downloading tv client config
[youtube] clKiW01SVgQ: Downloading player 69f581a5
[youtube] clKiW01SVgQ: Downloading tv player API JSON
[youtube] clKiW01SVgQ: Downloading ios player API JSON
[youtube] clKiW01SVgQ: Downloading m3u8 information
[info] Testing format 625
[info] clKiW01SVgQ: Downloading 1 format(s): 625+140
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 287
[download] Destination: temp_video.f625.mp4
[download] 100% of    1.04GiB in 00:00:36 at 29.15MiB/s                  
[download] Destination: temp_video.f140.m4a


In [35]:
import os
import subprocess

def crop_to_portrait(input_folder):
    """Center-crops every .mp4 in `input_folder` to 9:16 and replaces the original.

    Each ``name.mp4`` is re-encoded by ffmpeg (libx264 video, AAC audio) into
    ``portrait_name.mp4`` in the same folder. The landscape source is deleted
    only after ffmpeg reports success; on failure the source is kept and any
    partial output is removed. Files already named ``portrait_*`` are skipped,
    so running the function again does not crop its own results a second time.

    Args:
        input_folder: Directory containing the clips to convert.

    Returns:
        None. Progress is reported with print().
    """
    if not os.path.isdir(input_folder):
        print(f"Error: folder not found: {input_folder}")
        return

    portrait_prefix = "portrait_"

    for filename in os.listdir(input_folder):
        if not filename.endswith(".mp4") or filename.startswith(portrait_prefix):
            continue

        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(input_folder, portrait_prefix + filename)

        # FFmpeg command to crop the center of the video to 9:16.
        # -y overwrites a leftover output from an earlier run instead of
        # prompting, which cannot be answered with the streams redirected.
        command = [
            "ffmpeg", "-y", "-i", input_path,
            "-vf", "crop=in_h*9/16:in_h",
            "-c:v", "libx264", "-preset", "fast",
            "-c:a", "aac", "-b:a", "128k",
            output_path
        ]

        try:
            result = subprocess.run(
                command,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except FileNotFoundError:
            print("Error: ffmpeg executable not found on PATH.")
            return

        # Remove original landscape video only after a successful conversion;
        # the mere existence of an output file does not prove success.
        if result.returncode == 0 and os.path.exists(output_path):
            os.remove(input_path)
            print(f"Cropped and deleted: {filename}")
        else:
            if os.path.exists(output_path):
                os.remove(output_path)
            print(f"Failed to crop, kept original: {filename}")

# Convert every clip in the "clips" folder (the folder download_clip writes
# its trimmed segments to) to portrait format.
input_folder = "clips"
crop_to_portrait(input_folder)


Cropped and deleted: segment_3.mp4
Cropped and deleted: segment_2.mp4
