In [4]:
import re
from googleapiclient.discovery import build

# Function to get comments from a YouTube video
def get_comments(video_id, api_key):
    """Collect the text of every top-level comment on a YouTube video.

    Pages through the YouTube Data API v3 ``commentThreads.list`` endpoint
    until the API reports no further page.

    Args:
        video_id: ID of the video whose comment threads are listed.
        api_key: API key handed to ``build`` as ``developerKey``.

    Returns:
        list[str]: The ``textDisplay`` of each thread's top-level comment, in
        the order the API returned them. Replies are not read.

    Raises:
        Whatever ``request.execute()`` raises on an API or network failure
        (presumably ``googleapiclient.errors.HttpError`` -- confirm).
    """
    youtube = build("youtube", "v3", developerKey=api_key)
    threads = youtube.commentThreads()
    comments = []

    # Ask for the largest page the endpoint allows so that long comment
    # sections need far fewer round-trips (and less quota) than the default.
    request = threads.list(
        part="snippet",
        videoId=video_id,
        textFormat="plainText",
        maxResults=100,
    )

    # list_next() builds the follow-up request from the previous response and
    # returns None once there is no nextPageToken, which ends the loop.
    while request is not None:
        response = request.execute()
        for item in response.get("items", []):
            comments.append(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            )
        request = threads.list_next(request, response)

    return comments

# Function to extract timestamps from comments and format them as HH:MM:SS
def extract_timestamps(comments):
    """Find clock-style timestamps in comments and normalise them to HH:MM:SS.

    Recognises ``M:SS`` / ``MM:SS`` (hours assumed to be 00) as well as
    ``H:MM:SS`` / ``HH:MM:SS``. A candidate is ignored when it is glued to
    further digits on either side (e.g. ``12:345``).

    Args:
        comments: Iterable of comment strings to scan.

    Returns:
        list[str]: One zero-padded ``"HH:MM:SS"`` string per timestamp found,
        in order of appearance. Duplicates are kept. Field values are not
        range-checked, so ``"5:99"`` yields ``"00:05:99"``.
    """
    # A single pattern with an optional hours group. The previous alternation
    # tried MM:SS first, so "1:02:03" was read as "1:02" and the hour-bearing
    # branches could never match.
    timestamp_pattern = re.compile(
        r"(?<!\d)(?:(\d{1,2}):)?(\d{1,2}):(\d{2})(?!\d)"
    )

    timestamps = []
    for comment in comments:
        for hours, minutes, seconds in timestamp_pattern.findall(comment):
            # findall() yields "" for the hours group when it did not take part.
            timestamps.append(
                f"{(hours or '0').zfill(2)}:{minutes.zfill(2)}:{seconds}"
            )

    return timestamps

# Main function to fetch comments and extract timestamps
def get_video_timestamps(video_url, api_key):
    """Return the timestamps mentioned in the comments of a YouTube video.

    The video ID is taken as the text after the last ``"v="`` in
    ``video_url``, cut at the first ``"&"``. A URL without ``"v="`` is
    therefore passed on unchanged up to its first ``"&"``.

    Args:
        video_url: URL of the video.
        api_key: API key forwarded to ``get_comments``.

    Returns:
        The list produced by ``extract_timestamps`` for the fetched comments.
    """
    # rpartition/partition give the same pieces as split(...)[-1] / split(...)[0].
    after_marker = video_url.rpartition("v=")[2]
    video_id = after_marker.partition("&")[0]

    return extract_timestamps(get_comments(video_id, api_key))

# Example usage
import os
from getpass import getpass

# Never store an API key in the notebook: it ends up in version control and in
# every shared copy. Read it from the environment, or prompt without echoing.
# NOTE: the key that used to be hardcoded here should be treated as leaked and
# revoked/rotated in the Google Cloud console.
api_key = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")
video_url = "https://www.youtube.com/watch?v=clKiW01SVgQ"

timestamps = get_video_timestamps(video_url, api_key)
print("Timestamps found:", timestamps)


Timestamps found: ['00:03:36', '00:20:45', '00:07:53', '00:01:24', '00:01:31', '00:09:46', '00:05:40', '00:10:50', '00:01:32', '00:03:04', '00:07:42', '00:07:08', '00:13:33', '00:01:31', '00:11:50', '00:01:30', '00:10:36', '00:07:11', '00:00:38', '00:12:18', '00:05:11', '00:12:02']


In [8]:
import re
import random
import pandas as pd
import os
import subprocess
from youtube_transcript_api import YouTubeTranscriptApi
import yt_dlp

def extract_video_id(youtube_url):
    """Pull an 11-character video ID out of a YouTube URL.

    Searches for the first run of exactly 11 characters from
    ``[0-9A-Za-z_-]`` that directly follows either ``v=`` or a ``/``.

    Args:
        youtube_url: The URL text to search.

    Returns:
        The 11-character ID, or ``None`` when nothing in the text matches.
    """
    found = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", youtube_url)
    if found is None:
        return None
    return found.group(1)

def get_youtube_transcript(video_id):
    """Fetch a video's transcript via ``YouTubeTranscriptApi.get_transcript``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        Whatever ``get_transcript`` returns on success. If any ``Exception``
        is raised, the exception's message is returned as a ``str`` instead,
        so callers must check the type of the result.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as exc:
        # Errors are reported through the return value rather than re-raised.
        return str(exc)

def generate_segments_from_timestamps(timestamps, duration=5, max_segments=3):
    """Build fixed-length segments starting at the first few timestamps.

    Args:
        timestamps: Indexable sequence of start times in seconds.
        duration: Length of every segment in seconds (default 5).
        max_segments: Upper bound on how many leading timestamps are used
            (default 3, the previously hard-coded value). Values <= 0 yield
            no segments.

    Returns:
        list[tuple]: One ``(start, end, start_str, end_str)`` tuple per
        segment, where ``end = start + duration`` and the ``*_str`` values are
        the same instants rendered as ``"HH:MM:SS"``.
    """
    def _as_clock(seconds):
        # Interpret the number as seconds and keep only the time-of-day part.
        return pd.to_datetime(seconds, unit="s").strftime("%H:%M:%S")

    segments = []
    for i in range(min(max_segments, len(timestamps))):
        start_time = timestamps[i]
        end_time = start_time + duration
        segments.append(
            (start_time, end_time, _as_clock(start_time), _as_clock(end_time))
        )
    return segments

def download_clip(youtube_url, start, end, output_name):
    """Download a YouTube video with yt-dlp and cut one segment out with ffmpeg.

    The full video is first saved to ``temp_video.mp4`` in the working
    directory, then ffmpeg copies the ``start``..``end`` range (no re-encode,
    ``-c copy``) into ``clips/<output_name>.mp4``. The temporary file is
    removed before returning, including when an error occurs.

    Args:
        youtube_url: URL passed to yt-dlp.
        start: Segment start, passed to ffmpeg ``-ss`` via ``str()``.
        end: Segment end, passed to ffmpeg ``-to`` via ``str()``.
        output_name: Base name (without extension) of the clip file.

    Returns:
        The clip path ``"clips/<output_name>.mp4"`` on success, or ``None``
        if the downloaded file is missing or ffmpeg exits with a non-zero
        status.

    Raises:
        Exceptions from ``yt_dlp`` or from launching ffmpeg propagate to the
        caller after the temporary file has been cleaned up.
    """
    temp_filename = "temp_video.mp4"

    # Ensure we get MP4 format
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
        'outtmpl': temp_filename,
        'merge_output_format': 'mp4'
    }

    try:
        print(f"Downloading full video from {youtube_url}...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])

        # Ensure the file exists before trimming
        if not os.path.exists(temp_filename):
            print(f"Error: {temp_filename} not found. Check the download format.")
            return None

        os.makedirs("clips", exist_ok=True)
        output_path = f"clips/{output_name}.mp4"

        # Trim the segment using FFmpeg
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-i", temp_filename,
            "-ss", str(start),
            "-to", str(end),
            "-c", "copy",
            output_path
        ]

        print(f"Trimming segment {output_name}: {start} - {end}")
        result = subprocess.run(ffmpeg_cmd)

        # Do not report a clip as saved when ffmpeg itself reported failure.
        if result.returncode != 0:
            print(f"Error: ffmpeg exited with code {result.returncode} while trimming {output_name}.")
            return None

        return output_path
    finally:
        # Always drop the (potentially very large) full download, even when
        # the download or the trim raised.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

# User Input
youtube_url = input("Enter YouTube URL: ")
video_id = extract_video_id(youtube_url)

if not video_id:
    print("Invalid YouTube URL")
else:
    transcript_data = get_youtube_transcript(video_id)

    if not isinstance(transcript_data, list):
        # Anything other than a list is treated as an error description.
        print("Error:", transcript_data)
    else:
        # One row per transcript entry; only the "start" column is used.
        df = pd.DataFrame(transcript_data)
        timestamps = df["start"].tolist()
        selected_segments = generate_segments_from_timestamps(timestamps)

        for idx, (start, end, start_str, end_str) in enumerate(selected_segments):
            print(f"\nDownloading Segment {idx+1}: {start_str} - {end_str}")
            clip_path = download_clip(youtube_url, start, end, f"segment_{idx+1}")
            if clip_path:
                print(f"Saved: {clip_path}")
            else:
                print("Download failed.")


Enter YouTube URL: https://www.youtube.com/watch?v=clKiW01SVgQ

Downloading Segment 1: 00:00:00 - 00:00:05
Downloading full video from https://www.youtube.com/watch?v=clKiW01SVgQ...
[youtube] Extracting URL: https://www.youtube.com/watch?v=clKiW01SVgQ
[youtube] clKiW01SVgQ: Downloading webpage
[youtube] clKiW01SVgQ: Downloading tv client config
[youtube] clKiW01SVgQ: Downloading player 69f581a5
[youtube] clKiW01SVgQ: Downloading tv player API JSON
[youtube] clKiW01SVgQ: Downloading ios player API JSON
[youtube] clKiW01SVgQ: Downloading m3u8 information
[info] Testing format 625
[info] clKiW01SVgQ: Downloading 1 format(s): 625+140
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 287
[download] Destination: temp_video.f625.mp4
[download] 100% of    1.04GiB in 00:00:23 at 46.12MiB/s                  
[download] Destination: temp_video.f140.m4a
[download] 100% of   23.09MiB in 00:00:00 at 58.49MiB/s  
[Merger] Merging formats into "temp_video.mp4"
Deleting original file t