<a href="https://colab.research.google.com/github/Shibhaditya2/YouTube-Scrapper/blob/main/YouTubeScrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install and import the required libraries
!pip install yt-dlp faster-whisper opencv-python google-api-python-client pandas ffmpeg-python
import os, csv, glob
from datetime import timedelta
import yt_dlp, cv2, pandas as pd
import ffmpeg as ffmpeg_py
from faster_whisper import WhisperModel
import googleapiclient.discovery
from google.colab import userdata

# YouTube Data API key, read from the Colab secret named 'youtubeAPI'.
# It is passed as `developerKey` when the YouTube API clients are built below.
DEVELOPER_KEY = userdata.get('youtubeAPI')

# Download video using yt-dlp and return metadata
def download_video(video_url):
    """Download a video with yt-dlp and return its local path and metadata.

    Args:
        video_url: URL of the video to download.

    Returns:
        A tuple ``(filename, video_id, title, info)``. ``filename`` is the
        expected path of the downloaded ``.mp4`` file, ``video_id`` and
        ``title`` are taken from the extracted info (defaulting to ``""`` and
        ``"Untitled"``), and ``info`` is the full info dict from yt-dlp.
    """
    opts = {
        'format': 'bestvideo+bestaudio/best',
        'outtmpl': '%(title)s_[%(id)s].%(ext)s',
        'merge_output_format': 'mp4',
    }

    with yt_dlp.YoutubeDL(opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        # Resolve the output name while the downloader is still open rather
        # than using it after the context manager has closed it.
        fname = ydl.prepare_filename(info)

    # The options request an mp4 merge, so normalise the extension in case the
    # prepared name carries a different one.
    if not fname.endswith(".mp4"):
        fname = os.path.splitext(fname)[0] + ".mp4"
    return fname, info.get("id", ""), info.get("title", "Untitled"), info

# Fetch and format metadata into a readable text file
def fetch_metadata(video_id, out_dir):
    """Fetch a video's snippet and statistics and save them as a text file.

    Queries the YouTube Data API for ``video_id`` and writes the title,
    channel, publish date, view/like/comment counts and description to
    ``video_metadata.txt`` inside ``out_dir``.

    Args:
        video_id: ID of the video to look up.
        out_dir: Existing directory in which the text file is written.
    """
    yt_api = googleapiclient.discovery.build("youtube", "v3", developerKey=DEVELOPER_KEY)
    resp = yt_api.videos().list(part="snippet,statistics", id=video_id).execute()

    # "items" can be present but empty (e.g. no video matched the id). A
    # default passed to .get() would not apply then and [0] would raise
    # IndexError, so fall back explicitly and write the empty defaults.
    items = resp.get("items") or [{}]
    meta = items[0]
    snippet = meta.get("snippet", {})
    stats = meta.get("statistics", {})

    # Extract and format key metadata fields
    lines = [
        f"Title: {snippet.get('title', '')}",
        f"Author: {snippet.get('channelTitle', '')}",
        f"Published At: {snippet.get('publishedAt', '')}",
        f"Views: {stats.get('viewCount', '0')}",
        f"Likes: {stats.get('likeCount', '0')}",
        f"Comments: {stats.get('commentCount', '0')}",
        f"\nDescription:\n{snippet.get('description', '')}",
    ]
    with open(os.path.join(out_dir, "video_metadata.txt"), "w", encoding="utf-8") as f:
        f.write("\n\n".join(lines))

# Fetch and save top level comments into CSV
def fetch_comments(video_id, out_dir):
    """Fetch every top-level comment of a video and save them as a CSV file.

    Requests comment threads from the YouTube Data API 100 at a time,
    following ``nextPageToken`` until no further page is reported, and writes
    the rows to ``comments.csv`` inside ``out_dir``.

    Args:
        video_id: ID of the video whose comments are fetched.
        out_dir: Existing directory in which the CSV file is written.
    """
    yt_api = googleapiclient.discovery.build("youtube", "v3", developerKey=DEVELOPER_KEY)
    comments = []
    page_token = None

    # One loop handles the first page and every following page, so the row
    # extraction logic exists in a single place.
    while True:
        params = {"part": "snippet", "videoId": video_id, "maxResults": 100}
        if page_token is not None:
            params["pageToken"] = page_token
        resp = yt_api.commentThreads().list(**params).execute()

        for item in resp.get('items', []):
            c = item['snippet']['topLevelComment']['snippet']
            comments.append([
                c.get('authorDisplayName', ''),
                c.get('publishedAt', ''),
                c.get('likeCount', 0),
                c.get('textOriginal', ''),
                item['snippet'].get('isPublic', True),
            ])

        if "nextPageToken" not in resp:
            break
        page_token = resp["nextPageToken"]

    # NOTE: the 'updated_at' column is filled from the comment's 'publishedAt'
    # field; the column name is kept as-is to preserve the CSV schema.
    df = pd.DataFrame(comments, columns=['author', 'updated_at', 'like_count', 'text', 'public'])
    df.to_csv(os.path.join(out_dir, "comments.csv"), index=False, encoding="utf-8")

# Transcribe video using Faster-Whisper and save transcript as CSV
def generate_transcript(video_file, out_dir, model):
    """Transcribe a video word by word and save the result as a CSV file.

    The audio track is first extracted to ``audio.wav`` in ``out_dir``, then
    passed to ``model.transcribe`` with word timestamps enabled. Each word
    becomes one row of ``transcript.csv`` with its start time, end time
    (both rounded to two decimals) and stripped text.

    Args:
        video_file: Path of the video to transcribe.
        out_dir: Existing directory receiving ``audio.wav`` and
            ``transcript.csv``.
        model: Object exposing ``transcribe(path, word_timestamps=True)``;
            ``main`` passes a Faster-Whisper ``WhisperModel``.
    """
    # Extract the audio track (1 channel, 16000 sample rate) with ffmpeg
    wav_path = os.path.join(out_dir, "audio.wav")
    stream = ffmpeg_py.input(video_file)
    stream = stream.output(wav_path, ac=1, ar='16000')
    stream.run(overwrite_output=True)

    # Run the transcription; only the segments are needed here
    segments, _info = model.transcribe(wav_path, word_timestamps=True)

    # Flatten the segments into one row per word
    rows = []
    for segment in segments:
        for word in segment.words:
            rows.append({
                "start": round(word.start, 2),
                "end": round(word.end, 2),
                "word": word.word.strip(),
            })

    # Write the word-level transcript
    csv_path = os.path.join(out_dir, "transcript.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=["start", "end", "word"])
        writer.writeheader()
        writer.writerows(rows)

# Extract all frames from the video using OpenCV
def extract_frames(video_file, out_dir):
    """Save every frame of a video as a JPEG inside ``out_dir/frames``.

    Frames are named ``frame_<index>_<H:MM:SS>.jpg`` where the index is
    zero-padded to five digits and the timestamp is the frame index divided
    by the video's reported frame rate, truncated to whole seconds.

    Args:
        video_file: Path of the video to read.
        out_dir: Directory in which the ``frames`` sub-folder is created.
    """
    frames_dir = os.path.join(out_dir, "frames")
    os.makedirs(frames_dir, exist_ok=True)  # Create folder to save frames
    cap = cv2.VideoCapture(video_file)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)  # Get frames per second
        n = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # A reported frame rate of 0 would make the division below fail;
            # fall back to a zero timestamp so frames are still extracted.
            ts = n / fps if fps > 0 else 0.0
            fname = f"frame_{n:05d}_{str(timedelta(seconds=int(ts)))}.jpg"
            cv2.imwrite(os.path.join(frames_dir, fname), frame)
            n += 1
    finally:
        # Always free the capture handle, even if reading or writing fails
        cap.release()

# Function to execute all tasks
def process_video(video_url, model):
    """Run the full pipeline for one video.

    Downloads the video, moves it into a dedicated output folder named
    ``<sanitised title>_[<video id>]`` under the current working directory,
    then fetches metadata and comments, generates the transcript and extracts
    all frames into that folder. Progress is printed along the way.

    Args:
        video_url: URL of the video to process.
        model: Transcription model forwarded to ``generate_transcript``.
    """
    # Download video and get info
    print("Downloading Video:")
    video_file, vid_id, title, _ = download_video(video_url)

    # Build a folder name from the title, keeping only alphanumerics,
    # spaces, underscores and hyphens
    folder = "".join(c for c in title if c.isalnum() or c in (" ", "_", "-")).rstrip()
    out_dir = os.path.join(os.getcwd(), f"{folder}_[{vid_id}]")
    os.makedirs(out_dir, exist_ok=True)

    # Move video into output folder. os.replace overwrites an existing file
    # on every platform, so re-running on the same video does not fail.
    dest = os.path.join(out_dir, os.path.basename(video_file))
    os.replace(video_file, dest)
    video_file = dest

    # Run metadata, comments, transcript, and frame extraction
    print("Getting Metadata:")
    fetch_metadata(vid_id, out_dir)
    print("Meta Data Fetched Successfully\nFetching Comments:")
    fetch_comments(vid_id, out_dir)
    print("Fetched Comments Successfully\nGenerating Transcript:")
    generate_transcript(video_file, out_dir, model)
    print("Transcript Generated Successfully\nExtracting Frames:")
    extract_frames(video_file, out_dir)
    print("All Frames Extracted Successfully")
    print(f"Processed: {title} ({vid_id})")

def main(video_url="https://www.youtube.com/watch?v=zhWDdy_5v2w"):
    """Load the Faster-Whisper model and process a single video.

    Args:
        video_url: URL of the video to process. Defaults to the sample video
            originally hard-coded in this notebook, so calling ``main()``
            without arguments behaves as before.
    """
    print("Downloading Faster-Whisper model...")
    model = WhisperModel("base", compute_type="auto")
    print("Model ready.")
    process_video(video_url, model)
    print("Video processed!")

# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Downloading Faster-Whisper model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

Model ready.
Downloading Video:
[youtube] Extracting URL: https://www.youtube.com/watch?v=zhWDdy_5v2w
[youtube] zhWDdy_5v2w: Downloading webpage
[youtube] zhWDdy_5v2w: Downloading tv client config
[youtube] zhWDdy_5v2w: Downloading player e12fbea4-main
[youtube] zhWDdy_5v2w: Downloading tv player API JSON
[youtube] zhWDdy_5v2w: Downloading ios player API JSON
[youtube] zhWDdy_5v2w: Downloading m3u8 information
[info] Testing format 616
[info] zhWDdy_5v2w: Downloading 1 format(s): 616+251
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 12
[download] Destination: What Happens In One Minute？_[zhWDdy_5v2w].f616.mp4
[download] 100% of    7.94MiB in 00:00:01 at 4.01MiB/s                 
[download] Destination: What Happens In One Minute？_[zhWDdy_5v2w].f251.webm
[download] 100% of 1000.54KiB in 00:00:00 at 3.39MiB/s   
[Merger] Merging formats into "What Happens In One Minute？_[zhWDdy_5v2w].mp4"
Deleting original file What Happens In One Minute？_[zhWDdy_5v2w].f616.mp4 (pas