In [2]:
!pip install isodate
!pip install easyocr
!pip install youtube-transcript-api

Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate
Successfully installed isodate-0.6.1
Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import re
import isodate
import cv2
import easyocr
import numpy as np
import matplotlib.pyplot as plt
import requests
import os

# Minimum OCR confidence a detected text region must reach to be kept by text_recognition().
CONFIDENCE_THRESHOLD = 0.5
# Passed as `maxResults` to the YouTube search in get_video_ids(): videos fetched per channel.
MAX_RESULTS = 2
def get_channel_id_from_url(youtube, url):
    """Resolve a YouTube channel URL to its channel ID.

    Supported URL shapes are ``youtube.com/@handle``, ``/user/<name>``,
    ``/c/<name>`` and ``/channel/<id>``.

    Resolution order:
      1. ``/channel/<id>`` URLs already contain the channel ID, which is returned as-is.
      2. Otherwise the identifier is looked up as a handle (``channels().list(forHandle=...)``).
      3. If that yields nothing (or fails), the first hit of a channel search is used.

    Args:
        youtube: YouTube Data API client (as built by ``googleapiclient.discovery.build``).
        url: Channel URL.

    Returns:
        The channel ID as a string.

    Raises:
        ValueError: If the URL is not a recognised channel URL, or no channel is found.
    """
    match = re.search(
        r'(?:https?://)?(?:www\.)?(?:youtube\.com/(user/|channel/|c/|@))([a-zA-Z0-9_-]+)',
        url,
    )
    if not match:
        raise ValueError("Invalid YouTube URL")
    prefix, identifier = match.groups()

    # A /channel/ URL carries the ID itself; no API round-trip (or fuzzy search) is needed.
    if prefix == 'channel/':
        return identifier

    try:
        response = youtube.channels().list(
            part='id',
            forHandle=identifier
        ).execute()
        items = response.get('items') or []
        if items:
            return items[0]['id']
    except Exception as exc:
        # Best-effort lookup: an API error or a client that does not know `forHandle`
        # should not abort resolution, but the failure is reported instead of being
        # silently swallowed. KeyboardInterrupt/SystemExit are deliberately not caught.
        print(f"Handle lookup failed for '{identifier}' ({exc}); falling back to search")

    response = youtube.search().list(
        part='snippet',
        q=identifier,
        type='channel'
    ).execute()
    items = response.get('items') or []
    if items:
        return items[0]['snippet']['channelId']

    raise ValueError("Channel not found")

def get_video_ids(youtube, channel_id):
    """Search a channel for medium-duration videos, ordered by date, and return their IDs.

    Args:
        youtube: YouTube Data API client.
        channel_id: ID of the channel to search.

    Returns:
        List of video ID strings, at most ``MAX_RESULTS`` long.
    """
    search_response = youtube.search().list(
        part='id',
        channelId=channel_id,
        maxResults=MAX_RESULTS,         # cap on the number of videos returned per channel
        order='date',                   # results sorted by date
        type='video',
        videoDuration='medium',         # medium-length videos only
    ).execute()
    return [entry['id']['videoId'] for entry in search_response['items']]

def get_statistics(youtube, video_ids, min_views=10000):
    """Fetch statistics and metadata for the given videos, keeping only popular ones.

    Args:
        youtube: YouTube Data API client.
        video_ids: List of video ID strings. An empty list returns ``[]`` without
            calling the API.
        min_views: A video is kept only if its view count is strictly greater than
            this value.

    Returns:
        A list of dicts, one per kept video, with keys ``channelId``, ``channelTitle``,
        ``videoId``, ``viewCount``, ``likeCount``, ``favoriteCount``, ``commentCount``,
        ``durationInSeconds``, ``publishedAt``, ``title`` and ``description``.
        Count values are kept exactly as returned by the API; ``likeCount``,
        ``favoriteCount`` and ``commentCount`` are ``None`` when the API omits them.
        ``dislikeCount`` is added only if the API response contains it.
    """
    if not video_ids:
        return []

    # Fetch video data
    stats_request = youtube.videos().list(
        part=['statistics', 'snippet', 'contentDetails'],
        id=','.join(video_ids),
    )
    stats_response = stats_request.execute()

    videos = []
    for item in stats_response.get('items', []):
        statistics = item['statistics']
        snippet = item['snippet']

        # A missing view count is treated as 0, i.e. the video is filtered out.
        if int(statistics.get('viewCount', 0)) <= min_views:
            continue

        video = {
            'channelId': snippet['channelId'],
            'channelTitle': snippet['channelTitle'],
            'videoId': item['id'],
            'viewCount': statistics['viewCount'],
            # These counters can be absent from the response, so .get() avoids a KeyError.
            'likeCount': statistics.get('likeCount'),
            "favoriteCount": statistics.get('favoriteCount'),
            'commentCount': statistics.get('commentCount'),
            'durationInSeconds': isodate.parse_duration(item['contentDetails']['duration']).total_seconds(),
            'publishedAt': snippet['publishedAt'],
            'title': snippet['title'],
            'description': snippet['description'],
        }
        # Attach the optional counter to the record being built (not to an unrelated global).
        if 'dislikeCount' in statistics:
            video['dislikeCount'] = statistics['dislikeCount']
        videos.append(video)
    return videos

def download_thumbnail(youtube, video_title, video_id, save_folder):
    """Download a video's thumbnail and save it as ``<save_folder>/<video_title>.jpg``.

    The ``high`` thumbnail is preferred; if the API response does not contain it,
    the first available of ``medium``, ``default``, ``standard``, ``maxres`` is used.

    Args:
        youtube: YouTube Data API client.
        video_title: Base name for the saved file. Path separators (``/`` and ``\\``)
            are replaced with ``_`` so the file always lands inside ``save_folder``.
        video_id: ID of the video whose thumbnail is fetched.
        save_folder: Existing directory to write the image into.

    Returns:
        Path of the saved image, or ``None`` if the thumbnail could not be downloaded
        (a message is printed in that case).
    """
    request = youtube.videos().list(
        part='snippet',
        id=video_id
    )
    response = request.execute()

    items = response.get('items') or []
    if not items:
        # Unknown / private / deleted video: the API returns no item for it.
        print(f"Failed to download thumbnail for video ID: {video_id}")
        return None

    thumbnails = items[0]['snippet'].get('thumbnails', {})
    thumbnail_url = None
    for size in ('high', 'medium', 'default', 'standard', 'maxres'):
        if size in thumbnails:
            thumbnail_url = thumbnails[size]['url']
            break
    if thumbnail_url is None:
        print(f"Failed to download thumbnail for video ID: {video_id}")
        return None

    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    image_response = requests.get(thumbnail_url, timeout=10)
    if image_response.status_code != 200:
        print(f"Failed to download thumbnail for video ID: {video_id}")
        return None

    # Titles are untrusted text: strip path separators so they cannot point outside save_folder.
    safe_title = video_title.replace('/', '_').replace('\\', '_')
    image_path = os.path.join(save_folder, f'{safe_title}.jpg')
    with open(image_path, 'wb') as f:
        f.write(image_response.content)
    return image_path

def face_recognition(save_folder, image_file):
    """Detect frontal faces in an image with a Haar cascade and display the result.

    Args:
        save_folder: Directory containing ``haarcascade_frontalface_default.xml``.
        image_file: Path of the image to analyse.

    Returns:
        None. The image, with a rectangle drawn around each detected face, is shown
        with matplotlib. If the cascade or the image cannot be loaded, an error is
        printed and nothing is displayed.
    """
    # Haar Cascade
    cascade_path = os.path.join(save_folder, "haarcascade_frontalface_default.xml")
    face_cascade = cv2.CascadeClassifier(cascade_path)
    # OpenCV does not raise when the XML is missing or invalid; it yields an empty
    # classifier that only fails later inside detectMultiScale. Check up front.
    if face_cascade.empty():
        print(f"Error: Could not load Haar cascade from {cascade_path}")
        return

    # Read image
    image = cv2.imread(image_file)
    if image is None:
        print(f"Error: Could not load image from {image_file}")  # Check if image loaded successfully
        return

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Detect face
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

    # Draw a rectangle around each detected face
    for (x, y, w, h) in faces:
        cv2.rectangle(image, (x, y), (x+w, y+h), (255, 0, 0), 2)

    # Show the result (matplotlib expects RGB, OpenCV loads BGR)
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

# easyocr.Reader instances keyed by language code. Constructing a Reader is expensive
# (it loads the OCR models), so each language's reader is built once and reused
# across images instead of being rebuilt on every call.
_OCR_READERS = {}


def _get_ocr_reader(lang):
    """Return the cached easyocr.Reader for `lang`, creating it on first use."""
    if lang not in _OCR_READERS:
        _OCR_READERS[lang] = easyocr.Reader([lang])
    return _OCR_READERS[lang]


def text_recognition(image_file, lang):
    """Run OCR on an image, display the detections and return the recognised texts.

    The image is resized to a width of 400 px (aspect ratio preserved) before OCR.
    Detections whose confidence is at least ``CONFIDENCE_THRESHOLD`` are printed,
    outlined on the image and collected; the annotated image is shown with matplotlib.

    Args:
        image_file: Path of the image to read.
        lang: Language code passed to ``easyocr.Reader`` (e.g. ``"en"``).

    Returns:
        List of recognised text strings that passed the confidence threshold.
        An empty list is returned (and an error printed) if the image cannot be loaded.
    """
    # Read image; cv2.imread returns None instead of raising when the file is missing/unreadable.
    image = cv2.imread(image_file)
    if image is None:
        print(f"Error: Could not load image from {image_file}")
        return []

    new_width = 400
    scale_ratio = new_width / image.shape[1]
    new_height = int(image.shape[0] * scale_ratio)
    image = cv2.resize(image, (new_width, new_height))

    results = _get_ocr_reader(lang).readtext(image)
    texts = []
    for (bbox, text, prob) in results:
        if prob >= CONFIDENCE_THRESHOLD:
            # bbox holds the four corners; the rectangle only needs two opposite ones.
            (top_left, top_right, bottom_right, bottom_left) = bbox
            top_left = (int(top_left[0]), int(top_left[1]))
            bottom_right = (int(bottom_right[0]), int(bottom_right[1]))

            print(f"Detected Text: {text} (Confidence: {prob:.2f})")
            cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

            texts.append(text)

    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

    return texts

In [5]:
from google.colab import userdata
# Read the YouTube Data API key from Colab's `userdata` store so the key is not hard-coded here.
YOUTUBE_API_KEY = userdata.get("YOUTUBE_API_KEY")

In [9]:
from youtube_transcript_api import YouTubeTranscriptApi
import googleapiclient.discovery
import os
import pandas as pd

# Channels whose recent videos are analysed.
CHANNEL_URLS = [
    "https://www.youtube.com/@Lionfield",
    "https://www.youtube.com/@MrBeast",
    "https://www.youtube.com/@tiger_in_translation"
]
api_service_name = "youtube"
api_version      = "v3"

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=YOUTUBE_API_KEY)

# Collect statistics for the selected videos of every channel.
video_data = []
for url in CHANNEL_URLS:
    CHANNEL_ID = get_channel_id_from_url(youtube, url)
    print(CHANNEL_ID)
    video_ids = get_video_ids(youtube, CHANNEL_ID)
    video_data += get_statistics(youtube, video_ids)

save_folder = "./"
THUMBNAILS_FOLDER = save_folder + 'thumbnails/'
# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(THUMBNAILS_FOLDER, exist_ok=True)

for data in video_data:
    # Name thumbnail files by video ID rather than by title: titles are free text and
    # may contain '/' or other characters that are invalid in file names, whereas
    # video IDs are file-system safe and unique.
    thumbnail_name = data['videoId']
    thumbnail_path = os.path.join(THUMBNAILS_FOLDER, thumbnail_name + '.jpg')
    download_thumbnail(youtube, thumbnail_name, data['videoId'], THUMBNAILS_FOLDER)

    # `data` is the very dict stored in video_data, so it can be updated directly
    # (no need to look its position up again). Skip OCR if the download failed.
    if os.path.exists(thumbnail_path):
        data['thumbnailText'] = text_recognition(thumbnail_path, "en")
    else:
        data['thumbnailText'] = []

print(video_data)


Output hidden; open in https://colab.research.google.com to view.