In [1]:
"""
In this script we want to set up YouTube API service to get video metadata.
Specifically, we want to extract information about the video, such as the title, description, tags, length of video, number 
of views, thumbnail, channel name.

We will save the metadata in a csv file.
"""
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import os
from googleapiclient.discovery import build

# API key
API_KEY = os.environ['YOUTUBE_API_KEY']

# YouTube API service
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Get the YouTube API service
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)

In [2]:
# We want to extract information about the video, such as the title, description, and tags
def get_video_metadata(video_id):
    # Get the video metadata
    video_metadata = (
        youtube.videos()
        .list(id=video_id, part="snippet,contentDetails,statistics")
        .execute()
    )

    # Get the video title and duration
    title = video_metadata["items"][0]["snippet"]["title"]
    duration = video_metadata["items"][0]["contentDetails"]["duration"]

    # Get the video description
    try:
        description = video_metadata["items"][0]["snippet"]["description"]
    except:
        description = ""
        #print("Couldn't get description for video: ", video_id)

    # Get the video title
    try:
        tags = video_metadata["items"][0]["snippet"]["tags"]
    except:
        tags = []
        #print("Couldn't get tags for video: ", video_id)

    # Get likes
    try:
        likes = video_metadata["items"][0]["statistics"]["likeCount"]
    except:
        likes = 0
        #print("Couldn't get likes for video: ", video_id)

    # Get number of views
    try:
        views = video_metadata["items"][0]["statistics"]["viewCount"]
    except:
        views = -1
        #print("Couldn't get views: ", video_id)

    # Convert duration which is in format ISO 8601 to seconds
    # find index of H, M, S in duration if they exist
    h_index = duration.find("H")
    m_index = duration.find("M")
    s_index = duration.find("S")

    seconds = 0
    for idx, loc in enumerate([h_index, m_index, s_index]):
        if loc != -1:  # it exists
            try: # take out 2 characters before and after the H, M, or S if possible (e.g. 01H, 12M, 23S) 
                seconds += int(duration[loc - 2 : loc]) * 60 ** (2 - idx)
            except: # if not possible, take out 1 character before and after the H, M, or S (e.g. 1H, 2M, 3S)
                seconds += int(duration[loc - 1 : loc]) * 60 ** (2 - idx)

    # Return the video metadata
    return {
        "title": title,
        "description": description,
        "tags": tags,
        "duration": seconds,
        "likes": likes,
        "views": views,
    }

#  Find the channel ID from the channel name
def get_channel_id(channel_name):
    # Get the top search result channel for the channel name and return the channel ID
    return (
        youtube.search()
        .list(part="snippet", q=channel_name, type="channel", maxResults=1)
        .execute()["items"][0]["snippet"]["channelId"]
    )

def get_top_comment(video_id):
    # Get the top comment with the most likes for the video
    top_comments = (
        youtube.commentThreads()
        .list(
            part="snippet",
            videoId=video_id,
            order="relevance",
            textFormat="plainText",
            maxResults=20,
        )
        .execute()
    )

    # go through each comment and find the one with the most likes
    top_comment = ""
    top_comment_likes = 0
    for comment in top_comments["items"]:
        if (
            comment["snippet"]["topLevelComment"]["snippet"]["likeCount"]
            > top_comment_likes
        ):
            top_comment = comment["snippet"]["topLevelComment"]["snippet"][
                "textDisplay"
            ]
            top_comment_likes = comment["snippet"]["topLevelComment"]["snippet"][
                "likeCount"
            ]

    return top_comment


def get_video_title(video_id):
    # Get the video metadata including video duration
    video_metadata = (
        youtube.videos()
        .list(id=video_id, part="snippet,contentDetails,statistics")
        .execute()
    )

    # Get the video title
    title = video_metadata["items"][0]["snippet"]["title"]
    return title

def get_channel_videos(channel_id, max_result):
    # Get the channel metadata
    channel_metadata = (
        youtube.channels().list(part="contentDetails", id=channel_id).execute()
    )
    # Get the channel uploads playlist ID
    uploads_playlist_id = channel_metadata["items"][0]["contentDetails"][
        "relatedPlaylists"
    ]["uploads"]
    # Get the channel uploads playlist videos
    videos = []
    next_page_token = None
    while len(videos) < max_result:
        res = (
            youtube.playlistItems()
            .list(
                part="snippet",
                playlistId=uploads_playlist_id,
                maxResults=50,
                pageToken=next_page_token,
            )
            .execute()
        )
        # extract video ids to videos
        videos += [item["snippet"]["resourceId"]["videoId"] for item in res["items"]]
        next_page_token = res.get("nextPageToken")
        if next_page_token is None:
            break
    return videos

In [3]:
channel_names = [
    "Lex Fridman",
    "Huberman",
    "Ali Abdaal",
    "Ben Awad",
    "Tech with Tim",
    "Serrano.Academy",
    "MrBeast",
    "VigorousSteve",
    "Machine Learning Street Talk",
    "Yannic Kilcher",
    "Aladdin Persson",
    "PowerfulJRE",
    "Chris Williamson",
    "IMPAULSIVE",
    "Weights & Biases",
    "Vision & Graphics Seminar at MIT",
    "Nicolai Nielsen - Computer Vision & AI",
    "Other Level’s",
    "William Fiset",
    "What's AI by Louis Bouchard",
    "Daniel Bourke",
    "Ben Felix",
    "Cyrill Stachniss",
    "Andrej Karpathy",
    "NSCA",
    "Nothing",
    "Machine Learning Street Talk",
    "StatQuest with Josh Starmer",
    "Visual Studio Code",
    "3Blue1Brown",
    "Emergent Garden",
]

urls = []
descriptions = []
durations = []
top_comments = []
titles = []
likes = []
views = []
tags = []

# Loop through the channel names, get the channel IDs, and get the videos
for channel_name in channel_names:
    # Get the channel ID
    channel_id = get_channel_id(channel_name)

    # Get the videos
    video_ids = get_channel_videos(channel_id, max_result=500)

    # Loop through the video IDs and get the video metadata
    for video_id in tqdm(video_ids):
        # Get the video metadata
        video_metadata = get_video_metadata(video_id)

        # extract information from video_metadata dictionary
        title = video_metadata["title"]
        description = video_metadata["description"]
        duration = video_metadata["duration"]
        tag = video_metadata["tags"]
        like = video_metadata["likes"]
        views = video_metadata["views"]

        # Get the top comment
        try:
            top_comment = get_top_comment(video_id)

            if len(top_comment) > 70:
                top_comment = ""
        except:
            #print("No comments or error")
            top_comment = ""

        # print all information in nice format, all on one line
        #print(
        #    f"{channel_name} - {title} - {duration} - {top_comment} - {tag} - {like} - {views}"
        #)
        #print("Top comment: ", top_comment)

        # Convert video id to youtube url
        url = "https://www.youtube.com/watch?v=" + video_id
        #print(url)
        #print("\n\n")

        urls.append(url)
        descriptions.append(description)
        durations.append(duration)
        top_comments.append(top_comment)
        titles.append(title)
        likes.append(like)
        tags.append(tag)

100%|██████████| 86/86 [00:20<00:00,  4.14it/s]
100%|██████████| 235/235 [01:01<00:00,  3.84it/s]
100%|██████████| 485/485 [01:43<00:00,  4.70it/s]
100%|██████████| 127/127 [00:49<00:00,  2.59it/s]
100%|██████████| 18/18 [00:04<00:00,  4.03it/s]


In [4]:
# Create a dataframe with urls, descriptions, duratins, title and top comment
df = pd.DataFrame(
    {
        "url": urls,
        "description": descriptions,
        "duration": durations,
        "title": titles,
        "top_comment": top_comments,
        "likes": likes,
        "tags": tags,
    }
)

# Save the dataframe to a CSV file
df.to_csv("youtube_data_v3_extra_1.csv", index=False)

In [5]:
df.head()

Unnamed: 0,url,description,duration,title,top_comment,likes,tags
0,https://www.youtube.com/watch?v=1lkdWduuN14,Dr. Petar Veličković is a Staff Research Scie...,2472,#85 Dr. Petar Veličković (Deepmind) - Categori...,,19,[]
1,https://www.youtube.com/watch?v=5Yd28ssDutA,"In this NeurIPS interview, we speak with Laura...",1668,#84 LAURA RUIS - Large language models are not...,,161,[]
2,https://www.youtube.com/watch?v=46A-BcBbMnA,First in our unplugged series from #NeurIPS202...,1238,#83 Dr. ANDREW LAMPINEN (Deepmind) - Natural L...,Great interview. He's such a grounded and inte...,176,[]
3,https://www.youtube.com/watch?v=LgwjcqhkOA4,AI Helps Ukraine - Charity Conference\nA chari...,4519,"#82 - Dr. JOSCHA BACH - Digital Physics, DL an...",,378,[]
4,https://www.youtube.com/watch?v=iqkkkA9dtY0,Support us!\nhttps://www.patreon.com/mlst \n\n...,4187,"#81 JULIAN TOGELIUS, Prof. KEN STANLEY - AGI,...",,98,[]


In [6]:
df["top_comment"].iloc[2]

"Great interview. He's such a grounded and intelligent guy."

In [7]:
# Count how many top_comment are ""
percentage_top_comment = df["top_comment"].apply(lambda x: len(x) > 2).sum() / df["top_comment"].shape[0]
print(f"{percentage_top_comment*100:.2f}% of the videos has a top comment")

# Count how many has tags (not a empty list)
percentage_has_tags = 1 - df["tags"].apply(lambda x: len(x) > 2).sum() / df["tags"].shape[0]
print(f"{percentage_has_tags*100:.2f}% of the videos has tags")

# Count how many has description (not NaN)
percentage_has_description = 1 - df["description"].isna().sum() / df["description"].shape[0]
print(f"{percentage_has_description*100:.2f}% of the videos has a description")

32.28% of the videos has a top comment
21.14% of the videos has tags
100.00% of the videos has a description
