# Task
Repeat what we did for the MrBeast videos, but for each of the top 10 YouTube channels.

In [32]:
# Load the lab_black IPython extension (presumably auto-formats cells with Black; confirm).
%load_ext lab_black

In [33]:
from googleapiclient.discovery import build
import pandas as pd
import os
from tqdm import tqdm
import seaborn as sns
import matplotlib.pylab as plt
import sys
import plotly_express as px
import urllib.request

sys.path.append("../src/")
from yt_pullers import get_search_results, get_video_stats

In [34]:
# Most-viewed ranking; loaded here but not used by the rest of this cell.
# [0] takes the first table pandas finds on the page.
top_channels = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_channels"
)[0]
# Most-subscribed ranking; this is the table the channel list is built from.
top_channels_v2 = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_most-subscribed_YouTube_channels"
)[0]
# Keep channels whose Country contains "united" (case-insensitive).
# na=False counts a missing Country as "no match"; without it the mask would
# contain NaN and the boolean .loc lookup below would raise.
united_mask = top_channels_v2["Country"].str.lower().str.contains("united", na=False)
top_channel_list = top_channels_v2.loc[united_mask, "Channel"].values

In [37]:
# Preview the rows whose Country contains "united"; na=False keeps a missing
# Country from putting NaN into the boolean mask (which .loc rejects).
top_channels_v2.loc[
    top_channels_v2["Country"].str.lower().str.contains("united", na=False)
]

Unnamed: 0,Rank,Channel,Link,Subscribers(millions),Primary language(s),Contentcategory,Country,Brand Channel
1,2,Cocomelon - Nursery Rhymes,Channel-Link,124.0,English,Education,United States,
5,6,MrBeast,Channel-Link,85.3,English[14][15],Entertainment,United States,—
6,7,WWE,Channel-Link,84.1,English[16][17],Sports,United States,
20,21,Dude Perfect,Channel-Link,56.9,English,Sports,United States,—
21,22,Movieclips,Channel-Link,54.2,English[24],Film,United States,
22,23,Marshmello,Channel-Link,53.9,English[24],Music,United States,—
25,26,Ariana Grande,Channel-Link,49.7,English[26],Music,United States,—
26,27,EminemMusic,Channel-Link,49.0,English[26],Music,United States,—
27,28,Ed Sheeran,Channel-Link,48.9,English[26],Music,United Kingdom,—
32,33,LooLoo Kids - Nursery Rhymes and Children's Songs,Channel-Link,47.2,English,Music,United States,


# Step 2 - Loop Through Channels

In [38]:
# Read the API key from a local file so the key itself never appears in the notebook.
with open("../apikey/youtube.key") as f:
    # strip(): a trailing newline or stray whitespace in the key file would
    # otherwise be passed along as part of the developer key.
    youTubeApiKey = f.read().strip()
# Client object used by every API call below.
youtube = build("youtube", "v3", developerKey=youTubeApiKey)

In [42]:
def get_channel_id(channel_name, youtube):
    """Return the channel ID of the top channel hit for ``channel_name``.

    Parameters
    ----------
    channel_name : str
        Free-text search query, e.g. a channel's display name.
    youtube :
        Client object exposing ``search().list(...).execute()``.

    Returns
    -------
    str
        The ``snippet.channelId`` field of the first returned item.

    Raises
    ------
    IndexError
        If the search returns no items for ``channel_name``.
    """
    res = (
        youtube.search()
        # type="channel" restricts hits to channels; an unrestricted search can
        # return a video as the top hit, whose channelId belongs to whoever
        # uploaded that video rather than the channel being looked up.
        .list(q=channel_name, part="id,snippet", type="channel", maxResults=1)
        .execute()
    )
    items = res.get("items") or []
    if not items:
        # Same exception type an empty result produced before, now with context.
        raise IndexError(f"No YouTube channel found for {channel_name!r}")
    return items[0]["snippet"]["channelId"]

In [79]:
# Columns kept in the per-channel output, in output order. The three
# "statistics.*" entries are renamed to their short form by format_stats.
KEEP_COLS = [
    "id",
    "title",
    "description",
    "publishTime",
    "kind_stats",
    "duration_seconds",
    "statistics.viewCount",
    "statistics.likeCount",
    "statistics.commentCount",
    "thumbnails.default.url",
    "thumbnails.default.width",
    "thumbnails.default.height",
    "thumbnails.medium.url",
    "thumbnails.medium.width",
    "thumbnails.medium.height",
    "thumbnails.high.url",
    "thumbnails.high.width",
    "thumbnails.high.height",
    "contentDetails.duration",
    "contentDetails.dimension",
    "topicDetails.topicCategories",
    "snippet.defaultLanguage",
    "localizations.en.title",
    "localizations.en.description",
    "snippet.tags",
]


def format_stats(df_stats):
    """Build the tidy per-video table from the raw stats frame.

    Parameters
    ----------
    df_stats : pandas.DataFrame
        Raw stats frame. Must contain ``contentDetails.duration`` holding
        values accepted by ``pd.Timedelta`` (e.g. ``"PT4M13S"``) or missing
        values. Any other ``KEEP_COLS`` column that is absent comes out as
        all-NaN instead of raising. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``KEEP_COLS`` columns in order, where
        ``statistics.viewCount`` / ``likeCount`` / ``commentCount`` are renamed
        to ``viewCount`` / ``likeCount`` / ``commentCount`` and made numeric,
        ``publishTime`` is parsed to datetimes, and ``duration_seconds`` is
        the whole-second duration as an integer (0 when the duration is missing).

    Raises
    ------
    KeyError
        If ``contentDetails.duration`` is not a column of ``df_stats``.
    """
    # na_action="ignore" leaves missing durations untouched; to_timedelta then
    # guarantees a timedelta dtype (missing -> NaT) even if every value is missing.
    durations = pd.to_timedelta(
        df_stats["contentDetails.duration"].map(pd.Timedelta, na_action="ignore")
    )
    # total_seconds() gives the same result across pandas versions, unlike
    # casting to "timedelta64[s]" and then to int.
    duration_seconds = durations.dt.total_seconds().fillna(0).astype("int")

    # reindex returns a new frame and fills absent optional columns with NaN.
    df_final = df_stats.reindex(columns=KEEP_COLS).rename(
        columns={
            "statistics.viewCount": "viewCount",
            "statistics.likeCount": "likeCount",
            "statistics.commentCount": "commentCount",
        }
    )
    df_final["duration_seconds"] = duration_seconds
    df_final["publishTime"] = pd.to_datetime(df_final["publishTime"])

    for count_col in ("likeCount", "viewCount", "commentCount"):
        df_final[count_col] = pd.to_numeric(df_final[count_col])
    return df_final

In [80]:
def pull_thumbnails(df_final, channelTitle):
    """Download the ``thumbnails.high.url`` image of every video to disk.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Must contain the columns ``id`` and ``thumbnails.high.url``. Rows with
        a missing URL are skipped.
    channelTitle : str
        Folder name under ``../out``; files are written to
        ``../out/<channelTitle>/thumbnails/<id>.jpg``.

    Returns
    -------
    None
    """
    thumb_dir = f"../out/{channelTitle}/thumbnails"
    # makedirs (not mkdir) so this also works when ../out/<channelTitle> has
    # not been created yet; exist_ok makes repeated calls safe.
    os.makedirs(thumb_dir, exist_ok=True)

    with_url = df_final.dropna(subset=["thumbnails.high.url"])
    for video_id, url in zip(with_url["id"], with_url["thumbnails.high.url"]):
        # An existing file of the same name is overwritten.
        urllib.request.urlretrieve(url, f"{thumb_dir}/{video_id}.jpg")


def pull_all_video_info(channel_name, youtube):
    """Pull, format and save the video stats and thumbnails for one channel.

    Steps: resolve ``channel_name`` to a channel ID, fetch the channel's search
    results and per-video stats through the ``yt_pullers`` helpers, format them
    with ``format_stats``, write the result to
    ``../out/<title>/<title>_youtube_stats.csv`` and download the thumbnails.
    ``<title>`` is the ``channelTitle`` of the first search result with spaces
    replaced by underscores.

    Parameters
    ----------
    channel_name : str
        Name used to look the channel up.
    youtube :
        YouTube API client passed through to every helper.

    Returns
    -------
    pandas.DataFrame
        The formatted stats frame that was written to CSV.
    """
    channelId = get_channel_id(channel_name, youtube)
    # Empty query + channel_id: presumably lists every video of the channel
    # (behaviour of get_search_results is not visible here; confirm).
    df = get_search_results(query="", channel_id=channelId, youtube=youtube)
    channelTitle = df["channelTitle"].values[0].replace(" ", "_")
    df_stats = get_video_stats(df, youtube)
    df_final = format_stats(df_stats)

    out_dir = f"../out/{channelTitle}"
    # makedirs so a missing ../out folder is created as well.
    os.makedirs(out_dir, exist_ok=True)

    df_final.to_csv(f"{out_dir}/{channelTitle}_youtube_stats.csv", index=False)
    pull_thumbnails(df_final, channelTitle)
    # Returned so callers that assign the result get the frame instead of None.
    return df_final

In [81]:
# Trial run on a single channel before looping over the rest.
df = pull_all_video_info(channel_name="Dude Perfect", youtube=youtube)

 20%|██        | 4/20 [00:01<00:04,  3.71it/s]


In [85]:
# Channels still to pull: everything in the list except the two named here.
remaining = [
    name for name in top_channel_list if name not in ("Dude Perfect", "MrBeast")
]

In [None]:
# Pull stats and thumbnails for each remaining channel, one at a time.
for channel_name in remaining:
    pull_all_video_info(channel_name, youtube)

 20%|██        | 4/20 [00:01<00:04,  3.23it/s]
 55%|█████▌    | 11/20 [00:03<00:03,  2.84it/s]
