In [None]:
# Selenium stuff
# For a custom wait
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# For parsing and saving
import json
import datetime as dt
import csv
import sys
import os
import time
from tqdm import tqdm
import urllib.request
from urllib.error import HTTPError

from pprint import pprint

from util.helpers import startWebdriver, get_logging_decorator

from util.custom_values import DATA_DIR
from util.constants import channel_list_URL, CHANNEL_STATS_URL, CHANNEL_URL, thumbnail_URL, ThumbnailURL, Topic

Scrape list of channels

In [None]:
def scrape_channels(driver) -> list:
    """Collect channel links from the chart table of the page loaded in ``driver``.

    Blocks for up to 10 seconds until an element matching ``table.top-charts``
    is present, then gathers the ``href`` of every anchor found in the cells of
    the first such table and returns every second one, starting with the first.
    """
    table_locator = (By.CSS_SELECTOR, 'table.top-charts')
    table_is_present = EC.presence_of_element_located(table_locator)
    WebDriverWait(driver, 10).until(table_is_present)

    all_hrefs = driver.execute_script("""
        let href_list = []; 
        document.querySelectorAll('table.top-charts')[0].querySelectorAll('tr td a').forEach((e)=>{
            href_list.push(e.href)
        }); 
        return href_list
    """)

    # Keep the even-indexed links only, which skips the other link of each row.
    return all_hrefs[::2]

In [None]:
# JSON file that stores, per topic name, the list of channel names scraped for it.
channel_list_filepath = os.path.join(
    "..",
    "data",
    "channel-list.json",
)

In [None]:
# Scrape data
# Load the previously saved results *before* a browser is started, so that a
# problem with the file cannot leave a browser running. On the very first run
# the file does not exist yet, in which case we start from an empty mapping.
if os.path.isfile(channel_list_filepath):
    with open(channel_list_filepath, "r") as f:
        channel_list_by_category = json.load(f)
else:
    channel_list_by_category = {}

driver = startWebdriver()

try:
    for cat in Topic._member_names_:
        print(cat)

        # Union of the links found on every regional chart of this topic.
        href_set = set()
        for geography in ["global", "united-states", "united-kingdom", "australia", "netherlands"]:
            url = channel_list_URL(Topic[cat], geography)
            driver.get(url)

            href_set.update(scrape_channels(driver))

        # The fourth "/"-separated piece of each link is stored as the channel name.
        channel_list_by_category[cat] = [href.split('/')[3] for href in href_set]

        # Save after every topic so an interrupted run keeps its progress.
        with open(channel_list_filepath, "w") as f:
            json.dump(channel_list_by_category, f)
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()

In [None]:
# Reload the saved channel names and expand each one with the
# CHANNEL_STATS_URL template.
with open(channel_list_filepath, "r") as f:
    channel_list_by_category = json.load(f)

href_list_by_category = {}
for cat, channels in channel_list_by_category.items():
    href_list_by_category[cat] = [CHANNEL_STATS_URL.format(name=name) for name in channels]

# Preview: category name followed by its first ten URLs.
for cat, hrefs in href_list_by_category.items():
    print(cat)
    print(hrefs[:10])

Scrape info about channels

In [None]:
def scrape_channel_info(driver) -> dict:
    """Read the info boxes and the logo address from the page loaded in ``driver``.

    Two scripts are run in the browser. The first maps the inner HTML of each
    ``.box`` element's ``.top-part`` to the inner HTML of its ``.bottom-part``.
    The second returns the ``src`` of the image matched by
    ``.profile-image a img``, which is stored under the ``"logo_url"`` key of
    the returned mapping.
    """
    boxes_script = r"""
        let channel_info = {}; 
        document.querySelectorAll('.box').forEach((e)=>{
            channel_info[e.querySelector('.top-part').innerHTML] = e.querySelector('.bottom-part').innerHTML
        }); 
        return channel_info
    """
    logo_script = """
        return document.querySelector('.profile-image a img').src
    """

    info = driver.execute_script(boxes_script)
    info["logo_url"] = driver.execute_script(logo_script)
    return info

In [None]:
def clean_entry(entry):
    """Return ``entry`` with newlines and spaces removed from both ends."""
    return entry.strip("\n ")

In [None]:
def clean_channel_info(channel_info):
    """Normalise the raw strings scraped for one channel.

    Args:
        channel_info: mapping of string keys to string values. After newlines
            and spaces are stripped from both ends of every key and value, it
            must contain "Category", "Country", "Subscribers", "Video count",
            "Video views" and "logo_url".

    Returns:
        dict with the same six keys: "Category" and "Country" hold one
        "/"-separated piece of the raw value (or 'undefined' when the value
        contains no "/"), the three counters are ints with thousands separators
        removed, and "logo_url" is the stripped input value.

    Raises:
        KeyError: a required key is missing.
        ValueError: a counter is not an integer once commas are removed.
        IndexError: a value contains "/" but fewer pieces than expected.
    """
    stripped = {clean_entry(key): clean_entry(value) for key, value in channel_info.items()}

    def piece_or_undefined(text, index):
        # Values without any "/" are reported as 'undefined'.
        return text.split("/")[index] if "/" in text else 'undefined'

    def to_int(text):
        # Counters are written with "," as thousands separator.
        return int(text.replace(',', ''))

    return {
        "Category":    piece_or_undefined(stripped["Category"], 2),
        "Country":     piece_or_undefined(stripped["Country"], 1),
        "Subscribers": to_int(stripped["Subscribers"]),
        "Video count": to_int(stripped["Video count"]),
        "Video views": to_int(stripped["Video views"]),
        "logo_url":    stripped["logo_url"],
    }

In [None]:
driver = startWebdriver()

try:
    for cat in Topic._member_names_:
        print(cat)

        filepath = os.path.join("..", "data", f"channels-info_{cat}.json")

        # When the file does not exist yet, seed it with an empty JSON object
        # so that it can be loaded unconditionally right after.
        file_missing = not os.path.isfile(filepath)
        if file_missing:
            with open(filepath, "w") as f:
                f.write(r"{}")

        with open(filepath, "r") as f:
            channels_info = json.load(f)

        for url in tqdm(href_list_by_category[cat]):
            channel = url.split('/')[3]
            already_scraped = channel in channels_info
            if already_scraped:
                continue

            driver.get(url)
            time.sleep(.2)

            try:
                channel_info = scrape_channel_info(driver)
                channels_info[channel] = clean_channel_info(channel_info)
            except Exception as e:
                # Best effort: report the channel and move on to the next one.
                print(f"failed to scrape {channel}:\n{e}")
            else:
                # Rewrite the file after every successfully scraped channel.
                with open(filepath, "w") as f:
                    json.dump(channels_info, f)
finally:
    # Close the browser whether or not the loop completed.
    driver.quit()

Scrape info about videos

In [None]:
# Collect the saved channel info of every topic into one mapping keyed by
# topic name.
channels_info_by_category = {}
for cat in Topic._member_names_:
    channels_info_path = os.path.join("..", "data", f"channels-info_{cat}.json")
    with open(channels_info_path, "r") as f:
        channels_info_by_category[cat] = json.load(f)

# Show the loaded mapping as the cell output.
channels_info_by_category

In [None]:
def _parse_view_count(text):
    """Convert a view-count label such as "1,234 views" to an int.

    Handles the plural and singular suffix ("12 views", "1 view") and the
    "No views" label, which is mapped to 0. Any other non-numeric label raises
    ValueError, as a label without the expected shape did before.
    """
    label = text.strip()
    if label.lower().startswith("no "):
        return 0
    for suffix in (" views", " view"):
        if label.endswith(suffix):
            label = label[:-len(suffix)]
            break
    return int(label.replace(',', ''))


def clean_video_info(video_info):
    """Reduce one raw video record to the fields kept for analysis.

    Args:
        video_info: nested mapping that provides
            ``["publishedTimeText"]["simpleText"]``,
            ``["title"]["runs"][0]["text"]``, ``["videoId"]`` and
            ``["viewCountText"]["simpleText"]``.

    Returns:
        dict with the keys "published", "title", "id" (taken over unchanged)
        and "views" (the view-count label parsed to an int).

    Raises:
        KeyError / IndexError: an expected field is missing.
        ValueError: the view-count label is not numeric.
    """
    return {
        "published": video_info["publishedTimeText"]["simpleText"],
        "title":     video_info["title"]["runs"][0]["text"],
        "id":        video_info["videoId"],
        "views":     _parse_view_count(video_info["viewCountText"]["simpleText"]),
    }

In [None]:
def scrape_video_info(driver, limit=500):
    """Scroll the video grid of the loaded page and return the raw video data.

    The grid is scrolled repeatedly: each round waits for the next video
    element to be present, scrolls the last ``#dismissible`` element into view
    and counts the ``#dismissible`` elements. Scrolling stops once at least
    ``limit`` elements are counted, once a round adds fewer than 30 elements,
    or once no further element appears within the wait timeout.

    Args:
        driver: Selenium webdriver with the page already loaded.
        limit: number of loaded videos after which scrolling stops. The result
            is not truncated and may contain more entries than ``limit``.

    Returns:
        list with the ``__dataHost.data`` object of every ``#dismissible``
        element present when scrolling stopped.

    Raises:
        TimeoutException: not a single video element appeared on the page.
    """
    page_size = 30  # a round that adds fewer elements than this ends the scrolling
    n = 0
    delta = page_size
    # Continue while the previous round added a full page and fewer than
    # `limit` videos have been counted so far.
    while delta >= page_size and n < limit:
        element_css = f'ytd-grid-video-renderer:nth-child({n+1})'

        # Wait up to 10 seconds for the next element to show up
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, element_css))
            )
        except TimeoutException:
            if n == 0:
                # Nothing was rendered at all: report it to the caller.
                raise
            # Videos were loaded earlier but no further one arrived: stop
            # scrolling and return what is there.
            break

        n_new = driver.execute_script("""
            var l = document.querySelectorAll('#dismissible'); 
                    document.querySelectorAll('#dismissible')[l.length-1].scrollIntoView(); 
            return  document.querySelectorAll('#dismissible').length
        """)
        delta = n_new-n
        n = n_new

    return driver.execute_script("""
        let data = []; 
        let l = document.querySelectorAll("#dismissible");
        l.forEach((e)=>{data.push(e.__dataHost.data)});
        return data
    """)

In [None]:
# Rank the channels of every topic by their "Subscribers" value, largest first.
sorted_channels_by_cat = {}
for cat in Topic._member_names_:
    ranked = sorted(
        channels_info_by_category[cat].items(),
        key=lambda item: item[1]["Subscribers"],
        reverse=True,
    )
    sorted_channels_by_cat[cat] = [name for name, _ in ranked]

# Round-robin over the topics: each round takes the next `group_size` channels
# of every topic, until a complete round adds nothing.
interleaved_channels = []
group_size = 1
counter = 0
delta = 1
while delta > 0:
    len_before = len(interleaved_channels)
    for cat in Topic._member_names_:
        channel_list = sorted_channels_by_cat[cat]
        # A slice that starts past the end is empty, so exhausted topics
        # contribute nothing in this round.
        for name in channel_list[counter:counter + group_size]:
            interleaved_channels.append((cat, name))
    delta = len(interleaved_channels) - len_before
    counter += group_size

# Show the resulting (topic, channel) order as the cell output.
interleaved_channels

In [None]:
# Load the cached state *before* a browser is started, so that a problem with
# one of the files cannot leave a browser running. Files that do not exist yet
# (first run) are treated as empty.
name2id_filepath = os.path.join("..", "data", "channel_name2id.json")
if os.path.isfile(name2id_filepath):
    with open(name2id_filepath, "r") as f:
        name2id = json.load(f)
else:
    name2id = {}

videos_info_by_category = {}
for cat in Topic._member_names_:
    videos_info_filepath = os.path.join("..", "data", f"videos-info_{cat}.json")
    if os.path.isfile(videos_info_filepath):
        with open(videos_info_filepath, "r") as f:
            # Assign rather than update: the per-topic entry does not exist yet.
            videos_info_by_category[cat] = json.load(f)
    else:
        videos_info_by_category[cat] = {}

driver = startWebdriver(headless=False)

try:
    for cat,channel in tqdm(interleaved_channels):
        videos_info = videos_info_by_category[cat]

        if channel in name2id:
            yt_url = f"https://www.youtube.com/channel/{name2id[channel]}"
        else:
            # Unknown channel: open the CHANNEL_URL page and record the fifth
            # "/"-separated piece of the address the browser ends up on.
            relay_url = CHANNEL_URL.format(name=channel)

            driver.get(relay_url)
            name2id[channel] = driver.current_url.split("/")[4]
            yt_url = driver.current_url

        # Videos already scraped in an earlier run: nothing left to do.
        if channel in videos_info:
            continue
        driver.get(yt_url+"/videos")

        try:
            raw_videos_info = scrape_video_info(driver, 30)
            clean_videos_info = [clean_video_info(vid_info) for vid_info in raw_videos_info]
            videos_info_by_category[cat][channel] = clean_videos_info
        except Exception as e:
            # Best effort: report the channel and move on to the next one.
            print(f"failed to scrape {channel}:\n{e}")
            continue

finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()

In [None]:
# Persist the channel name -> id mapping gathered so far.
with open(name2id_filepath, "w") as f:
    json.dump(name2id, f)

for cat in Topic._member_names_:
    # Only channels with at least 30 scraped videos are written out.
    videos_info = {}
    for saved_channel, saved_videos in videos_info_by_category[cat].items():
        if len(saved_videos) >= 30:
            videos_info[saved_channel] = saved_videos

    videos_info_path = os.path.join("..", "data", f"videos-info_{cat}.json")
    with open(videos_info_path, "w") as f:
        json.dump(videos_info, f)

Fetch thumbnails

In [None]:
# Reload the saved video info of every topic into one mapping keyed by topic
# name.
videos_info_by_category = {}
for cat in Topic._member_names_:
    videos_info_path = os.path.join("..", "data", f"videos-info_{cat}.json")
    with open(videos_info_path, "r") as f:
        videos_info_by_category[cat] = json.load(f)

# Show the loaded mapping as the cell output.
videos_info_by_category

In [None]:
# Download one thumbnail per video into ../data/thumbnails, skipping files that
# are already present so the cell can be re-run after an interruption.
thumbnail_dir = os.path.join("..", "data", "thumbnails")
os.makedirs(thumbnail_dir, exist_ok=True)  # urlretrieve does not create directories

quality = ThumbnailURL.high

for cat in Topic._member_names_:
    print(cat)

    # Look the videos up under the current topic instead of relying on a
    # `videos_info` variable left behind by an earlier cell.
    videos_by_channel = videos_info_by_category[cat]
    for channel_name in tqdm(videos_by_channel):
        for video_info in videos_by_channel[channel_name]:
            video_id = video_info["id"]

            filepath = os.path.join(thumbnail_dir, f"{video_id}_{quality.name}.jpg")
            if os.path.isfile(filepath):
                continue

            url = thumbnail_URL(video_id, quality)
            try:
                urllib.request.urlretrieve(url, filepath)
            except HTTPError:
                # The server answered with an error status: report and go on.
                print(f"couldn't fetch {video_id} by {channel_name}")