In [None]:
# For parsing and saving
import json
import datetime as dt
import csv
import sys
import os
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from pprint import pprint

from util.helpers import startWebdriver, get_logging_decorator
from util.log_videos import update_video_log

from util.custom_values import DATA_DIR, CHANNEL_ID, CHROMEDRIVER_PATH, USER_DATA_PATH, CHROME_PROFILE
from util.constants import METRICS, TimePeriod, TRAFFIC_SOURCES_IMP, \
    TRAFFIC_SOURCES, TRAFFIC_SOURCES_INV, Dimensions, ADV_URL

In [None]:
# Calls update_video_log (imported from util.log_videos) before any scraping cell runs.
# NOTE(review): presumably this refreshes the stored video log — confirm in util.log_videos.
update_video_log()

In [None]:
# Overrides the CHROMEDRIVER_PATH imported from util.custom_values for the rest of
# this notebook session. Raw string so the Windows backslashes are taken literally.
CHROMEDRIVER_PATH = r'C:\chromedriver\chromedriver_107'

def startWebdriver() -> webdriver.Chrome:
    """Start a Chrome session driven by the chromedriver at CHROMEDRIVER_PATH.

    The browser is launched with the user-data directory and profile named by
    USER_DATA_PATH / CHROME_PROFILE, maximized, with extensions, GPU and
    info bars disabled, using the Chrome Beta binary configured below.

    Note: this definition replaces the ``startWebdriver`` imported from
    ``util.helpers`` for the remainder of the notebook session.

    Returns:
        A running ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    # Local import: Service is not part of the notebook's import cell.
    from selenium.webdriver.chrome.service import Service

    chrome_options = Options()
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("user-data-dir="+USER_DATA_PATH)
    chrome_options.add_argument("profile-directory="+CHROME_PROFILE)
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("disable-infobars")
    # Raw string: the original non-raw literal relied on "\P", "\G", "\C" and "\A"
    # being invalid escapes, which Python reports with a warning.
    chrome_options.binary_location = r"C:\Program Files\Google\Chrome Beta\Application\chrome.exe"

    # Passing the driver path positionally is deprecated in Selenium 4 and is
    # rejected by newer releases; the Service object is the supported form.
    driver_service = Service(executable_path=CHROMEDRIVER_PATH)
    return webdriver.Chrome(service=driver_service, options=chrome_options)

# Smoke test: open a page to confirm the driver/browser pairing works, then close.
driver = startWebdriver()
try:
    driver.get("https://www.pictureofhotdog.com/")
finally:
    # Close the browser even when navigation fails, so no Chrome process is left behind.
    driver.quit()

In [None]:
# Displays the URL reported by webdriver_manager's driver object as the cell result.
# NOTE(review): presumably the ChromeDriver download URL; this reads the manager's
# `driver` attribute, so confirm it exists in the installed webdriver_manager release.
ChromeDriverManager().driver.get_url()

In [None]:
from util.api_scrape import scrape_channels_info

In [None]:
# Scrape channel info for a list holding CHANNEL_ID five times ([CHANNEL_ID]*5).
# NOTE(review): there is no driver.quit() in this cell — presumably
# scrape_channels_info closes the driver itself; confirm in util.api_scrape.
driver = startWebdriver()
output = scrape_channels_info(driver, [CHANNEL_ID]*5)
# Bare expression: shown as the cell's output.
output

YT ANALYTICS SCRAPE TEST

In [None]:
# Selenium stuff
# For a custom wait
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from util.log_videos import adjust_video_log, get_videos, update_video_log

In [None]:
def scrape(driver) -> list:
    """Read the chart series of the currently loaded YouTube analytics page.

    Waits up to 10 seconds for a ``yta-line-chart-base`` element to be present,
    then pulls the chart data out of the ``fetchedData`` property of the page's
    ``yta-explore-deep-dive`` element via JavaScript.

    Returns:
        A list whose first item is the ``totalsSeries`` object, followed by
        every entry of ``series.values()`` in the order the page yields them.
    """
    chart_selector = 'yta-line-chart-base'

    # Block until the line chart has been attached to the DOM
    waiter = WebDriverWait(driver, 10)
    waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, chart_selector)))

    totals = driver.execute_script("return document.querySelector('#explore-app > yta-explore-deep-dive').fetchedData.data.chartProperties.mainMetricData.data[0].totalsSeries")
    # Array.from(...) turns the values() iterator into an array that Selenium can hand back
    per_series = driver.execute_script("return Array.from( document.querySelector('#explore-app > yta-explore-deep-dive').fetchedData.data.chartProperties.mainMetricData.data[0].series.values() )")

    return [totals, *per_series]

In [None]:
# Scrape data
driver = startWebdriver()
lines = {}
base_url = ADV_URL.format(
    video_id="wRMVMSysvf4",
    time_period=TimePeriod.first_24h.value,
    metric="{metric}",
    dimension="{dimension}"
)
# Get data for totals
metric_key, metric_code = "views", "VIEWS"

try:
    url = base_url.format(metric=metric_code, dimension=Dimensions.traffic_source.value)
    driver.get(url)

    data = scrape(driver)
finally:
    driver.quit()

In [None]:
def _to_point(point):
    """Reduce one raw chart point to its relative-date label, UTC datetime and value.

    ``point['x']`` is divided by 1000 before ``fromtimestamp``, i.e. it is
    treated as a millisecond timestamp.
    """
    return {
        'rel': point['hovercardInfo']['relativeDateFormatted'],
        'x': dt.datetime.fromtimestamp(point['x']/1000, dt.timezone.utc),
        'y': point['y'],
    }

# Series names in the order returned by scrape()
names = [series['name'] for series in data]

# Series name -> list of simplified points
data_dict = {
    series['name']: [_to_point(raw_point) for raw_point in series['data']]
    for series in data
}
pprint(data_dict)

# Series name -> entity title, taken from the first point of each series
category_dict = {
    series['name']: series['data'][0]['hovercardInfo']['entityTitle']
    for series in data
}
pprint(category_dict)

GET NEWEST VIDEO IDs

In [None]:
VIDEOS_URL = "https://studio.youtube.com/channel/{channel_id}/videos/{upload_or_live}"

def scrape_latest_video_ids(driver, upload_or_live="live"):
    """Open the Studio video list for CHANNEL_ID and return the listed video ids.

    Args:
        driver: Selenium driver used to load the page and run the script.
        upload_or_live: Last URL segment, "upload" or "live". Per the original
            author's note, "live" actually finds all video ids, not just lives.

    Returns:
        Whatever the page script yields: the ``videoId`` of every
        ``ytcp-video-list-cell-video`` element found on the page.
    """
    listing_url = VIDEOS_URL.format(upload_or_live=upload_or_live, channel_id=CHANNEL_ID)
    driver.get(listing_url)

    # Give the list up to 10 seconds to render at least one video cell
    waiter = WebDriverWait(driver, 10)
    waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, "ytcp-video-list-cell-video")))

    collect_ids_js = 'return Array.from(document.querySelectorAll("ytcp-video-list-cell-video")).map((e)=>{return e.__data.video.videoId})'
    return driver.execute_script(collect_ids_js)

driver = startWebdriver()
try:
    video_ids = scrape_latest_video_ids(driver)
finally:
    # Close the browser even if the video list never loads (e.g. the wait times out).
    driver.quit()

In [None]:
# Bare expression: shows the ids returned by scrape_latest_video_ids as the cell output.
video_ids

In [None]:
from util.api_scrape import switch_to_iframe, execute
from util.api_scrape_utils import clean_response
import datetime as dt
from typing import List, Dict, Union

API_VIDEOS_URL = "https://developers.google.com/youtube/v3/docs/videos/list?apix=true"

def _parse_video_snippet(json_string):
    """Return ``(title, publish_date)`` extracted from one cleaned API response.

    The response is parsed as JSON first, so escape sequences inside the title
    (``\\"``, ``\\uXXXX`` ...) are decoded. If the text is not valid JSON or lacks
    ``items[0].snippet``, the original text-splitting extraction is used instead.
    The date is truncated to the minute and returned as a naive ``datetime``.
    """
    try:
        snippet = json.loads(json_string)["items"][0]["snippet"]
        title, date_string = snippet["title"], snippet["publishedAt"]
    except (ValueError, LookupError, TypeError):
        # Fallback: take the text after the first '"title": "' up to the end of
        # that line, dropping the trailing '",'.
        title = json_string.split('"title": "')[1].split("\n")[0][:-2]
        date_string = json_string.split('"publishedAt": "')[1]

    # Only the first 16 characters ("YYYY-MM-DDTHH:MM") are parsed.
    return title, dt.datetime.strptime(date_string[:16], "%Y-%m-%dT%H:%M")


def scrape_videos(driver, video_ids: List[str]) -> Dict[str, Dict[str,Union[str,dt.datetime]]]:
    """
    Return dictionary of video upload date and title by video id.

    Loads the API explorer page at API_VIDEOS_URL, requests the ``snippet``
    part for each id in turn and records ``{"title": ..., "date": ...}``.

    The scrape is best-effort: if any step raises an ``Exception``, a warning is
    issued and the entries collected so far are returned. ``KeyboardInterrupt``
    and ``SystemExit`` propagate. The driver is always quit before returning,
    so the caller must not reuse it.

    Args:
        driver: Selenium driver; it is closed by this function.
        video_ids: Ids to look up, processed in order.

    Returns:
        Mapping of video id to a dict with the video's ``title`` (str) and
        ``date`` (naive ``datetime``, minute precision).
    """
    import warnings  # local import: not part of the notebook's import cell

    videos = {}

    try:
        driver.get(API_VIDEOS_URL)

        switch_to_iframe(driver)

        # Enter arguments
        driver.find_element(By.ID, "part[0]").send_keys("snippet") # Enter parts
        driver.find_element(By.CSS_SELECTOR, "label.mat-checkbox-layout").click() # Disable auth

        for vid_id in video_ids:
            # Enter video id (the same input field is reused for every request)
            element = driver.find_element(By.ID, "id[0]")
            element.clear()
            element.send_keys(vid_id)

            # Get result in json string
            json_string = clean_response(execute(driver))

            # Parse the video json into our format
            title, date = _parse_video_snippet(json_string)
            videos[vid_id] = {
                "title": title,
                "date": date,
            }
    except Exception as exc:
        # Previously a `return` inside `finally` silently discarded every error
        # (including KeyboardInterrupt). Keep the partial result, but say why.
        warnings.warn(
            f"scrape_videos stopped after {len(videos)} of {len(video_ids)} videos: {exc!r}",
            stacklevel=2,
        )
    finally:
        driver.quit()

    return videos

In [None]:
# Look up title and upload date for every id collected above.
# No driver.quit() is needed here: scrape_videos quits the driver itself before returning.
driver = startWebdriver()
videos = scrape_videos(driver, video_ids)

In [None]:
# Bare expression: shows the dictionary returned by scrape_videos as the cell output.
videos