In [2]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium_stealth import stealth

import time
import random
import string
import json
import pafy

import pandas as pd
import numpy as np

# Hide video unavailable warnings
import warnings
warnings.filterwarnings("ignore")

# Data Inladen

In [3]:
# Bot accounts, one row per user. run_experiment reads each row positionally:
# the first CSV column is used as the e-mail address, the second as the password.
mails = pd.read_csv("../data/mails.csv")

In [4]:
# Pool of candidate videos. vid_to_watch needs a `video_id` column and a
# `conspiracy` column that can be used (and negated) as a boolean mask.
videos = pd.read_csv("../data/uploads.csv")

# Helpers

In [5]:
def vid_to_watch(videos, usertype = 1, max_length = 7200):
    """Pick a random video from the dataset and decide how long to watch it.

    User type 1 draws from the non-conspiracy videos, every other user type
    draws from the conspiracy videos. A candidate that pafy cannot load, or
    that is longer than `max_length` seconds, is rejected and a new one is
    drawn from the *same* pool. The function keeps drawing until it finds an
    acceptable video.

    Parameters
    ----------
    videos : pandas.DataFrame
        Needs a `video_id` column and a boolean `conspiracy` column.
    usertype : int
        1 selects non-conspiracy videos, anything else conspiracy videos.
    max_length : int
        Longest acceptable video length in seconds (default 7200 = 2 hours).

    Returns
    -------
    tuple
        `(watch_time, to_watch)`: the number of seconds to watch and the id
        of the chosen video.
    """
    is_conspiracy = videos["conspiracy"]

    # Fix the pool once so that retries respect the user type as well
    pool = videos[~is_conspiracy] if usertype == 1 else videos[is_conspiracy]
    candidates = pool["video_id"].to_numpy()

    while True:
        to_watch = np.random.choice(candidates)
        url = f"http://www.youtube.com/watch?v={to_watch}"

        try:
            vid_len = pafy.new(url).length
        except Exception:
            # Video no longer accessible: draw another one
            continue

        if vid_len <= max_length:
            break

    # Fraction of the video that will be watched, limited to 0% - 100%
    percentage = min(max(np.random.normal(0.55, 0.25), 0), 1)

    watch_time = percentage * vid_len

    return watch_time, to_watch

In [6]:
def init_driver():
    """Create a Chrome webdriver and apply selenium-stealth to it.

    Returns the ready-to-use driver.
    """
    chrome_options = webdriver.ChromeOptions()

    # Command-line switches, added in this order
    switches = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features",
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "start-maximized",
        # "--headless",
    )
    for switch in switches:
        chrome_options.add_argument(switch)

    # Experimental options, added in this order
    experimental = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
    )
    for option_name, option_value in experimental:
        chrome_options.add_experimental_option(option_name, option_value)

    browser = webdriver.Chrome(options=chrome_options)

    # Hide the fact we're using a bot
    stealth_settings = dict(
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    stealth(browser, **stealth_settings)

    return browser

In [7]:
def login_google(driver, mail, password):
    """Log in to Google with the given e-mail address and password.

    This code only works if the email-address was made within chromedriver
    itself.

    Elements are located through `find_element(by, value)`, which exists in
    both old and current Selenium releases (the `find_element_by_*` helpers
    were removed in Selenium 4.3).

    Parameters
    ----------
    driver : selenium webdriver
        Browser session to log in with.
    mail : str
        E-mail address of the account.
    password : str
        Password of the account.
    """
    # Open the login page
    driver.get("https://accounts.google.com/ServiceLogin")
    time.sleep(np.random.uniform(1, 1.5))

    # Fill in mail
    driver.find_element("id", "identifierId").send_keys(mail)
    driver.find_element("id", "identifierNext").click()

    time.sleep(np.random.uniform(1, 1.5))

    # Fill in password
    driver.find_element("xpath", "//input[@name = 'password']").send_keys(password)
    driver.find_element("xpath", '//button[contains(@class, "VfPpkd-LgbsSe")]').click()

    time.sleep(0.5)

    # If a security-check is shown, try to skip it. This stays best-effort:
    # a missing or unclickable prompt must not abort the login.
    prompts = driver.find_elements("xpath", "//div[contains(@class, 'U26fgb O0WRkf')]")
    if prompts:
        try:
            prompts[0].click()
        except Exception:
            pass

In [8]:
def in_seconds(timestamp):
    """Turn a colon-separated YouTube timestamp (e.g. "1:02:03") into seconds."""
    total = 0

    # Walk from the most significant field to the least significant one;
    # every step to the right is worth a factor of 60 less.
    for field in timestamp.split(":"):
        total = total * 60 + int(field)

    return total

In [9]:
def prepare_video(driver):
    """Does everything necessary to start watching a video.

    Tries to skip up to two ads, switches the autoplay toggle off when it is
    on, and presses play when the player button reads "Play (k)".

    Elements are located through `find_element(s)(by, value)`, which exists in
    both old and current Selenium releases (the `find_element(s)_by_*` helpers
    were removed in Selenium 4.3).
    """
    # Skip ad(s) if they are there
    for _ in range(2):
        time.sleep(0.5)
        if driver.find_elements("xpath", "//img[@class = 'ytp-ad-image']"):
            # Give the skip button time to show up
            time.sleep(5.5)
            try:
                driver.find_element("xpath", "//div[@class = 'ytp-ad-text ytp-ad-skip-button-text']").click()
            except Exception:
                # Skipping is best-effort: the ad may not be skippable
                pass

    # Switch autoplay off
    time.sleep(1)
    autoplay = driver.find_element("xpath", "//div[@class = 'ytp-autonav-toggle-button']")

    if autoplay.get_attribute("aria-checked") == "true":
        autoplay.click()

    # Start the video if it is not playing yet
    pause = driver.find_element("xpath", "//button[contains(@class, 'ytp-play-button')]")

    if pause.get_attribute("aria-label") == "Play (k)":
        pause.click()

In [10]:
def check_video_running(driver, watch_time):
    """Tell whether the current video should keep playing.

    Returns False as soon as the player is in its ended state or the displayed
    timestamp has reached `watch_time` (in seconds), and True otherwise.

    The ended state is checked first. A video can finish before `watch_time`
    is reached (for example a very short video, or a watch time longer than
    the video); checking only the timestamp would then never return False.
    """
    # Check if the video is done (can happen when ads play and the video is very short)
    if driver.find_elements("xpath", "//div[contains(@class, 'ended-mode')]"):
        return False

    timestamp = driver.find_element("class name", "ytp-time-current").text

    # Sometimes the timestamp cannot be displayed
    if timestamp:
        return in_seconds(timestamp) < watch_time

    # No timestamp and not ended: assume the video is still running
    return True

In [17]:
def choose_recommendation(recommendations, usertype = 3):
    """Selects a video to watch based on a top 10 of given recommendations.

    Higher-ranked recommendations are more likely to be picked. When fewer
    than ten recommendations are given, the weights of the available ranks
    are rescaled so that they sum to one. Candidates without a length
    (livestreams) or that pafy cannot load are dropped and another one is
    drawn.

    Parameters
    ----------
    recommendations : sequence of str
        Video URLs ordered by rank; only the first ten are considered.
    usertype : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        `(watch_time, video_id)`: the number of seconds to watch and the
        chosen URL with the "https://www.youtube.com/watch?v=" prefix removed.

    Raises
    ------
    ValueError
        If none of the given recommendations is playable.
    """
    # These odds are not based on anything yet.
    # TODO: Find a paper that has done propensity weighting on YouTube (or a similar site)
    weights = [0.4, 0.2, 0.1, 0.1, 0.05, 0.05, 0.05, 0.025, 0.0125, 0.0125]

    # Work on copies so the caller's list is never modified
    candidates = list(recommendations)[:len(weights)]
    weights = weights[:len(candidates)]

    while candidates:
        total = sum(weights)
        position = np.random.choice(len(candidates), p = [w / total for w in weights])
        choice = candidates[position]

        try:
            vid_len = pafy.new(choice).length
        except Exception:
            # Unavailable video: treat it like a livestream and draw again
            vid_len = None

        if vid_len:
            break

        # Make sure no livestreams are clicked; dropping the candidate also
        # guarantees that the loop ends
        del candidates[position]
        del weights[position]
    else:
        raise ValueError("No playable video among the given recommendations")

    # Fraction of the video that will be watched, limited to 0% - 100%
    percentage = min(max(np.random.normal(0.6, 0.15), 0), 1)
    watch_time = percentage * vid_len

    return watch_time, choice.replace("https://www.youtube.com/watch?v=", "")

In [18]:
def get_video_info(driver):
    """Gets basic information about the video on the current watch page.

    Returns
    -------
    tuple
        `(views, likes, dislikes, date)`:

        * views: view count as an int, or None when the counter text does not
          start with a plain number (e.g. "No views").
        * likes / dislikes: text of the first / second rating label, or None
          when the page does not show that label.
        * date: text of the date label.
    """
    views_text = driver.find_element("xpath", "//span[contains(@class, 'view-count')]").text

    # The substring 'likes' matches the "likes" label as well as the
    # "dislikes" label, so up to two elements come back, in page order.
    ratings = driver.find_elements("xpath", "//yt-formatted-string[contains(@aria-label, 'likes')]")
    date = driver.find_element("xpath", "//div[@id = 'date']/yt-formatted-string").text

    # "772,839 views" -> 772839; also handles the singular "1 view"
    tokens = views_text.split()
    number = tokens[0].replace(",", "").replace(".", "") if tokens else ""
    views = int(number) if number.isdecimal() else None

    likes = ratings[0].text if len(ratings) > 0 else None
    dislikes = ratings[1].text if len(ratings) > 1 else None

    return views, likes, dislikes, date

# Het experiment

In [19]:
def run_experiment(videos, mails, usertype=1, max_users=1, n_videos=2, n_recs=15):
    """Let bot accounts watch videos and record what YouTube recommends afterwards.

    For every processed account a fresh browser is started and logged in. The
    bot then watches `n_videos` videos; after each one the YouTube homepage is
    opened and its top recommendations are stored. The browser is always
    closed again, also when something fails halfway.

    Parameters
    ----------
    videos : pandas.DataFrame
        Dataset to draw seed videos from (passed on to `vid_to_watch`).
    mails : pandas.DataFrame
        One row per account; the first column is read as the e-mail address
        and the second as the password.
    usertype : int
        3 picks the next video from the watch-page recommendations, 4 picks it
        from the homepage recommendations. Any other value draws every video
        from the dataset.
    max_users : int or None
        Number of accounts to process (default 1). Pass None to process all.
    n_videos : int
        Number of videos each account watches (default 2).
    n_recs : int
        Maximum number of homepage recommendations stored per watched video
        (default 15); fewer are stored when the homepage shows fewer.

    Returns
    -------
    tuple of pandas.DataFrame
        * homepage recommendations, columns `user`, `vids_watched`, `video`,
          `channel`;
        * watched videos, columns `user`, `video_number`, `views`, `likes`,
          `dislikes`, `date`.
    """
    experiment_results = {"user":[], "vids_watched":[], "video":[], "channel":[]}
    videos_per_user = {"user":[], "video_number":[], "views":[],
                       "likes":[], "dislikes":[], "date":[]}

    for user_number, user in enumerate(mails.itertuples()):
        if max_users is not None and user_number >= max_users:
            break

        # Position 0 of the tuple is the index, so the columns start at 1
        mail, password = user[1], user[2]

        driver = init_driver()

        try:
            login_google(driver, mail, password)

            rec_to_watch = None

            # Start watching videos
            for i in range(1, n_videos + 1):
                if not rec_to_watch:
                    # Get a random video from the dataset
                    watch_time, to_watch = vid_to_watch(videos, usertype)
                else:
                    # watch_time was set together with rec_to_watch
                    to_watch = rec_to_watch

                driver.get(f"http://youtube.com/watch?v={to_watch}")

                # Skip ads, disable autoplay
                prepare_video(driver)

                # Save information of current video being watched
                views, likes, dislikes, date = get_video_info(driver)

                videos_per_user["user"].append(mail)
                videos_per_user["video_number"].append(i)
                videos_per_user["views"].append(views)
                videos_per_user["likes"].append(likes)
                videos_per_user["dislikes"].append(dislikes)
                videos_per_user["date"].append(date)

                # Watch video
                while check_video_running(driver, watch_time):
                    time.sleep(1)

                # If we have a usertype that relies on direct recommendations
                if usertype == 3:
                    video_recs = driver.find_elements("xpath", "//a[contains(@class, 'ytd-compact-video-renderer')]")
                    # Anchors without an href cannot be opened, so drop them
                    rec_links = [href for href in (rec.get_attribute("href") for rec in video_recs) if href]
                    watch_time, rec_to_watch = choose_recommendation(rec_links[:10], usertype)

                # Go to the youtube homepage
                driver.get("http://youtube.com")
                time.sleep(1)

                # Get videos on youtube home
                channels = driver.find_elements("xpath", "//a[@id = 'avatar-link']")
                vids = driver.find_elements("xpath", "//a[@id = 'video-title-link']")

                # If we have a user that relies on homepage recommendations
                if usertype == 4:
                    home_links = [href for href in (rec.get_attribute("href") for rec in vids) if href]
                    watch_time, rec_to_watch = choose_recommendation(home_links[:10], usertype)

                # Store the top homepage recommendations (at most n_recs)
                for vid, channel in list(zip(vids, channels))[:n_recs]:
                    experiment_results["user"].append(mail)
                    experiment_results["vids_watched"].append(i)

                    experiment_results["video"].append(vid.get_attribute("href"))
                    experiment_results["channel"].append(channel.get_attribute("href"))
        finally:
            # Always close the browser so no Chrome process is left behind
            driver.quit()

    return pd.DataFrame(experiment_results), pd.DataFrame(videos_per_user)

# Run the experiment with user type 3: after a video has been watched, the
# next one is chosen from the recommendations on its watch page.
# df1 holds the recorded homepage recommendations, df2 the watched videos.
df1, df2 = run_experiment(videos, mails, 3)

In [20]:
# Homepage recommendations: one row per recommended video, with the number of
# videos the user had watched when the homepage was opened.
df1

Unnamed: 0,user,vids_watched,video,channel
0,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=bohZt32_5wk,https://www.youtube.com/channel/UCI8gcSTo1Fows...
1,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=9Pndh0ORaeQ,https://www.youtube.com/c/PBSNewsHour
2,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=QfsDQ2QH0ZY,https://www.youtube.com/channel/UCyIx2SS_gRy7l...
3,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=BoA8xkiuts0,https://www.youtube.com/c/Fearlessmotivation
4,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=aldW3Gd-c04,https://www.youtube.com/user/lisab1230
5,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=IBE7GX2fcdc,https://www.youtube.com/c/TheChosenSeries
6,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=uVFJXb_cooc,https://www.youtube.com/c/InnerMotivation
7,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=9bLgDoFkdqo,https://www.youtube.com/user/InTouchMinistries
8,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=7Smrw-BYe1I,https://www.youtube.com/c/CosmoSapiens
9,scriptiebot@gmail.com,1,https://www.youtube.com/watch?v=p2I0Al8rhmk,https://www.youtube.com/c/Terrian


In [21]:
# Watched videos: one row per video the bot opened, with the views, likes,
# dislikes and date read from its watch page.
df2

Unnamed: 0,user,video_number,views,likes,dislikes,date
0,scriptiebot@gmail.com,1,6895,693,19,"Feb 1, 2021"
1,scriptiebot@gmail.com,2,772839,24K,576,"Mar 6, 2021"
